# spaCy

In [1]:
import spacy
import pandas as pd

In [2]:
# Pipelines used throughout the notebook: spaCy's medium English model and
# GiNZA for Japanese. spaCy's own 'ja_core_news_md' was the alternative tried
# for Japanese and is currently not used.
EN_MODEL, JA_MODEL = 'en_core_web_md', 'ja_ginza'
nlp = spacy.load(EN_MODEL)
nlp_ja = spacy.load(JA_MODEL)

In [3]:
# Parse the English and Japanese corpora into spaCy Doc objects.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode or reject the Japanese text
# on machines whose default is not UTF-8.
# NOTE(review): assumes both files are stored as UTF-8 — confirm.
with open('../data/the_beatles.txt', 'r', encoding='utf-8') as f:
    doc = nlp(f.read())

with open('../data/the_beatles_ja.txt', 'r', encoding='utf-8') as f:
    doc_ja = nlp_ja(f.read())

In [4]:
# Materialize the sentence iterator and show sentences 3 through 6.
list(doc.sents)[2:6]

[The story of the Beatles begins in the late 1950s, when John Lennon and Paul McCartney first crossed paths.,
 Both budding musicians, they formed a skiffle group called The Quarrymen, laying the foundation for what would eventually become the Beatles.,
 The addition of George Harrison and Ringo Starr completed the lineup, setting the stage for a musical journey that would redefine the very essence of popular music.
 ,
 In 1962, the Beatles signed with EMI's Parlophone label, with George Martin as their producer.]

In [5]:
# Show the first ten entities found in the English document.
doc.ents[:10]

(Beatles,
 Beatles,
 the 1960s,
 John Lennon,
 Paul McCartney,
 George Harrison,
 Ringo Starr,
 Beatles,
 Liverpool,
 England)

In [6]:
# Token attributes to tabulate. Each name is both the DataFrame column header
# and the attribute read from every token, so the two can never drift apart.
columns = ['text', 'lower_', 'lemma_', 'pos_', 'tag_', 'is_alpha', 'is_ascii', 'is_digit', 'is_stop', 'like_num', 'dep_']

# Number of leading tokens of each document to include in the tables.
N_PREVIEW_TOKENS = 30


def _token_rows(tokens, attrs):
    """Return one row per token holding the requested attribute values.

    Args:
        tokens: Iterable of token objects (here, a slice of a parsed document).
        attrs: Attribute names to read from each token, in column order.

    Returns:
        A list of lists; row i contains ``getattr(token_i, name)`` for every
        name in ``attrs``.
    """
    return [[getattr(token, name) for name in attrs] for token in tokens]


# The same extraction is applied to both languages instead of two copy-pasted loops.
output = _token_rows(doc[:N_PREVIEW_TOKENS], columns)
output_ja = _token_rows(doc_ja[:N_PREVIEW_TOKENS], columns)

df = pd.DataFrame(output, columns=columns)
df_ja = pd.DataFrame(output_ja, columns=columns)

In [7]:
# Display the token-attribute table for the English document.
df

Unnamed: 0,text,lower_,lemma_,pos_,tag_,is_alpha,is_ascii,is_digit,is_stop,like_num,dep_
0,The,the,the,DET,DT,True,True,False,True,False,det
1,Beatles,beatles,Beatles,PROPN,NNPS,True,True,False,False,False,nsubj
2,:,:,:,PUNCT,:,False,True,False,False,False,punct
3,An,an,an,DET,DT,True,True,False,True,False,det
4,Enduring,enduring,Enduring,PROPN,NNP,True,True,False,False,False,compound
5,Legacy,legacy,Legacy,PROPN,NNP,True,True,False,False,False,appos
6,in,in,in,ADP,IN,True,True,False,True,False,prep
7,the,the,the,DET,DT,True,True,False,True,False,det
8,Tapestry,tapestry,Tapestry,PROPN,NNP,True,True,False,False,False,pobj
9,of,of,of,ADP,IN,True,True,False,True,False,prep


In [8]:
# Display the token-attribute table for the Japanese document.
df_ja

Unnamed: 0,text,lower_,lemma_,pos_,tag_,is_alpha,is_ascii,is_digit,is_stop,like_num,dep_
0,ビートルズ,ビートルズ,ビートルズ,NOUN,名詞-固有名詞-人名-一般,True,False,False,False,False,nsubj
1,（,（,(,PUNCT,補助記号-括弧開,False,False,False,False,False,punct
2,The Beatles,the beatles,The Beatles,NOUN,名詞-固有名詞-人名-一般,False,True,False,False,False,compound
3,）,）,),PUNCT,補助記号-括弧閉,False,False,False,False,False,punct
4,は,は,は,ADP,助詞-係助詞,True,False,False,True,False,case
5,、,、,、,PUNCT,補助記号-読点,False,False,False,False,False,punct
6,20,20,20,NUM,名詞-数詞,False,True,True,False,True,nummod
7,世紀,世紀,世紀,NOUN,名詞-普通名詞-一般,True,False,False,False,False,nmod
8,の,の,の,ADP,助詞-格助詞,True,False,False,True,False,case
9,最も,最も,最も,ADV,副詞,True,False,False,False,False,advmod


In [9]:
# Pair each of the first 30 entities with its label, once per language.
# Note: this rebinds `output` / `output_ja`, replacing the token rows built earlier.
output = [[span.text, span.label_] for span in doc.ents[:30]]
output_ja = [[span.text, span.label_] for span in doc_ja.ents[:30]]

In [10]:
# Display the [text, label] pairs collected from the English document.
output

[['Beatles', 'PERSON'],
 ['Beatles', 'PERSON'],
 ['the 1960s', 'DATE'],
 ['John Lennon', 'PERSON'],
 ['Paul McCartney', 'PERSON'],
 ['George Harrison', 'PERSON'],
 ['Ringo Starr', 'PERSON'],
 ['Beatles', 'PERSON'],
 ['Liverpool', 'GPE'],
 ['England', 'GPE'],
 ['Beatles', 'PERSON'],
 ['the late 1950s', 'DATE'],
 ['John Lennon', 'PERSON'],
 ['Paul McCartney', 'PERSON'],
 ['first', 'ORDINAL'],
 ['Beatles', 'PERSON'],
 ['George Harrison', 'PERSON'],
 ['Ringo Starr', 'PERSON'],
 ['1962', 'DATE'],
 ['Beatles', 'PERSON'],
 ['EMI', 'ORG'],
 ['Parlophone', 'ORG'],
 ['George Martin', 'PERSON'],
 ['Love Me Do', 'WORK_OF_ART'],
 ['Beatlemania', 'PERSON'],
 ['Beatles', 'PERSON'],
 ['Beatles', 'PERSON'],
 ['Please Please Me', 'WORK_OF_ART'],
 ['1963', 'DATE'],
 ['Beatles', 'PERSON']]

In [11]:
# Look up the human-readable description of the 'GPE' entity label.
spacy.explain('GPE')

'Countries, cities, states'

In [12]:
# Display the [text, label] pairs collected from the Japanese document.
output_ja

[['ビートルズ', 'Show_Organization'],
 ['20世紀', 'Date'],
 ['一つ', 'N_Product'],
 ['ビートルズ', 'Person'],
 ['1960年代初頭', 'Date'],
 ['イギリス', 'Country'],
 ['リバプール', 'Pro_Sports_Organization'],
 ['ジョン・レノン', 'Person'],
 ['ポール・マッカートニー', 'Person'],
 ['ジョージ・ハリスン', 'Person'],
 ['リンゴ', 'Flora'],
 ['4人', 'N_Person'],
 ['メンバー', 'Position_Vocation'],
 ['ロックンロール', 'Position_Vocation'],
 ['1960年代', 'Date'],
 ['ビートルズ', 'Person'],
 ['レット・イット・ビー', 'Music'],
 ['ヘイ・ジュード', 'Person'],
 ['ビートルズ', 'Person'],
 ['1960年', 'Date'],
 ['ハンブルク', 'Game'],
 ['ビートルズ', 'Show_Organization'],
 ['イギリス', 'Country'],
 ['Please Please Me', 'Music'],
 ['1963年', 'Date'],
 ['ジョン・レノン', 'Person'],
 ['ポール・マッカートニー', 'Person'],
 ['ジョージ・ハリスン', 'Person'],
 ['サージェント・ペパーズ・ロンリー・', 'Music'],
 ['1967年', 'Date']]