<a href="https://colab.research.google.com/github/binliu0630/NLP/blob/master/Spacy_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import spacy

In [0]:
# download languge model
!python -m spacy download en
!python -m spacy download en_core_web_lg


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')

Collecting en_core_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz#egg=en_core_web_lg==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    100% |████████████████████████████████| 852.3MB 57.9MB/s 
[?25hInstalling collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... [?25ldone
[?25hSuccessfully installed en-core-web-lg-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



In [0]:
nlp = spacy.load('en')

In [0]:
import pandas as pd


In [0]:
text = pd.DataFrame({'text':['Hello  World!',
                  'Next week I\'ll be in Madrid.',
                  'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ.']})

In [0]:
' '.join(text.text)

"Hello  World! Next week I'll be in Madrid. I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ."

In [0]:
doc = nlp(' '.join(text.text))

In [0]:
# token

In [0]:
df = pd.DataFrame()
for i, token in enumerate(doc):
  df.loc[i, 'Text'] = token.text
  df.loc[i, 'Inx'] = token.idx
  df.loc[i, 'lemma'] = token.lemma_
  df.loc[i, 'is_punct'] = token.is_punct
  df.loc[i, 'is_space'] = token.is_space
  df.loc[i, 'shape'] = token.shape_
  df.loc[i, 'pos'] = token.pos_
  df.loc[i, 'tag'] = token.tag_

In [0]:
df

Unnamed: 0,Text,Inx,lemma,is_punct,is_space,shape,pos,tag
0,Hello,0.0,hello,False,False,Xxxxx,INTJ,UH
1,,6.0,,False,True,,SPACE,
2,World,7.0,world,False,False,Xxxxx,NOUN,NN
3,!,12.0,!,True,False,!,PUNCT,.
4,Next,14.0,next,False,False,Xxxx,ADJ,JJ
5,week,19.0,week,False,False,xxxx,NOUN,NN
6,I,24.0,-PRON-,False,False,X,PRON,PRP
7,'ll,25.0,will,False,False,'xx,VERB,MD
8,be,29.0,be,False,False,xx,VERB,VB
9,in,32.0,in,False,False,xx,ADP,IN


In [0]:
# sentence

In [0]:
for sent in doc.sents:
  print(sent)

Hello  World!
Next week I'll be in Madrid.
I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ.


In [0]:
# entity

In [0]:
for ent in doc.ents:
  print(ent.text, ent.label_)

Next week DATE
Madrid GPE
2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


In [0]:
from spacy import displacy

In [0]:

displacy.render(doc, style ='ent', jupyter = True)

In [0]:
# chunk


In [0]:
for chunk in doc.noun_chunks:
  print(chunk.text, chunk.label_, chunk.root.text)

Hello  World NP World
I NP I
Madrid NP Madrid
I NP I
2 shares NP shares
9 a.m. NP a.m.
the stock NP stock
just 2 days NP days
the WSJ NP WSJ


In [0]:
displacy.render(doc, style = 'dep', jupyter = True, options = {'distance':90})

In [0]:
nlp = spacy.load('en_core_web_lg')

In [0]:
# word vector shape
print(nlp.vocab['banana'].vector.shape)

(300,)


In [0]:
from scipy import spatial
cosine_similarity = lambda x,y: 1 - spatial.distance.cosine(x, y)

In [0]:
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['king'].vector
king = nlp.vocab['queen'].vector

In [0]:
maybe_king = man -woman + queen

In [0]:
computed_similarity = []
for word in nlp.vocab:
  if not word.has_vector:
    continue
  similarity = cosine_similarity(maybe_king, word.vector)
  computed_similarity.append((word, similarity))
  
computed_similarity = sorted(computed_similarity, key = lambda item: -item[1])

In [0]:
print([(w[0].text, w[1]) for w in computed_similarity[:10]])

[('King', 0.8575966358184814), ('KING', 0.8575966358184814), ('king', 0.8575966358184814), ('KIng', 0.8575966358184814), ('Kings', 0.6851363182067871), ('KINGS', 0.6851363182067871), ('kings', 0.6851363182067871), ('lord', 0.5916184782981873), ('Lord', 0.5916184782981873), ('LORD', 0.5916184782981873)]


In [0]:
# similarity interface on token, doc


In [0]:
b = nlp.vocab['banana']
d = nlp.vocab['dog']
d.similarity(b)

0.24327643

In [0]:
target = nlp('Cats are beautiful animals.')
doc1 = nlp('Dogs are awesome.')
doc2 = nlp('Some gorgeous creatures are felines')
doc3 = nlp('Dolphins are swimming mammals.')

In [0]:
target.similarity(doc1)

0.8901766262114666

In [0]:
target.similarity(doc2)

0.8713768488723188

In [0]:
target.similarity(doc3)

0.7822956256736615

spaCy + StanfordNLP

https://www.google.com/search?client=safari&rls=en&q=course+v3+fastai&ie=UTF-8&oe=UTF-8