<a href="https://colab.research.google.com/github/binliu0630/NLP/blob/master/Spacy_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# # download languge model
# !python -m spacy download en
# !python -m spacy download en_core_web_lg

In [0]:
import pandas as pd
import spacy
nlp = spacy.load('en')


In [0]:
# Another way to build the nlp object
from spacy.lang.en import English
nlp = English()

# Token, Span object

In [0]:
doc = nlp('Hello world!')

for token in doc:
  print(token.text)

Hello
world
!


In [0]:
token = doc[0]
print(token.text)

Hello


In [0]:
span = doc[0:2]
print(span.text)

Hello world


# Lexical Attributes

In [0]:
doc = nlp('It cost $5.')

In [0]:
print('Index: ', [token.i for token in doc])
print('Text: ', [token.text for token in doc])
print('is_alpha: ', [token.is_alpha for token in doc])
print('is_punct: ', [token.is_punct for token in doc])
print('like_num: ', [token.like_num for token in doc])


Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'cost', '$', '5', '.']
is_alpha:  [True, True, False, False, False]
is_punct:  [False, False, False, False, True]
like_num:  [False, False, False, True, False]


In [0]:
# foreign language
from spacy.lang.de import German
nlp = German()
doc = nlp("Liebe Grüße!")
print(doc[1].text)

Grüße


In [0]:
from spacy.lang.es import Spanish
nlp = Spanish()
doc = nlp('La tormenta tropical sería un huracán de categoría 1')
doc[0].text

'La'

# Statistical Models

- Enable spaCy to predict linguistic attributes in context
 
 Part-of-speech tags
 
 Syntactic dependencies
 
 Named entities
- Trained on labeled example texts
- Can be updated with more examples to fine-tune predictions

In [0]:
#!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [0]:
# POS, dep, head

doc = nlp("She ate the pizza")

for token in doc:
  # The "dep underscore" attribute returns the predicted dependency label.
  # The head attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.
  print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [0]:
# NE

doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
  print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [0]:
# the exlain method in spacy
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [2]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126237 sha256=ea2691f80614d943f9f0a3b8679135c0a9bffdcece7311ae58adc9a5f395ce90
  Stored in directory: /tmp/pip-ephem-wheel-cache-_s7hgpe9/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model vi

In [3]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]

# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)

OSError: ignored

In [4]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'ANIMAL'
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [0]:
text = pd.DataFrame({'text':['Hello  World!',
                  'Next week I\'ll be in Madrid.',
                  'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ.']})

In [0]:
' '.join(text.text)

"Hello  World! Next week I'll be in Madrid. I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ."

In [0]:
doc = nlp(' '.join(text.text))

In [0]:
# token

In [0]:
df = pd.DataFrame()
for i, token in enumerate(doc):
  df.loc[i, 'Text'] = token.text
  df.loc[i, 'Inx'] = token.idx
  df.loc[i, 'lemma'] = token.lemma_
  df.loc[i, 'is_punct'] = token.is_punct
  df.loc[i, 'is_space'] = token.is_space
  df.loc[i, 'shape'] = token.shape_
  df.loc[i, 'pos'] = token.pos_
  df.loc[i, 'tag'] = token.tag_

In [0]:
df

Unnamed: 0,Text,Inx,lemma,is_punct,is_space,shape,pos,tag
0,Hello,0.0,hello,False,False,Xxxxx,INTJ,UH
1,,6.0,,False,True,,SPACE,
2,World,7.0,world,False,False,Xxxxx,NOUN,NN
3,!,12.0,!,True,False,!,PUNCT,.
4,Next,14.0,next,False,False,Xxxx,ADJ,JJ
5,week,19.0,week,False,False,xxxx,NOUN,NN
6,I,24.0,-PRON-,False,False,X,PRON,PRP
7,'ll,25.0,will,False,False,'xx,VERB,MD
8,be,29.0,be,False,False,xx,VERB,VB
9,in,32.0,in,False,False,xx,ADP,IN


In [0]:
# sentence

In [0]:
for sent in doc.sents:
  print(sent)

Hello  World!
Next week I'll be in Madrid.
I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ.


In [0]:
# entity

In [0]:
for ent in doc.ents:
  print(ent.text, ent.label_)

Next week DATE
Madrid GPE
2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


In [0]:
from spacy import displacy

In [0]:

displacy.render(doc, style ='ent', jupyter = True)

In [0]:
# chunk


In [0]:
for chunk in doc.noun_chunks:
  print(chunk.text, chunk.label_, chunk.root.text)

Hello  World NP World
I NP I
Madrid NP Madrid
I NP I
2 shares NP shares
9 a.m. NP a.m.
the stock NP stock
just 2 days NP days
the WSJ NP WSJ


In [0]:
displacy.render(doc, style = 'dep', jupyter = True, options = {'distance':90})

In [0]:
nlp = spacy.load('en_core_web_lg')

In [0]:
# word vector shape
print(nlp.vocab['banana'].vector.shape)

(300,)


In [0]:
from scipy import spatial
cosine_similarity = lambda x,y: 1 - spatial.distance.cosine(x, y)

In [0]:
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['king'].vector
king = nlp.vocab['queen'].vector

In [0]:
maybe_king = man -woman + queen

In [0]:
computed_similarity = []
for word in nlp.vocab:
  if not word.has_vector:
    continue
  similarity = cosine_similarity(maybe_king, word.vector)
  computed_similarity.append((word, similarity))
  
computed_similarity = sorted(computed_similarity, key = lambda item: -item[1])

In [0]:
print([(w[0].text, w[1]) for w in computed_similarity[:10]])

[('King', 0.8575966358184814), ('KING', 0.8575966358184814), ('king', 0.8575966358184814), ('KIng', 0.8575966358184814), ('Kings', 0.6851363182067871), ('KINGS', 0.6851363182067871), ('kings', 0.6851363182067871), ('lord', 0.5916184782981873), ('Lord', 0.5916184782981873), ('LORD', 0.5916184782981873)]


In [0]:
# similarity interface on token, doc


In [0]:
b = nlp.vocab['banana']
d = nlp.vocab['dog']
d.similarity(b)

0.24327643

In [0]:
target = nlp('Cats are beautiful animals.')
doc1 = nlp('Dogs are awesome.')
doc2 = nlp('Some gorgeous creatures are felines')
doc3 = nlp('Dolphins are swimming mammals.')

In [0]:
target.similarity(doc1)

0.8901766262114666

In [0]:
target.similarity(doc2)

0.8713768488723188

In [0]:
target.similarity(doc3)

0.7822956256736615

spaCy + StanfordNLP

https://www.google.com/search?client=safari&rls=en&q=course+v3+fastai&ie=UTF-8&oe=UTF-8