<a href="https://colab.research.google.com/github/ashu5644/NLP_workspace/blob/main/notebooks/spacy_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy

Lingustic Annotations

In [3]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
  print(f'TOKEN_TEXT: {token.text}, TOKEN_POS:{token.pos_}, TOKEN_DEP: {token.dep_}')

TOKEN_TEXT: Apple, TOKEN_POS:PROPN, TOKEN_DEP: nsubj
TOKEN_TEXT: is, TOKEN_POS:AUX, TOKEN_DEP: aux
TOKEN_TEXT: looking, TOKEN_POS:VERB, TOKEN_DEP: ROOT
TOKEN_TEXT: at, TOKEN_POS:ADP, TOKEN_DEP: prep
TOKEN_TEXT: buying, TOKEN_POS:VERB, TOKEN_DEP: pcomp
TOKEN_TEXT: U.K., TOKEN_POS:PROPN, TOKEN_DEP: dobj
TOKEN_TEXT: startup, TOKEN_POS:NOUN, TOKEN_DEP: dep
TOKEN_TEXT: for, TOKEN_POS:ADP, TOKEN_DEP: prep
TOKEN_TEXT: $, TOKEN_POS:SYM, TOKEN_DEP: quantmod
TOKEN_TEXT: 1, TOKEN_POS:NUM, TOKEN_DEP: compound
TOKEN_TEXT: billion, TOKEN_POS:NUM, TOKEN_DEP: pobj


Tokenization

In [4]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
  print(f'token: {token}')

token: Apple
token: is
token: looking
token: at
token: buying
token: U.K.
token: startup
token: for
token: $
token: 1
token: billion


POS and Dependencies

In [7]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
import pandas as pd
df = pd.DataFrame()
for idx, token in enumerate(doc):
  df.loc[idx, 'text'] = token.text
  df.loc[idx, 'lemma'] = token.lemma_
  df.loc[idx, 'pos'] = token.pos_
  df.loc[idx, 'tag'] = token.tag_
  df.loc[idx, 'dep'] = token.dep_
  df.loc[idx, 'shape'] = token.shape_
  df.loc[idx, 'is_alpha'] = token.is_alpha
  df.loc[idx, 'is_stop'] = token.is_stop
df

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,dep,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


Visulization

In [8]:
from spacy import displacy
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Named Entity

In [11]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
df = pd.DataFrame()
for idx, ent in enumerate(doc.ents):
  df.loc[idx, 'text'] = ent.text
  df.loc[idx,'start_idx'] = ent.start_char
  df.loc[idx, 'end_idx'] = ent.end_char
  df.loc[idx, 'entity'] = ent.label_
df

Unnamed: 0,text,start_idx,end_idx,entity
0,Apple,0.0,5.0,ORG
1,U.K.,27.0,31.0,GPE
2,$1 billion,44.0,54.0,MONEY


Word Vectors

In [16]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov, token.vector)
  break

Apple True 8.6495 True [-1.23110223e+00 -1.19172668e+00  1.58404320e-01  3.59881461e-01
  6.80532396e-01  1.70100391e-01  1.32927656e+00  8.85804072e-02
 -7.72866249e-01 -6.79355115e-02  1.04016542e+00  7.55584240e-03
 -1.67930603e+00 -4.59604681e-01 -1.09400225e+00 -5.35519540e-01
  1.15980649e+00  2.33544409e-03 -7.30589986e-01 -5.64724982e-01
 -7.49180377e-01  2.19044581e-01  1.00360096e-01  9.37369764e-02
 -1.13666034e+00  3.31228107e-01  2.32216024e+00  1.18360245e+00
 -2.16722190e-01  1.58354163e+00  2.49963343e-01 -1.07244039e+00
  1.42693490e-01  6.90324783e-01 -4.03066576e-01  5.47101021e-01
 -4.54420447e-01 -3.23455632e-01 -1.63471639e-01  2.91558909e+00
 -1.32943809e+00  1.20826773e-01  3.99443865e-01  8.78926873e-01
 -1.63153493e+00  1.14091444e+00 -4.30475116e-01  1.74879467e+00
  6.38306856e-01 -8.98894131e-01 -6.78731680e-01  7.68426299e-01
 -6.43653631e-01 -1.36196291e+00 -7.04809189e-01 -5.67580342e-01
  3.89744520e-01 -4.15740728e-01  2.76996970e-01 -2.90538669e-01
 -

Similarity

In [17]:
doc1 = nlp("I am going to jaipur")
doc2 = nlp("You are coming from patna")
doc3 = nlp("mango is called king of fruits")
doc1.similarity(doc2), doc2.similarity(doc3), doc3.similarity(doc1)

  doc1.similarity(doc2), doc2.similarity(doc3), doc3.similarity(doc1)


(0.6593600391250509, 0.44829684804152065, 0.3138693254694571)