In [None]:
# pipeline = a sequence of processing components
# text => tokenizer => pipeline components => Doc

In [None]:
import spacy

In [None]:
# Blank English pipeline: it tokenizes the text but has no other components.
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day")

# Unpack the Doc so that each token is printed on its own line.
print(*doc, sep="\n")

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day


In [None]:
# names of the components in the pipeline (empty for the blank pipeline created above)
nlp.pipe_names

[]

In [None]:
# For each language we can download a trained pipeline and use it;
# here we fetch the English pipeline en_core_web_sm.
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
nlp = spacy.load("en_core_web_sm") # earlier we used spacy.blank; here spacy.load gives us the trained pipeline

In [None]:
# component names of the trained pipeline loaded above
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# the pipeline as (name, component object) pairs
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f2f9a973e80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f2f9a973fa0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f2f9a9bbd80>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f2f9aa78ac0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f2f9a7c2200>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f2f9a9bbe60>)]

In [None]:
# pos_   -> part of speech: the grammatical role of each token (e.g. VERB, NOUN, PRON)
# lemma_ -> base form of each token (e.g. "eat" is the base form of "eating")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day")

for tok in doc:
  print(tok, " | ", tok.pos_, " | ", tok.lemma_)

Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day


In [None]:
# doc.ents -> the named entities recognised in the text
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for entity in doc.ents:
  label = entity.label_
  # spacy.explain turns the short label into a human-readable description
  print(entity.text, " | ", label, " | ", spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [None]:
from spacy import displacy 

# Highlight the named entities of `doc` inline in the notebook.
# jupyter=True forces notebook rendering: when spaCy does not auto-detect the
# notebook environment (as happened here, e.g. on Colab), render() only returns
# the raw HTML markup string, which then shows up as the cell output.
displacy.render(doc, style="ent", jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

In [None]:
# A pipeline can also be assembled by hand: start from a blank English
# pipeline and copy only the "ner" component from a trained pipeline.
nlp = spacy.blank("en")
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)

# show which components the hand-built pipeline now contains
nlp.pipe_names

['ner']

# ***Stemming and Lemmatization***

In [None]:
# stemming -> just cuts off suffixes using fixed rules to get a base word
# example -> talking => talk ; eating => eat
# simple rules, no knowledge of the language

# lemmatization -> uses knowledge of the language to derive the base word (lemma)


# spacy -> no stemming , only lemmatization
# nltk -> both

In [None]:
import nltk

In [None]:
from nltk.stem import PorterStemmer 
# Porter stemmer instance used by the stemming cells in this notebook
stemmer = PorterStemmer()

In [None]:
# Show each word next to the stem produced by the Porter stemmer.
words = ["eating", "eats", "eat", "ate", "adjustable", "rating", "ability", "meeting"]
for w in words:
  stem = stemmer.stem(w)
  print(w, " | ", stem)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rating  |  rate
ability  |  abil
meeting  |  meet


In [None]:
# stemming only applies fixed suffix rules: it misses irregular forms (ate -> ate) and can produce non-words (ability -> abil)

In [None]:
# Lemmatization of the same words with spaCy.
# Reload the trained pipeline: `nlp` was replaced by a blank pipeline earlier.
nlp = spacy.load("en_core_web_sm")
doc = nlp("eating eats eat ate adjustable rating ability meeting")

# lemma_ is the lemma as text; lemma (no underscore) is its integer hash ID.
for tok in doc:
  print(tok, " | ", tok.lemma_)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rating  |  rating
ability  |  ability
meeting  |  meeting


In [None]:
# lemmatization gives better, smarter results than stemming, but it takes more time

In [None]:
# The attribute_ruler component lets us customise the pipeline:
# tokens matching a pattern get the attributes we specify.
ar = nlp.get_pipe('attribute_ruler')

# Map the slang tokens "Bro" and "Brah" to the lemma "Brother".
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
ar.add(slang_patterns, {"LEMMA": "Brother"})

doc = nlp("Bro , u wanna go ? Brah , dont say no , i am done")
for tok in doc:
  print(tok.text, " | ", tok.lemma_)


Bro  |  Brother
,  |  ,
u  |  u
wanna  |  wanna
go  |  go
?  |  ?
Brah  |  Brother
,  |  ,
do  |  do
nt  |  not
say  |  say
no  |  no
,  |  ,
i  |  I
am  |  be
done  |  do


Smol exercise

In [None]:
# Exercise: stem every word in the list with the Porter stemmer.
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']
output = [stemmer.stem(word) for word in lst_words]
print(output)

['run', 'paint', 'walk', 'dress', 'like', 'children', 'whom', 'good', 'ate', 'fish']


In [None]:
# Exercise: collect the lemma of every token in the text.
# NOTE(review): the stemming exercise list uses 'whom' while this text has
# "who" - confirm which word was intended so the two results are comparable.
doc = nlp("running painting walking dressing likely children who good ate fishing")
output = [token.lemma_ for token in doc]
print(output)

['run', 'paint', 'walk', 'dress', 'likely', 'child', 'who', 'good', 'eat', 'fishing']
