In [67]:
# %pip install nltk spacy

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once; `nlp` is reused by the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Parse one headline, echo it, then list every token together with its
# part-of-speech tag and dependency label.
doc = nlp("Elon Musk can pocket another $32 billion of Tesla shares")
print(doc, '\n')

for tok in doc:
    print(tok.text, tok.pos_, tok.dep_, sep=" :: ")

Elon Musk can pocket another $32 billion of Tesla shares 

Elon :: PROPN :: compound
Musk :: PROPN :: nsubj
can :: AUX :: aux
pocket :: VERB :: ROOT
another :: DET :: det
$ :: SYM :: quantmod
32 :: NUM :: compound
billion :: NUM :: dobj
of :: ADP :: prep
Tesla :: PROPN :: compound
shares :: NOUN :: pobj


In [7]:
# Inspect the loaded pipeline: displays (name, component object) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x29e6618b348>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x29e6618b768>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x29e65e6eeb8>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x29e661cd048>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x29e6615b0c8>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x29e661541c8>)]

In [8]:
# Same pipeline, component names only.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [None]:
# Print each particular sentence

In [13]:
doc = nlp(u'In its discrete form, a hidden Markov process can be visualized as a generalization of the urn problem with replacamarement (where each item from the urn is returned to the original urn before the next step).[6] Consider this example: in a room that is not visible to an observer there is a genie. The room contains urns X1, X2, X3, ... each of which contains a known mix of balls, each ball labeled y1, y2, y3, ... . The genie chooses an urn in that room and randomly draws a ball from that urn. It then puts the ball onto a conveyor belt, where the observer can observe the sequence of the balls but not the sequence of urns from which they were drawn. The genie    has some procedure to choose urns; the choice of the urn for Amar the n-th ball depends only upon a random number and the Amarchoice of the urn for the (n − 1)-th ball. Contact 923-098-1233, 2230912301')

# Walk the sentence spans detected by the pipeline, one per paragraph
# (each sentence is followed by a space and a blank line).
for sent in doc.sents:
    print(f"{sent} \n")

In its discrete form, a hidden Markov process can be visualized as a generalization of the urn problem with replacamarement (where each item from the urn is returned to the original urn before the next step).[6] Consider this example: in a room that is not visible to an observer there is a genie. 

The room contains urns X1, X2, X3, ... each of which contains a known mix of balls, each ball labeled y1, y2, y3, ... . 

The genie chooses an urn in that room and randomly draws a ball from that urn. 

It then puts the ball onto a conveyor belt, where the observer can observe the sequence of the balls but not the sequence of urns from which they were drawn. 

The genie    has some procedure to choose urns; the choice of the urn for Amar the n-th ball depends only upon a random number and the Amarchoice of the urn for the (n − 1)-th ball. 

Contact 923-098-1233, 2230912301 



# Tokenization

Tokens are the basic building blocks of a Doc object - everything that helps us understand the meaning of the text is derived from tokens and their relationship to one another.

- __Prefix__: Characters at the beginning _Eg. $ ( "_
- __Suffix__: Characters at the end _Eg. km ) , . !"_
- __Infix__: Characters in between _Eg. - -- / ..._
- __Exception__: Special-case rule to split a string into several tokens, or to prevent a token from being split, when punctuation rules are applied. _Eg. let's U.S._

In [33]:
# Two sample texts for the tokenization and entity cells: a short quoted
# sentence, and a longer passage with mixed quote styles and a money amount.
mystr1 = " 'We\'re goint to be great in N.Y '"
mystr2 = "Here's an example - If we took away the 'when', we would have two equal clauses; ‘My Dad laughed’ and ‘I told a joke.’. It would be easier to make $5 million in Australia."

# Run both texts through the pipeline (doc1 first, then doc2).
doc1, doc2 = nlp(mystr1), nlp(mystr2)

In [34]:
# One token of doc1 per line (whitespace and quote marks show up as tokens too).
for tok in doc1:
    print(tok.text)

 
'
We
're
goint
to
be
greate
in
N.Y
'


In [35]:
# Number of tokens in doc1 next to the size of the vocabulary it points to.
token_count = len(doc1)
vocab_size = len(doc1.vocab)
token_count, vocab_size

(11, 873)

In [36]:
# One token of doc2 per line.
for tok in doc2:
    print(tok.text)

Here
's
an
example
-
If
we
took
away
the
'
when
'
,
we
would
have
two
equal
clauses
;
‘
My
Dad
laughed
’
and
‘
I
told
a
joke
.
’
.
It
would
be
easier
to
make
$
5
million
in
Australia
.


In [37]:
# Print the named entities found in doc1, each with its label.

for ent in doc1.ents:
    print(ent, ent.label_, sep=' - ')

N.Y - GPE


In [40]:
# Print the named entities found in doc2 with their label and spaCy's
# plain-English explanation of that label, separated by blank lines.

for ent in doc2.ents:
    label = ent.label_
    description = spacy.explain(label)
    print(f"{ent} - {label} ({description})")
    print('\n')

two - CARDINAL (Numerals that do not fall under another type)


$5 million - MONEY (Monetary values, including unit)


Australia - GPE (Countries, cities, states)




In [None]:
# Finding noun chunks

In [45]:
# Extract the noun chunks of a sample sentence and show each chunk with its
# label and the explanation spaCy gives for that label.
doc3 = nlp('Autonomous cars shift insurance liability toward manufacturers.')

for np_span in doc3.noun_chunks:
    tag = np_span.label_
    print(np_span, tag, f"({spacy.explain(tag)})")

Autonomous cars NP (noun phrase)
insurance liability NP (noun phrase)
manufacturers NP (noun phrase)


## Visualize Tokenization

In [46]:
from spacy import displacy

In [50]:
# Sample sentence parsed once; the entity visualizer in the next cell renders it.
doc5 = nlp(u"Apple is going to build a U.K. factory for $23 million.")

In [52]:
# Entities: highlight the named-entity spans of doc5 inline in the notebook.
# The 'distance' option belongs to the dependency ('dep') visualizer and is
# not a setting of the 'ent' style, so it is not passed here.
displacy.render(doc5, style='ent', jupyter=True)

In [60]:
# Dependencies: render the dependency parse of doc2 inline in the notebook,
# with a custom 'distance' layout option.
displacy.render(doc2, style='dep', jupyter=True, options={'distance':110})

In [63]:
# Alternatively, serve the dependency visualization from a local web server
# (it reports the address it is serving on) and open it in a browser for a larger view.
# NOTE(review): this call appears to hold the cell until the server is shut down — confirm.
displacy.serve(doc2, style='dep')


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# Stemming 

#### Using PorterStemmer

In [68]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused by the stemming cells below.
p_stemmer = PorterStemmer()

In [71]:
# Stem every whitespace-separated word of a sample sentence with the Porter stemmer.
# The raw text gets its own name instead of rebinding `doc5`: `doc5` is the spaCy
# Doc used by the displaCy entity cell, and overwriting it with a plain string
# would break that cell if it were re-run after this one.
# Note: str.split() keeps punctuation attached to words (e.g. "years."), and such
# tokens come back from the stemmer unchanged.
stem_text = "She's visited her friends in Milan throughout the years."
stem_words = stem_text.split()
for word in stem_words:
    print(word," -> ",p_stemmer.stem(word))

She's  ->  she'
visited  ->  visit
her  ->  her
friends  ->  friend
in  ->  in
Milan  ->  milan
throughout  ->  throughout
the  ->  the
years.  ->  years.


In [79]:
# A second example: a word list that is also reused by the Snowball stemmer
# and lemmatization cells further down.
words = [
    'run', 'ran', 'runner', 'runs',
    'fairness', 'generous', 'generation', 'generously', 'generate',
    'publicly', 'easily', 'fairly',
]
for w in words:
    print(f"{w}  ->  {p_stemmer.stem(w)}")

run  ->  run
ran  ->  ran
runner  ->  runner
runs  ->  run
fairness  ->  fair
generous  ->  gener
generation  ->  gener
generously  ->  gener
generate  ->  gener
publicly  ->  publicli
easily  ->  easili
fairly  ->  fairli


#### Using SnowballStemmer

In [80]:
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer configured for English, used in the next cell on the same word list.
s_stemmer = SnowballStemmer(language='english')

In [81]:
# Same word list as the Porter example, stemmed with the Snowball stemmer for comparison.
for w in words:
    print(f"{w}  ->  {s_stemmer.stem(w)}")

run  ->  run
ran  ->  ran
runner  ->  runner
runs  ->  run
fairness  ->  fair
generous  ->  generous
generation  ->  generat
generously  ->  generous
generate  ->  generat
publicly  ->  public
easily  ->  easili
fairly  ->  fair


## Lemmatization 

In [93]:
# Lemmatize the same word list with spaCy: token text and POS tag are each
# padded to 12 columns, followed by the lemma.
doc6 = nlp(' '.join(words))

for tok in doc6:
    print(f'{tok.text:12}{tok.pos_:12} {tok.lemma_}')

run         VERB         run
ran         VERB         run
runner      NOUN         runner
runs        NOUN         run
fairness    NOUN         fairness
generous    ADJ          generous
generation  NOUN         generation
generously  ADV          generously
generate    VERB         generate
publicly    ADV          publicly
easily      ADV          easily
fairly      ADV          fairly


## Stop words

In [97]:
# Show how many default stop words the pipeline's language defines,
# followed by the stop-word set itself.

default_stops = nlp.Defaults.stop_words
print(len(default_stops), default_stops)

326 {'all', 'top', 'before', 'herself', 'whose', 'never', 'per', 'would', '’s', 'down', 'former', 'becoming', 'due', 'among', 'herein', 'sometimes', 'into', '’ll', 'off', 'this', 'none', 'put', 'move', "'d", 'show', 'himself', 'somewhere', 'keep', 'alone', 'hence', 'between', 'make', 'few', 'next', '’ve', 'nobody', 'one', 'via', 'whereupon', 'also', 'quite', 'perhaps', 'for', 'so', 'ca', 'did', 'on', 'nevertheless', 'eight', 'therein', 'mostly', 'is', 'part', 'will', '’m', 'not', 'hundred', 'whom', 'might', 'in', '‘m', 'must', 'whence', 'meanwhile', 'less', 'most', 'here', 'various', 'along', 'just', 'until', 'out', 'became', 'they', 'anywhere', 'any', 'these', 'many', 'because', 'therefore', 'else', '‘s', '‘re', 'above', 'cannot', 'themselves', 'last', 'often', 'itself', 'or', 'can', 'other', 'may', 'formerly', 'that', 'unless', 'could', 'a', 'ours', 'them', 'well', 'even', 'say', 'fifty', 'third', 'besides', 'are', 'then', 'n‘t', 'nothing', 'through', 'name', 'everyone', 'four', 'acr

In [110]:
# Check whether each of two words is currently flagged as a stop word.
tuple(nlp.vocab[candidate].is_stop for candidate in ('soon', 'well'))

(False, True)

In [107]:
# Register a custom stop word: add it to the defaults set, flag its
# vocabulary entry, then read the flag back to confirm.
custom_stop = 'soon'
nlp.Defaults.stop_words.add(custom_stop)
lexeme = nlp.vocab[custom_stop]
lexeme.is_stop = True
nlp.vocab[custom_stop].is_stop

True

In [109]:
# Remove a stop word: drop it from the defaults set AND clear the flag on the
# same word's vocabulary entry, then read the flag back to confirm.
# (Previously the set entry for 'well' was removed while the flag was cleared
# on 'soon', so 'well' stayed flagged as a stop word.)
removed_stop = 'well'
# discard() instead of remove(): re-running this cell must not raise KeyError.
nlp.Defaults.stop_words.discard(removed_stop)
nlp.vocab[removed_stop].is_stop = False
nlp.vocab[removed_stop].is_stop

False