In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

## Basics of Spacy

In [2]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
doc = nlp(u'Tesla is looking to buy U.S. startup for $6 million')

In [5]:
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
to 94 PART aux
buy 100 VERB xcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj


In [6]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x24c24763310>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x24c245b8680>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x24c245a0b20>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x24c245a0a60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x24c247f66c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x24c247efc80>)]

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [8]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [9]:
for token in doc2:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 96 PROPN nsubj
is 87 AUX aux
n't 94 PART neg
looking 100 VERB ROOT
into 85 ADP prep
startups 92 NOUN pobj
anymore 86 ADV advmod
. 97 PUNCT punct


In [10]:
doc[0].pos_

'PROPN'

In [11]:
doc[0].lemma_

'Tesla'

In [12]:
doc[0].tag_

'NNP'

In [13]:
doc[0].shape_

'Xxxxx'

In [14]:
doc[0].is_alpha

True

In [15]:
doc[0].is_stop

False

In [16]:
span = doc[2:5]

In [17]:
span

looking to buy

In [18]:
type(span)

spacy.tokens.span.Span

In [19]:
type(doc)

spacy.tokens.doc.Doc

In [10]:
doc = nlp(u'This is a first sentence. This is the second sentence. This is the third sentence')

In [11]:
for sentence in doc.sents:
    print(len(sentence))
    

6
6
5


In [19]:
# target Score = -3.67

doc1 = nlp(u"The commutator is peculiar, consisting of only three segments of a copper ring, while in the simplest of other continuous current generators several times that number exist, and frequently 120! segments are to be found. These three segments are made so as to be removable in a moment for cleaning or replacement. They are mounted upon a metal support, and are surrounded on all sides by a free air space, and cannot, therefore, lose their insulated condition. This feature of air insulation is peculiar to this system, and is very important as a factor in the durability of the commutator. Besides this, the commutator is sustained by supports carried in flanges upon the shaft, which flanges, as an additional safeguard, are coated all over with hard rubber, one of the finest known insulators. It may be stated, without fear of contradiction, that no other commutator made is so thoroughly insulated and protected. The three commutator segments virtually constitute a single copper ring, mounted in free air, and cut into three equal pieces by slots across its face.")

In [20]:
# # Target Score = 1.5

# doc2 = nlp(u"For her last birthday, Sisanda had a special treat – her parents got permission for her to have a party at the game reserve. The giraffes at the reserve were curious about this group of people. They stretched out their long necks for the best view of the party and they even seemed to want some of the birthday cake! Sisanda loved the giraffes. All animals were special to her, but it was the quiet and gentle giraffes that stole her heart. She could spend all day watching them.One Friday, Sisanda's father came home from work early. He looked very upset. "What's wrong, Baba?" Sisanda asked. "Today a swarm of bees stung a mother giraffe," explained Sisanda's father. "Her head was so swollen from all the stings that her beautiful eyes were closed. We tried everything to help her, but it was no use – she died. And the saddest part of all is that she had a young calf that still needs her."")


In [21]:
# -3.5

doc3 = nlp(u"For staining Bacillus tuberculosis the following is confidently commended as preferable to the materials and methods heretofore in use. Take glycerine, 20 parts; fuchsin, 3 parts; aniline oil, 2 parts; carbolic acid, 2 parts. The solution is readily and speedily effected, with no danger of precipitation, and can be kept in stock without risk of deterioration. When wanted for use, put about two drops into a watch glass (a small pomatum pot is better) full of water and gently shake or stir. Just here there is some danger of precipitating the coloring matter, but the difficulty is easily avoided by gentle instead of vigorous stirring. After the stain is once dissolved in the water no further trouble occurs; if any evaporation takes place by being left too long, it is the water that goes, not the main solvent. The color should now be a light, translucent red, much too diffuse for writing ink. Put in the smeared cover glass, after passing it a few times through a flame, and leave it, at the ordinary temperature of a comfortable room, half an hour.")

In [22]:
# 1.7

doc4 = nlp(u"When you think of dinosaurs and where they lived, what do you picture? Do you see hot, steamy swamps, thick jungles, or sunny plains? Dinosaurs lived in those places, yes. But did you know that some dinosaurs lived in the cold and the darkness near the North and South Poles? This surprised scientists, too. Paleontologists used to believe that dinosaurs lived only in the warmest parts of the world. They thought that dinosaurs could only have lived in places where turtles, crocodiles, and snakes live today. Later, these dinosaur scientists began finding bones in surprising places. One of those surprising fossil beds is a place called Dinosaur Cove, Australia. One hundred million years ago, Australia was connected to Antarctica. Both continents were located near the South Pole. Today, paleontologists dig dinosaur fossils out of the ground. They think about what those ancient bones must mean.")

In [28]:
for sent in doc1.sents:
    print(len(sent))

31
9
18
31
25
39
22
28


In [48]:
# Length in characters of every token in doc1, longest first; show the top three.
# (spaCy tokens include punctuation marks, so these are token lengths.)
lst = sorted(
    (len(token) for sentence in doc1.sents for token in sentence),
    reverse=True,
)
lst[:3]

[13, 11, 10]

In [50]:
# Length in characters of every token in doc3, longest first; show the top three.
# (spaCy tokens include punctuation marks, so these are token lengths.)
lst = sorted(
    (len(token) for sentence in doc3.sents for token in sentence),
    reverse=True,
)
lst[:3]

[13, 13, 13]

In [51]:
# Length in characters of every token in doc4, longest first; show the top three.
# (spaCy tokens include punctuation marks, so these are token lengths.)
lst = sorted(
    (len(token) for sentence in doc4.sents for token in sentence),
    reverse=True,
)
lst[:3]

[15, 15, 10]

In [26]:
for sent in doc3.sents:
    print(len(sent))

20
23
25
30
25
37
18
34


In [59]:
for sent in doc4.sents:
    print(len(sent))

15
15
8
21
6
16
20
12
15
12
9
11
10


In [60]:
# Token count of each sentence in doc4 (punctuation tokens are counted too);
# print the largest count.
lst = [len(sent) for sent in doc4.sents]
print(max(lst))

21


In [58]:
lst

[15, 15, 8, 21, 6, 16, 20, 12, 15, 12, 9, 11, 10]

In [22]:
doc4[6]

This

In [23]:
doc4[6].is_sent_start

True

## Tokenization - Create individual words from a doc or sentence

In [24]:
mystring = '"We\'re moving to L.A.!"'

In [25]:
mystring

'"We\'re moving to L.A.!"'

In [26]:
print(mystring)

"We're moving to L.A.!"


In [27]:
doc = nlp(mystring)

In [28]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [29]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at https://www.oursite.com")

In [30]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
https://www.oursite.com


In [31]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [32]:
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [33]:
len(doc3)

9

In [34]:
len(doc3.vocab)

800

In [35]:
doc4 = nlp(u"I'm going to visit St. Louis in the U.S. next year")

In [36]:
for t in doc4:
    print(t)

I
'm
going
to
visit
St.
Louis
in
the
U.S.
next
year


In [37]:
len(doc4)

12

In [38]:
len(doc4.vocab)

805

In [39]:
doc5 = nlp(u"It is better to give than receive.")

In [40]:
doc5[0]

It

In [41]:
doc5[2:5]

better to give

In [42]:
doc6 = nlp(u"Apple is going to buy a company in India for $6 Million")

In [43]:
for token in doc6:
    print(token.text, end=' | ')

Apple | is | going | to | buy | a | company | in | India | for | $ | 6 | Million | 

In [44]:
# For every named entity print its text, its label and spaCy's description of
# that label, each on its own line, followed by two blank lines.
for entity in doc6.ents:
    label_help = str(spacy.explain(entity.label_))
    print(entity, entity.label_, label_help, '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


India
GPE
Countries, cities, states


$6 Million
MONEY
Monetary values, including unit




In [45]:
doc7 = nlp(u"Autonomous cars shift insurance liablity toward manufacturers")

In [46]:
for chunk in doc7.noun_chunks:
    print(chunk)

Autonomous cars
insurance liablity
manufacturers


## Visualization with Spacy

In [47]:
from spacy import displacy

In [48]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [49]:
displacy.render(doc,style='dep')

In [50]:
displacy.render(doc,style='dep',options={'distance':80})

In [51]:
# 'distance' is an option of the dependency ('dep') visualizer only; the entity
# visualizer does not use it, so it is omitted here.
displacy.render(doc, style='ent')

In [52]:
# displacy.serve(doc, style='dep') # To render spacy visuals using .py script

## Stemming - Reduce word variants to a common stem (e.g. 'runs' → 'run', 'fairness' → 'fair')

In [53]:
import nltk

In [54]:
from nltk.stem.porter import PorterStemmer

In [55]:
p_stemmer = PorterStemmer()

In [56]:
words = ['run','runner','ran','runs','easily','fairly','fairness']

In [57]:
for word in words:
    print(word + '--->' + p_stemmer.stem(word))

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fairli
fairness--->fair


In [58]:
from nltk.stem.snowball import SnowballStemmer

In [59]:
s_stemmer = SnowballStemmer('english')

In [60]:
for word in words:
    print(word + '--->' + s_stemmer.stem(word))

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fair
fairness--->fair


In [61]:
words = ['generous','generation','generously','generate']

In [62]:
for word in words:
    print(s_stemmer.stem(word))

generous
generat
generous
generat


## Lemmatization - Reducing words to their dictionary form (lemma) with spaCy; a better version of stemming

In [63]:
doc = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [64]:
# One row per token: text, coarse POS tag, integer lemma ID and lemma string,
# with ' \t\t ' between the columns.
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t\t ')

I 		 PRON 		 4690420944186131903 		 I
am 		 AUX 		 10382539506755952630 		 be
a 		 DET 		 11901859001352538922 		 a
runner 		 NOUN 		 12640964157389618806 		 runner
running 		 VERB 		 12767647472892411841 		 run
in 		 ADP 		 3002984154512732771 		 in
a 		 DET 		 11901859001352538922 		 a
race 		 NOUN 		 8048469955494714898 		 race
because 		 SCONJ 		 16950148841647037698 		 because
I 		 PRON 		 4690420944186131903 		 I
love 		 VERB 		 3702023516439754181 		 love
to 		 PART 		 3791531372978436496 		 to
run 		 VERB 		 12767647472892411841 		 run
since 		 SCONJ 		 10066841407251338481 		 since
I 		 PRON 		 4690420944186131903 		 I
ran 		 VERB 		 12767647472892411841 		 run
today 		 NOUN 		 11042482332948150395 		 today


In [65]:
def show_lemmas(text):
    """Print an aligned table of lemma information, one row per token.

    Args:
        text: Any iterable whose items expose ``text``, ``pos_``, ``lemma``
            and ``lemma_`` attributes (the notebook passes a parsed document).

    Each printed row has four space-separated columns: the token text padded
    to 12 characters, the POS tag padded to 6, the numeric lemma ID
    left-aligned in 22, and finally the lemma string. Nothing is returned.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [66]:
show_lemmas(doc)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   4690420944186131903    I
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   4690420944186131903    I
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today


## Stop Words - Like 'a', 'an', 'the' (most common words) generally need to be removed

In [67]:
# spaCy's default English stop-word set; more entries can be added to it if needed.

print(nlp.Defaults.stop_words)

{'whole', '‘d', 'i', '‘s', 'your', 'hers', 'third', 'these', 'the', 'my', 'none', 'behind', 'other', 'up', 'should', "'m", 'even', 'thereby', 'already', 'who', 'forty', 'to', 'he', 'am', "'d", 'doing', "'ll", 'perhaps', 'than', 'anywhere', 'and', 'become', 'less', 'made', 'front', 'an', '‘ll', 'over', 'across', 'hereupon', 'while', 'those', 'regarding', 'our', 'whence', 'besides', 'although', 'out', 'using', 'before', 'since', 'name', 'thereafter', 'next', 'say', 'fifty', 'this', 'take', 'some', 'nobody', 'call', 'hence', 'further', 'much', 'anyone', 'same', 'almost', 'one', 'fifteen', 'once', 'seemed', 'becoming', 'often', 'via', 're', 'otherwise', 'not', 'ten', 'quite', 'what', 'indeed', 'whereby', 'five', 'meanwhile', 'mostly', 'therein', 'a', 'serious', 'sometimes', 'anyway', 'be', 'yet', 'becomes', 'us', 'around', 'ourselves', 'when', 'was', 'thence', '‘m', 'three', 'more', 'full', 'within', 'on', '’d', '’ll', 'towards', 'various', 'themselves', 'elsewhere', 'formerly', 'they', 'b

In [68]:
nlp.vocab['is'].is_stop

True

In [69]:
nlp.vocab['mystery'].is_stop

False

In [70]:
nlp.Defaults.stop_words.add('btw')

In [71]:
nlp.vocab['btw'].is_stop = True

In [72]:
len(nlp.Defaults.stop_words)

327

In [73]:
nlp.vocab['btw'].is_stop

True

In [74]:
# discard() instead of remove(): re-running this cell, or running it before
# 'btw' was added, must not raise KeyError.
nlp.Defaults.stop_words.discard('btw')

In [75]:
nlp.vocab['btw'].is_stop = False

In [76]:
nlp.vocab['btw'].is_stop

False

## Phrase Matching and Vocabulary - Pattern matching similar to Regex but more powerful

In [77]:
from spacy.matcher import Matcher

In [78]:
matcher = Matcher(nlp.vocab)

In [79]:
# Token patterns for three spellings of the same term:
#   SolarPower   -> pattern1 (single token, compared in lowercase)
#   solar-power  -> pattern2 (two words with a punctuation token between them)
#   solar power  -> pattern3 (two adjacent words)

pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}] # exactly one punctuation token between the words; no 'OP' quantifier (e.g. '*') is set here
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [80]:
# Register all three patterns under the single match key 'SolarPower'.
matcher.add('SolarPower', [pattern1,pattern2,pattern3])

In [81]:
doc = nlp(u"The Solar--Power industry continues to grow a solarpower increases. Solar-power is amazing.")

In [82]:
found_matches = matcher(doc)

In [83]:
print(found_matches)

[(8656102463236116519, 1, 4), (8656102463236116519, 9, 10), (8656102463236116519, 12, 15)]


In [84]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 4 Solar--Power
8656102463236116519 SolarPower 9 10 solarpower
8656102463236116519 SolarPower 12 15 Solar-power


In [85]:
from spacy.matcher import PhraseMatcher

In [86]:
matcher = PhraseMatcher(nlp.vocab)

In [87]:
# Read the Reaganomics article from disk and run it through the spaCy pipeline.
# NOTE(review): the path is absolute and machine-specific, and no encoding is
# passed to open(), so the platform default is used — confirm both before
# sharing this notebook.
with open('C:\\Users\\akumar5\\Documents\\Python Bootcamp\\NLP with Deep Learning\\TextFiles\\reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [88]:
# Phrases to look for in the article.
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Convert each phrase to a Doc. Only tokenization is needed for the
# PhraseMatcher's default exact-text matching, so nlp.make_doc() is used
# instead of running the whole pipeline on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# spaCy v3 signature: add(key, list_of_docs). This matches the list form
# already used for the token Matcher earlier in the notebook and replaces the
# legacy add(key, None, *docs) call.
matcher.add('VoodooEconomics', phrase_patterns)

# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc3)

In [89]:
# (match_id, start, end)
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

## Part of Speech Tagging

In [90]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [91]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [92]:
doc[4].pos_

'VERB'

In [93]:
doc[4].tag_

'VBD'

In [94]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [95]:
doc2 = nlp(u"I read books on NLP.")

In [96]:
word = doc2[1]

In [97]:
word.text

'read'

In [98]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [99]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [100]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [101]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [102]:
doc.vocab[90].text

'DET'

In [103]:
# One row per POS ID in ascending order: the numeric ID, its name looked up in
# the vocab (padded to 5 characters) and the number of occurrences.
for k in sorted(POS_counts):
    print(f"{k}  {doc.vocab[k].text:5} {POS_counts[k]}")

84  ADJ   3
85  ADP   1
90  DET   2
92  NOUN  3
94  PART  1
97  PUNCT 1
100  VERB  1


In [104]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

In [105]:
TAG_counts

{15267657372422890137: 2,
 10554686591937588953: 3,
 15308085513773655218: 3,
 17109001835818727656: 1,
 1292078113972184607: 1,
 74: 1,
 12646065887601541794: 1}

In [106]:
# One row per tag ID in ascending order: the numeric ID, its name looked up in
# the vocab (padded to 5 characters) and the number of occurrences.
for k in sorted(TAG_counts):
    print(f"{k}  {doc.vocab[k].text:5} {TAG_counts[k]}")

74  POS   1
1292078113972184607  IN    1
10554686591937588953  JJ    3
12646065887601541794  .     1
15267657372422890137  DT    2
15308085513773655218  NN    3
17109001835818727656  VBD   1


In [107]:
len(doc.vocab)

2274

In [108]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

In [109]:
DEP_counts

{415: 2,
 402: 3,
 429: 1,
 8206900633647566924: 1,
 443: 1,
 440: 1,
 8110129090154140942: 1,
 439: 1,
 445: 1}

In [110]:
# One row per dependency-label ID in ascending order: the numeric ID, its name
# looked up in the vocab (padded to 5 characters) and the number of occurrences.
for k in sorted(DEP_counts):
    print(f"{k}  {doc.vocab[k].text:5} {DEP_counts[k]}")

402  amod  3
415  det   2
429  nsubj 1
439  pobj  1
440  poss  1
443  prep  1
445  punct 1
8110129090154140942  case  1
8206900633647566924  ROOT  1


### Visualizing Parts of Speech (POS)

In [111]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [112]:
displacy.render(doc,style='dep')

In [113]:
# displaCy rendering options. 'compact' is a real boolean here: the string
# 'True' only worked because any non-empty string is truthy, so 'False' would
# have enabled compact mode as well.
options = {'distance': 100, 'compact': True, 'color': 'black', 'bg': 'white', 'font': 'Arial'}

In [114]:
displacy.render(doc,style='dep', options=options)

In [115]:
doc2 = nlp(u"This is a sentence. This is a second sentence that is larger.")

In [116]:
for sent in doc2.sents:
    print(sent)

This is a sentence.
This is a second sentence that is larger.


In [117]:
spans = list(doc2.sents)

In [121]:
# displacy.serve(spans, style='dep')

## Sentiment Analysis using NLTK

In [122]:
import nltk

In [123]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\akumar5\AppData\Roaming\nltk_data...


True

In [124]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer