### Spacy Basics

In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [3]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x29e228d2788>),
 ('parser', <spacy.pipeline.DependencyParser at 0x29e228bab28>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x29e228d7108>)]

In [5]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [10]:
# One row per token: text, coarse part-of-speech tag, dependency label.
for token in doc:
    print(token, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [16]:
doc2 = nlp(u'Tesla isnt looking at buying U.S. startup for $6 million')

In [17]:
# One row per token: text, coarse part-of-speech tag, dependency label.
for token in doc2:
    print(token, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
nt ADV advmod
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [18]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [20]:
doc3[16:30], type(doc3[16:30])

("Life is what happens to us while we are making other plans",
 spacy.tokens.span.Span)

In [21]:
type(doc3)

spacy.tokens.doc.Doc

In [23]:
doc4 = nlp('I like to eat. My favourite food is chicken. This is end sentences.')

In [24]:
for sentence in doc4.sents:
    print(sentence)

I like to eat.
My favourite food is chicken.
This is end sentences.


### Tokenization - Part One

In [26]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [32]:
doc1 = "Apple is looking to invest $6 millions at Hong Kong"

In [34]:
for t in nlp(doc1):
    print(t, end = " | ")

Apple | is | looking | to | invest | $ | 6 | millions | at | Hong | Kong | 

In [35]:
for ent in nlp(doc1).ents:
    print(ent)

Apple
$6 millions
Hong Kong


### Tokenization - Part Two

In [5]:
from spacy import displacy

In [4]:
doc = nlp(u"Apple is going to open a factory in U.K which cost $5.5 millions.")

In [7]:
displacy.render(doc, style  = "dep", jupyter = True, options = {'distance': 110})

In [8]:
doc2 = nlp(u"This is a sentence.")
displacy.render(doc2, jupyter=True, style = "dep")

In [12]:
doc3 = nlp(u"My name is Amin. I'm a computer science graduate. ")
# Highlight the named entities found in doc3 inline in the notebook.
# 'distance' is a setting of the dependency ("dep") visualizer only and has no
# effect on the entity ("ent") visualizer, so it is not passed here.
displacy.render(doc3, style="ent", jupyter=True)

### Stemming

In [13]:
from nltk.stem.porter import PorterStemmer

In [14]:
p_stemmer = PorterStemmer()

In [22]:
words = ['eat', 'ate', 'eaten', 'eats', 'easily', 'easy', 'easier', 'fairly', 'fairness']

In [23]:
for word in words:
    print(word, "----->" , p_stemmer.stem(word))

eat -----> eat
ate -----> ate
eaten -----> eaten
eats -----> eat
easily -----> easili
easy -----> easi
easier -----> easier
fairly -----> fairli
fairness -----> fair


In [24]:
from nltk.stem.snowball import SnowballStemmer

In [25]:
s_stemmer = SnowballStemmer(language = 'english')

In [26]:
for word in words:
    print(word, "----->" , s_stemmer.stem(word))

eat -----> eat
ate -----> ate
eaten -----> eaten
eats -----> eat
easily -----> easili
easy -----> easi
easier -----> easier
fairly -----> fair
fairness -----> fair


### Lemmatization

In [27]:
doc = nlp(u"Apple is going to open a factory in U.K which cost $5.5 millions.")

In [39]:
# Aligned columns: token text, coarse POS tag, integer lemma ID, lemma string.
# print() joins the four padded fields with single spaces.
for word in doc:
    print(f"{word.text:10}", f"{word.pos_:10}", f"{word.lemma:<30}", f"{word.lemma_:10}")

Apple      PROPN      8566208034543834098            apple     
is         VERB       10382539506755952630           be        
going      VERB       8004577259940138793            go        
to         PART       3791531372978436496            to        
open       VERB       8092125317261700160            open      
a          DET        11901859001352538922           a         
factory    NOUN       3867587030844307195            factory   
in         ADP        3002984154512732771            in        
U.K        PROPN      11016456344514401883           u.k       
which      ADJ        7063653163634019529            which     
cost       VERB       7764511969837601066            cost      
$          SYM        11283501755624150392           $         
5.5        NUM        8345312113965934542            5.5       
millions   NOUN       17365054503653917826           million   
.          PUNCT      12646065887601541794           .         


In [46]:
def show_lemma(text):
    """Print an aligned lemma table, one row per token.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` (in this notebook, a parsed spaCy Doc).

    Each row holds the token text, its POS tag, the integer lemma ID and the
    lemma string, padded to widths 10 / 10 / 30 / 10 and separated by single
    spaces. Nothing is returned.
    """
    for tok in text:
        # print() inserts the single-space separators between the padded fields.
        print(f"{tok.text:10}", f"{tok.pos_:10}", f"{tok.lemma:<30}", f"{tok.lemma_:10}")

In [50]:
# NOTE(review): the output recorded for show_lemma(doc3) below lists "run" as the
# last word, not "ran", and this cell's execution count (50) is later than that
# call's (49) - the sentence was edited after the output was captured.
# Re-run the show_lemma(doc3) cell to refresh it.
doc3 = nlp(u"I join this running event because I love to ran.")

In [49]:
show_lemma(doc3)

I          PRON       561228191312463089             -PRON-    
join       VERB       16238441731120403936           join      
this       DET        1995909169258310477            this      
running    VERB       12767647472892411841           run       
event      NOUN       16065740214838660377           event     
because    ADP        16950148841647037698           because   
I          PRON       561228191312463089             -PRON-    
love       VERB       3702023516439754181            love      
to         PART       3791531372978436496            to        
run        VERB       12767647472892411841           run       
.          PUNCT      12646065887601541794           .         


### Stop words

In [51]:
nlp.Defaults.stop_words

{'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'g

In [52]:
len(nlp.Defaults.stop_words)

305

In [54]:
nlp.vocab['is'].is_stop

True

In [55]:
nlp.vocab['btw'].is_stop

False

In [56]:
# Keep the defaults set and the vocab flag in sync: add the word to the
# stop-word set and mark its vocab entry, the mirror image of how 'again'
# is removed from both places further down.
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [57]:
nlp.vocab['btw'].is_stop

True

In [58]:
# stop_words is a set; discard() is used instead of remove() so that running
# this cell again, after 'again' is already gone, does not raise KeyError.
nlp.Defaults.stop_words.discard('again')

In [59]:
nlp.vocab['again'].is_stop = False

In [60]:
nlp.vocab['again'].is_stop

False

### Phrase Matching and Vocabulary

In [67]:
from spacy.matcher import Matcher

In [107]:
matcher = Matcher(nlp.vocab)

In [79]:
# Three token-level patterns, one per spelling variant. LOWER compares against
# the lowercased token text, so the match is case-insensitive.

# "SolarPower" - a single token
pattern1 = [dict(LOWER='solarpower')]

# "Solar power" - two consecutive tokens
pattern2 = [dict(LOWER='solar'), dict(LOWER='power')]

# "Solar-power" - two tokens separated by one punctuation token
pattern3 = [dict(LOWER='solar'), dict(IS_PUNCT=True), dict(LOWER='power')]

# Register all three variants under a single rule name (no callback).
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [86]:
doc = nlp(u'The SolaR Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [87]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [88]:
# Each match is a (rule ID, start token index, end token index) triple.
for match_id, start, end in found_matches:
    # Look the ID up in the string store to recover the rule name, and slice
    # the Doc to recover the matched text.
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

8656102463236116519 SolarPower 1 3 SolaR Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


### Phrase Matching and Vocabulary - Phrase Matching

In [108]:
# Import the PhraseMatcher class and build one on the shared vocabulary.
# NOTE: this rebinds `matcher`, replacing the Matcher instance created in the
# previous section; any earlier cell that uses `matcher` will operate on this
# PhraseMatcher if it is re-run after this point.
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [102]:
# Read the whole article and parse it into a single Doc.
# NOTE(review): the path is relative to the kernel's working directory, and no
# encoding is passed to open(), so decoding uses the platform default -
# TODO confirm the file's encoding and pin it explicitly.
with open('UPDATED_NLP_COURSE/TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [103]:
doc3

REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics

Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.

The four pillars of Reagan's economic policy were to reduce the growth of government spending, reduce the federal income tax and capital gains tax, reduce government regulation, and tighten the money supply in order to reduce inflation.[2]

The results of Reaganomics are still debated. Supporters point to the end of stagflation, stronger GDP growth, and an entrepreneur revolution in the decades that followed.[3][4] Critics point to the widening income gap, an atmosphere of greed, and the national debt tripling in eight years which ultimately reversed the pos

In [109]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [110]:
phrase_patterns = [nlp(text) for text in phrase_list]
phrase_patterns


[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [114]:
# Parse every phrase into a spaCy Doc; these Docs are what get registered as patterns.
phrase_patterns = [nlp(phrase) for phrase in phrase_list]

# Register all of the phrase Docs on the matcher under one rule name (no callback).
matcher.add('VoodooEconomics', None, *phrase_patterns)

In [115]:
# Build a list of matches:
matches = matcher(doc3)

In [116]:
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2985, 2989)]

In [117]:
# Each match is a (rule ID, start token index, end token index) triple.
for match_id, start, end in matches:
    # Look the ID up in the string store to recover the rule name, and slice
    # the Doc to recover the matched phrase.
    print(match_id, nlp.vocab.strings[match_id], start, end, doc3[start:end].text)

3473369816841043438 VoodooEconomics 41 45 supply-side economics
3473369816841043438 VoodooEconomics 49 53 trickle-down economics
3473369816841043438 VoodooEconomics 54 56 voodoo economics
3473369816841043438 VoodooEconomics 61 65 free-market economics
3473369816841043438 VoodooEconomics 673 677 supply-side economics
3473369816841043438 VoodooEconomics 2985 2989 trickle-down economics
