## Course Information
🎥 **[COURSE VIDEOS](https://www.udemy.com/course/nlp-natural-language-processing-with-python/learn)**

## Setup

In [7]:
# DRIVE
from google.colab import drive
drive.mount('/content/drive')

# Absolute path under the mount point above, so file reads do not depend on
# the notebook's current working directory being /content.
data_path = '/content/drive/MyDrive/Colab Notebooks/NLP Course/data'



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Spacy

**If things start to fail in the future, run the following:**


In [8]:
# !pip install https://github.com/explosion/spacy-models/releases/download/en_trf_xlnetbasecased_lg-2.2.0/en_trf_xlnetbasecased_lg-2.2.0.tar.gz

In [9]:
import spacy
# import spacy_transformers
# nlp = spacy.load('en_trf_xlnetbasecased_lg') 
nlp = spacy.load('en_core_web_sm')
print("All Spacy happy!")

All Spacy happy!


# NLP Basics
- spaCy
- NLTK

# Spacy Basics

In [10]:
nlp = spacy.load('en_core_web_sm')

In [11]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [12]:
for token in doc:
  print(f"{token.text:12} {token.pos:5} {token.pos_:9} {token.dep_:9}")

Tesla           96 PROPN     nsubj    
is              87 AUX       aux      
looking        100 VERB      ROOT     
at              85 ADP       prep     
buying         100 VERB      pcomp    
U.S.            96 PROPN     compound 
startup         92 NOUN      dobj     
for             85 ADP       prep     
$               99 SYM       quantmod 
6               93 NUM       compound 
million         93 NUM       pobj     


In [13]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f5314049090>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f5314129c20>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f53131f56e0>)]

In [14]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [15]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [16]:
for token in doc2:
  print(f"{token.text:12} {token.pos:5} {token.pos_:9} {token.dep_:9}")

Tesla           96 PROPN     nsubj    
is              87 AUX       aux      
n't             94 PART      neg      
looking        100 VERB      ROOT     
into            85 ADP       prep     
startups        92 NOUN      pobj     
anymore         86 ADV       advmod   
.               97 PUNCT     punct    


In [17]:
# get individual tokens
doc2[0]

Tesla

In [18]:
doc2[0].pos_

'PROPN'

# Spans
Parts of a document

In [19]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [20]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [21]:
type(life_quote)

spacy.tokens.span.Span

In [22]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [23]:
for sentence in doc4.sents:
  print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [24]:
token = doc4[6]

In [25]:
token.is_sent_start

True

# Tokenization

In [26]:
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [27]:
# Create a Doc object and explore tokens
doc = nlp(mystring)

for token in doc:
    print(token.text, end=' | ')

" | We | 're | moving | to | L.A. | ! | " | 

In [28]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [29]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [30]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


<font color=green>Here the abbreviations for "Saint" and "United States" are both preserved.</font>

In [31]:
len(doc4.vocab)

554

In [32]:
doc5 = nlp(u"It is better to give than receive.")

In [33]:
doc5[0]

It

In [34]:
doc5[2:5]

better to give

# Named Entities

In [35]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc8:
    print(token.text, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [36]:
for entity in doc8.ents:
  print(entity)
  print(entity.label_)
  print(str(spacy.explain(entity.label_)))
  print()

Apple
ORG
Companies, agencies, institutions, etc.

Hong Kong
GPE
Countries, cities, states

$6 million
MONEY
Monetary values, including unit



# Noun Chunk

In [37]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [38]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


# Visualizing Tokenization

In [39]:
from spacy import displacy

In [40]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [43]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':75})

In [46]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for profit of $6 million.')

In [47]:
displacy.render(doc, style='ent', jupyter=True, options={'distance':75})

In [49]:
# doc = nlp(u"This is a sentence.")
# displacy.serve(doc, style='dep')

# Stemming

Stemming is reducing a word to its base:
```
boating > boaters > boats = boat
```
It is problematic enough in English, which has many exceptions, that spaCy does not include a stemmer, opting instead for lemmatization.

In [50]:
import nltk

In [58]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [66]:
words = ['run','runner','ran','runs','easily','fairly', 'fairness', 'generous','generation','generate']

In [67]:
stemmer = PorterStemmer()
for word in words:
  print(f'{word:10} --->   {stemmer.stem(word):10}')

run        --->   run       
runner     --->   runner    
ran        --->   ran       
runs       --->   run       
easily     --->   easili    
fairly     --->   fairli    
fairness   --->   fair      
generous   --->   gener     
generation --->   gener     
generate   --->   gener     


In [68]:
stemmer = SnowballStemmer(language='english')
for word in words:
  print(f'{word:10} --->   {stemmer.stem(word):10}')

run        --->   run       
runner     --->   runner    
ran        --->   ran       
runs       --->   run       
easily     --->   easili    
fairly     --->   fair      
fairness   --->   fair      
generous   --->   generous  
generation --->   generat   
generate   --->   generat   


# Lemmatization

Looking for a `lemma` of each word.

In [69]:
doc = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [74]:
def show_lemmas(text):
    """Print one aligned row per token: text, coarse POS tag, lemma id, lemma.

    ``text`` is any iterable whose items expose ``text``, ``pos_``, ``lemma``
    and ``lemma_`` attributes (in this notebook, the tokens of a parsed doc).
    """
    # Column widths: 12 for the token text, 6 for the POS tag, and 22
    # (left-aligned) for the numeric lemma id; the lemma string is unpadded.
    row_template = '{0:12} {1:6} {2:<22} {3}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [75]:
show_lemmas(doc)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


In [79]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop Words

Get rid of the `305` words that add nothing but confusion.

In [80]:
print(nlp.Defaults.stop_words)

{"'s", 're', 'please', 'along', 'front', 'indeed', 'show', 'seems', 'each', 'towards', 'else', 'to', 'whenever', 'you', 'an', 'seemed', '’re', 'should', 'two', '‘s', 'among', 'but', 'themselves', 'where', 'therein', 'as', 'yourself', 'made', 'eight', 'than', 'nowhere', 'we', '‘ve', 'whereafter', 'perhaps', "n't", 'just', 'was', 'eleven', 'ours', 'whence', 'herein', 'its', 'ourselves', 'over', 'although', 'get', 'her', 'myself', 'with', 'various', 'never', 'have', 'amount', 'thus', 'yours', 'other', 'is', 'side', 'behind', 'formerly', 'yet', 'others', 'our', 'sometimes', 'part', 'in', 'meanwhile', 'below', 'they', 'fifty', 'mostly', 'ten', 'noone', 'either', 'thereby', 'because', 'until', "'ve", 'former', "'ll", 'becoming', 'hereafter', 'off', 'seeming', 'done', 'unless', 'anything', 'twenty', 'being', 'down', 'put', 'move', 'back', 'make', 'why', 'n’t', 'alone', 'beforehand', 'my', 'elsewhere', 'becomes', 'whole', '’s', 'it', 'own', 'due', 'whose', 'himself', 'next', 'that', 'between',

In [81]:
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x7f52fe8e59b0>

In [82]:
nlp.vocab['is'].is_stop

True

In [83]:
nlp.vocab['mystery'].is_stop

False

In [88]:
# Add new Stop Words if you like
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True
nlp.vocab['btw'].is_stop

True

In [None]:
# or remove any stop_words
# discard() (unlike remove()) does not raise KeyError when the word is already
# absent, so this cell can be re-run without failing.
nlp.Defaults.stop_words.discard('beyond')
nlp.vocab['beyond'].is_stop = False

# Vocabulary and Matching

## Quantifiers and Tokens

This found both two-word patterns, with and without the hyphen!

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


## Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [142]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [143]:
# Looking for all of the following:
#   SolarPower
pattern1 = [{'LOWER':'solarpower'}]
#   Solar-Power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
#   Solar Power
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [144]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [145]:
doc = nlp(u'Solar--power FTW! The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [146]:
matcher(doc)

[(8656102463236116519, 0, 3),
 (8656102463236116519, 6, 8),
 (8656102463236116519, 15, 16),
 (8656102463236116519, 18, 21)]

In [160]:
def print_matches(doc, matcher):
    """Run ``matcher`` over ``doc`` and print one aligned row per match.

    Each row shows the numeric match id, the string it resolves to, the
    start and end indices, and the matched text.

    Args:
        doc: Object to search. It is passed to ``matcher``, sliced with
            ``doc[start:end]``, and its ``vocab.strings`` table is used to
            resolve match ids to their string form.
        matcher: Callable returning an iterable of
            ``(match_id, start, end)`` tuples when called with ``doc``.
    """
    # Resolve ids through the doc's own vocab instead of the notebook-global
    # ``nlp``, so the helper does not depend on which pipeline ``nlp``
    # currently points to.
    strings = doc.vocab.strings
    for match_id, start, end in matcher(doc):
        string_id = strings[match_id]  # get string representation
        span = doc[start:end]          # get the matched span
        print(f"{match_id:<24}{string_id:16}{start:7}{end:7} {span.text}")

In [148]:
print_matches(doc, matcher)

8656102463236116519     SolarPower       0   3 Solar--power
8656102463236116519     SolarPower       6   8 Solar Power
8656102463236116519     SolarPower      15  16 solarpower
8656102463236116519     SolarPower      18  21 Solar-power


In [149]:
# remove them when not needed
matcher.remove('SolarPower')

In [150]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

In [151]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [152]:
print_matches(doc, matcher)

8656102463236116519     SolarPower       0   3 Solar--power
8656102463236116519     SolarPower       6   8 Solar Power
8656102463236116519     SolarPower      15  16 solarpower
8656102463236116519     SolarPower      18  21 Solar-power


In [153]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [154]:
with open(f'{data_path}/reaganomics.txt', encoding="latin") as f:
  doc3 = nlp(f.read())

In [163]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [164]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [165]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [166]:
matcher.add('EconMatcher', None, *phrase_patterns)

In [167]:
found_matches = matcher(doc3)

In [168]:
print_matches(doc3, matcher)

3680293220734633682     EconMatcher          41     45 supply-side economics
3680293220734633682     EconMatcher          49     53 trickle-down economics
3680293220734633682     EconMatcher          54     56 voodoo economics
3680293220734633682     EconMatcher          61     65 free-market economics
3680293220734633682     EconMatcher         673    677 supply-side economics
3680293220734633682     EconMatcher        2986   2990 trickle-down economics


# Assessment