# Exploring NLP tools

## NLTK

### Install NLTK

In [1]:
!pip install nltk



### Explore NLTK

In [2]:
# Load a raw plaintext corpus from Project Gutenberg
import requests
import nltk

f = requests.get('https://www.gutenberg.org/cache/epub/68196/pg68196.txt')
start_str = "*** START OF THE PROJECT GUTENBERG EBOOK ALIEN ***"
start = f.text.find(start_str) + len(start_str)
end = f.text.rfind("*** END OF THE PROJECT GUTENBERG EBOOK ALIEN ***")
raw_text = f.text[start:end]
print(raw_text[:1000])
print('-' * 80)
print(raw_text[-1000:])







                                 ALIEN

                          BY GEORGE O. SMITH

           [Transcriber's Note: This etext was produced from
               Astounding Science-Fiction, October 1946.
         Extensive research did not uncover any evidence that
         the U.S. copyright on this publication was renewed.]


The telephone rang and the lieutenant of police Timothy McDowell
grunted. He put down his magazine, and hastily covered the
partially-clad damsel on the front cover before he answered the ringing
phone.

"McDowell," he grunted.

"McDowell," came the voice in his ear. "I think ye'd better come overe
here."

"What's up?"

"Been a riot at McCarthy's on Boylston Street."

"That's nothing new," growled McDowell, "excepting sometimes it's
Hennesey's on Dartmouth or Kelley's on Massachusetts."

"Yeah, but this is different."

"Whut's so different about a riot in a jernt like McCarthy's on a
street like Boylston?"

"Well, the 
-------------------------------------

In [3]:
# Tokenize our corpus into sentences and tokens (list of lists)
sent_toks = []
for s in nltk.sent_tokenize(raw_text):
    sent_toks.append(nltk.word_tokenize(s))
sent_toks[:2]

[['ALIEN',
  'BY',
  'GEORGE',
  'O.',
  'SMITH',
  '[',
  'Transcriber',
  "'s",
  'Note',
  ':',
  'This',
  'etext',
  'was',
  'produced',
  'from',
  'Astounding',
  'Science-Fiction',
  ',',
  'October',
  '1946',
  '.'],
 ['Extensive',
  'research',
  'did',
  'not',
  'uncover',
  'any',
  'evidence',
  'that',
  'the',
  'U.S.',
  'copyright',
  'on',
  'this',
  'publication',
  'was',
  'renewed',
  '.',
  ']']]

In [4]:
%%time
# Create a maximum likelihood expectation (MLE) language model from the corpus
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

N = 4
train, vocab = padded_everygram_pipeline(N, sent_toks)

lm = MLE(N)
lm.fit(train, vocab)
print(lm.vocab)
print(lm.counts)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1228 items>
<NgramCounter with 4 ngram orders and 26666 ngrams>
CPU times: user 325 ms, sys: 7.79 ms, total: 333 ms
Wall time: 330 ms


In [5]:
# Generate some text from the language model using a seed 2-gram
toks = lm.generate(32, text_seed='Tell them'.split(), random_seed=42)
' '.join(toks)

"that you were afraid , that you'd been hiding because of the differences in evolutionary ending of the host . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>"

In [6]:
# Check for log likelihood this sequence exists in the language model
lm.logscore('bit', 'was a'.split())

-2.8073549220576046

## GenSim

### Install GenSim

In [7]:
!pip install gensim



### Explore GenSim

In [8]:
import gensim.models

model = gensim.models.Word2Vec(sentences=sent_toks, vector_size=5)
word_vectors = model.wv
del model

In [9]:
for index, word in enumerate(word_vectors.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(word_vectors.index_to_key)} is {word}, vec = {word_vectors[word]}")

word #0/156 is ., vec = [ 0.19978775  0.85359436  1.1713744  -0.9864809  -0.3433871 ]
word #1/156 is ,, vec = [ 0.1172794  1.1657429  1.4586245 -1.5009246 -0.2580464]
word #2/156 is the, vec = [ 0.43704826  1.0936207   1.3120347  -1.3994651  -0.29396105]
word #3/156 is '', vec = [ 0.12169423  0.6942297   0.7988962  -1.0238564  -0.2958518 ]
word #4/156 is ``, vec = [ 3.2935810e-01  8.1392103e-01  1.0268925e+00 -9.5124507e-01
  4.7777989e-04]
word #5/156 is a, vec = [ 0.09903629  0.63957345  0.93133605 -1.0325854  -0.19885504]
word #6/156 is and, vec = [ 0.02612713  0.6771986   1.0611243  -1.1080297  -0.17047247]
word #7/156 is to, vec = [ 0.13124461  0.84318745  0.7264108  -0.9244929  -0.21209371]
word #8/156 is of, vec = [ 0.05099332  1.037147    0.98514247 -1.3549186  -0.16270308]
word #9/156 is that, vec = [ 0.18680783  0.59298074  1.1109077  -0.9128647   0.05257313]


In [10]:
tokens = "The jury was not cited for contempt of court".split()
for token in tokens:
    if token in word_vectors:
        print(f'{token} = {word_vectors[token]}')
    else:
        print(token)

The = [ 0.26197264  0.20541258  0.47735476 -0.457172   -0.0888974 ]
jury
was = [-0.02212266  0.4195795   0.618374   -0.5979918  -0.19352509]
not = [ 0.2595777   0.38216555  0.62206984 -0.79241335  0.03801519]
cited
for = [ 0.29584187  0.692679    0.77136207 -0.712972    0.05781078]
contempt
of = [ 0.05099332  1.037147    0.98514247 -1.3549186  -0.16270308]
court


In [11]:
import gensim.downloader as api

api.info()['models']

{'fasttext-wiki-news-subwords-300': {'num_records': 999999,
  'file_size': 1005007116,
  'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)',
  'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py',
  'license': 'https://creativecommons.org/licenses/by-sa/3.0/',
  'parameters': {'dimension': 300},
  'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).',
  'read_more': ['https://fasttext.cc/docs/en/english-vectors.html',
   'https://arxiv.org/abs/1712.09405',
   'https://arxiv.org/abs/1607.01759'],
  'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af',
  'file_name': 'fasttext-wiki-news-subwords-300.gz',
  'parts': 1},
 'conceptnet-numberbatch-17-06-300': {'num_records': 1917247,
  'file_size': 1225497562,
  'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016',
  'reader_code': 'https:/

In [12]:
%%time
word_vectors = api.load("glove-wiki-gigaword-50")  # load pre-trained word-vectors from gensim-data

CPU times: user 24.2 s, sys: 341 ms, total: 24.5 s
Wall time: 24.6 s


In [13]:
tokens = "The jury was not cited for contempt of court".split()
for token in tokens:
    if token in word_vectors:
        print(f'{token} = {word_vectors[token]}')
    else:
        print(token)

The
jury = [ 0.13889   -0.15441   -0.65196    0.55466    1.2798     0.60057
  0.7441     1.4171     0.32741    0.40929   -0.69933   -0.42597
 -0.40659    0.22372    1.358     -0.5151    -0.24794   -0.48514
 -0.44527   -1.2945     1.1523     0.86963    0.63349   -0.072768
 -1.0655    -1.9788    -0.35197    0.21787   -0.83935   -0.95735
  1.1546    -0.96692   -0.67812   -1.8802     0.89337   -0.91028
  0.70292    0.0085246  0.49123   -0.95192   -0.56366    0.19392
  0.29582    0.74449   -0.76221   -0.16316   -0.28296    0.082915
  0.047064   0.20868  ]
was = [ 0.086888 -0.19416  -0.24267  -0.33391   0.56731   0.39783  -0.97809
  0.03159  -0.61469  -0.31406   0.56145   0.12886  -0.84193  -0.46992
  0.47097   0.023012 -0.59609   0.22291  -1.1614    0.3865    0.067412
  0.44883   0.17394  -0.53574   0.17909  -2.1647   -0.12827   0.29036
 -0.15061   0.35242   3.124    -0.90085  -0.02567  -0.41709   0.40565
 -0.22703   0.76829   0.60982   0.070068 -0.13271  -0.1201    0.096132
 -0.43998  -0.4

In [14]:
king = word_vectors['king']
man = word_vectors['man']
woman = word_vectors['woman']
queen = word_vectors['queen']

calc_queen = king - man + woman
print('queen / calc_queen', word_vectors.cosine_similarities(queen, [calc_queen]))
print('man / woman', word_vectors.distance('man', 'woman'))
print('king / queen', word_vectors.distance('king', 'queen'))

queen / calc_queen [0.86095816]
man / woman 0.11396622657775879
king / queen 0.2160956859588623


## spaCy

### Install spaCy tools and starter English language model

In [15]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m0:01[0m01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Explore spaCy

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [17]:
# Chunking text on subphrases
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [18]:
# Named entity recognition
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [19]:
# spaCy also has word vectors in its language models
tokens = nlp("The fat cat sat on the mat")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

The True 10.318105 True
fat True 8.668562 True
cat True 8.584026 True
sat True 7.815958 True
on True 8.546586 True
the True 8.815763 True
mat True 8.037259 True
