 From Applied Natural Language Processing in the Enterprise ch1

In [1]:
import transformers


In [2]:
?transformers

[0;31mType:[0m            _LazyModule
[0;31mString form:[0m     <module 'transformers' from '/home/user/anaconda3/envs/crs1278/lib/python3.6/site-packages/transformers/__init__.py'>
[0;31mFile:[0m            ~/anaconda3/envs/crs1278/lib/python3.6/site-packages/transformers/__init__.py
[0;31mDocstring:[0m       <no docstring>
[0;31mClass docstring:[0m Module class that surfaces all objects but only performs associated imports when the objects are requested.


In [3]:
from transformers import pipeline

In [4]:
classifier = pipeline("sentiment-analysis")

In [5]:
classifier("This is lovely!")

[{'label': 'POSITIVE', 'score': 0.9998669028282166}]

In [6]:
text_generator = pipeline("text-generation")

In [7]:
text_generator("Did you not see my lady? ", max_length=50, do_sample=False)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Did you not see my lady? \xa0She was in the middle of the road, and I was trying to get her to stop. \xa0She was trying to get out of the way, and I was trying to get her to stop.'}]

# spaCy

In [1]:
# Import spacy and download language model
import spacy

#! spacy download en_core_web_trf
#! spacy download en_core_web_sm

Collecting en-core-web-trf==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0-py3-none-any.whl (460.2 MB)
[K     |████████████████████████████████| 460.2 MB 2.0 kB/s  eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 15.4 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# Tokenization
sentence = nlp.tokenizer("We live in Paris.")

In [4]:
sentence

We live in Paris.

In [5]:
# Length of sentence
print("The number of tokens: ", len(sentence))

# Print individual words (i.e., tokens)
print("The tokens: ")
for words in sentence:
    print(words)

The number of tokens:  5
The tokens: 
We
live
in
Paris
.


In [6]:
import pandas as pd

In [8]:
data = pd.read_csv('/home/user/crs1278/nltk/JEOPARDY_CSV.csv')


In [10]:
# Lowercase, strip whitespace, and view column names
data.columns = map(lambda x: x.lower().strip(), data.columns)

In [11]:
data.columns

Index(['show number', 'air date', 'round', 'category', 'value', 'question',
       'answer'],
      dtype='object')

In [12]:
# Reduce size of data
data = data[0:1000]

In [13]:
# Tokenize Jeopardy Questions
data["question_tokens"] = data["question"].apply(lambda x: nlp(x))

In [14]:
# View first question
example_question = data.question[0]
example_question_tokens = data.question_tokens[0]
print("The first questions is:")
print(example_question)

The first questions is:
For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory


In [15]:
# Print individual tokens of first question
print("The tokens from the first question are:")
for tokens in example_question_tokens:
    print(tokens)

The tokens from the first question are:
For
the
last
8
years
of
his
life
,
Galileo
was
under
house
arrest
for
espousing
this
man
's
theory


In [16]:
# Print Part-of-speech tags for tokens in the first question
print("Here are the Part-of-speech tags for each token in the first question:")
for token in example_question_tokens:
    print(token.text,token.pos_, spacy.explain(token.pos_))

Here are the Part-of-speech tags for each token in the first question:
For ADP adposition
the DET determiner
last ADJ adjective
8 NUM numeral
years NOUN noun
of ADP adposition
his PRON pronoun
life NOUN noun
, PUNCT punctuation
Galileo PROPN proper noun
was AUX auxiliary
under ADP adposition
house NOUN noun
arrest NOUN noun
for ADP adposition
espousing VERB verb
this DET determiner
man NOUN noun
's PART particle
theory NOUN noun


In [17]:
# Print Dependency Parsing tags for tokens in the first question
for token in example_question_tokens:
    print(token.text,token.dep_, spacy.explain(token.dep_))

For prep prepositional modifier
the det determiner
last amod adjectival modifier
8 nummod numeric modifier
years pobj object of preposition
of prep prepositional modifier
his poss possession modifier
life pobj object of preposition
, punct punctuation
Galileo nsubj nominal subject
was ROOT None
under prep prepositional modifier
house compound compound
arrest pobj object of preposition
for prep prepositional modifier
espousing pcomp complement of preposition
this det determiner
man poss possession modifier
's case case marking
theory dobj direct object


In [18]:
# Visualize the dependency parse
from spacy import displacy

displacy.render(example_question_tokens, style='dep',
                jupyter=True, options={'distance': 120})

In [19]:
# Print tokens for example sentence without chunking
for token in nlp("My parents live in New York City."):
    print(token.text)

My
parents
live
in
New
York
City
.


In [20]:
# Print chunks for example sentence
for chunk in nlp("My parents live in New York City.").noun_chunks:
      print(chunk.text)

My parents
New York City


In [21]:
# Print lemmatization for tokens in the first question
lemmatization = pd.DataFrame(data=[], \
  columns=["original","lemmatized"])
i = 0
for token in example_question_tokens:
    lemmatization.loc[i,"original"] = token.text
    lemmatization.loc[i,"lemmatized"] = token.lemma_
    i = i+1

lemmatization

Unnamed: 0,original,lemmatized
0,For,for
1,the,the
2,last,last
3,8,8
4,years,year
5,of,of
6,his,his
7,life,life
8,",",","
9,Galileo,Galileo


In [22]:
# Print NER results
example_sentence = "George Washington was an American political leader, \
military general, statesman, and Founding Father who served as the \
first president of the United States from 1789 to 1797.\n"

print(example_sentence)

print("Text Start End Label")
doc = nlp(example_sentence)
for token in doc.ents:
    print(token.text, token.start_char, token.end_char, token.label_)

George Washington was an American political leader, military general, statesman, and Founding Father who served as the first president of the United States from 1789 to 1797.

Text Start End Label
George Washington 0 17 PERSON
American 25 33 NORP
first 119 124 ORDINAL
the United States 138 155 GPE
1789 to 1797 161 173 DATE


In [23]:
# Import libraries
import requests

# Define Google Knowledge Graph API Result function
def returnGraphResult(query, key, entityType):
    if entityType=="PERSON":
        google = f"https://kgsearch.googleapis.com/v1/entities:search\
         ?query={query}&key={key}"
        resp = requests.get(google)
        url = resp.json()['itemListElement'][0]['result']\
         ['detailedDescription']['url']
        description = resp.json()['itemListElement'][0]['result']\
         ['detailedDescription']['articleBody']
        return url, description
    else:
        return "no_match", "no_match"

In [24]:
# Print Wikipedia descriptions and URLs for entities
for token in doc.ents:
    url, description = returnGraphResult(token.text, key, token.label_)
    print(token.text, token.label_, url, description)

NameError: name 'key' is not defined