# Using `spaCy` for part-of-speech tagging and named entity recognition

Before running this notebook, install `spaCy` and download its small English model, e.g. `pip install spacy` followed by `python -m spacy download en_core_web_sm`.

In [1]:
import spacy
import pandas as pd
from spacy.tokens import Doc
from spacy.vocab import Vocab

In [2]:
# Name of the pre-trained English pipeline package to load.
MODEL_NAME = 'en_core_web_sm'
# `nlp` is the callable pipeline used by every later cell to process text.
nlp = spacy.load(MODEL_NAME)

In [3]:
# Load the small pre-defined general English model (this is sufficient for most tasks).
# The model is referenced by its full package name: the short alias 'en' was a
# shortcut link that spaCy v3 no longer supports, while the full name loads on
# both v2 and v3 as long as the package is installed.
nlp = spacy.load('en_core_web_sm')

## For a single sentence

In [4]:
# Process a sample sentence with the loaded pipeline; the result is reused
# by the token-level examples that follow.
sample_sentence = u'Apple is looking at buying U.K. startup for $1 billion.'
doc = nlp(sample_sentence)

In [7]:
# Print one line per token: its text followed by its part-of-speech tag.
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

Apple PROPN
is VERB
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM
. PUNCT


In [8]:
# Visualize Named Entity Recognition (NER) for the sample sentence with
# displaCy's entity style, rendered inline in the notebook.
spacy.displacy.render(
    doc,
    style='ent',
    jupyter=True,
)

#### What this should look like if it ran correctly:

<img src="resources/posner_example.png" alt="displaCy named-entity rendering of the sample sentence" style="width: 1000px;"/>

### Full range of `token` methods and attributes
The complete list is in the spaCy [Token API documentation](https://spacy.io/api/token).

**Note:** attributes that end in an underscore (like `lemma_`) return the human-readable string; the same attribute without the underscore (`lemma`) returns spaCy's integer ID for that string.  
**Note:** the `.dep_` attribute is the token's syntactic dependency relation.

In [15]:
# Show a handful of the available token attributes, one token per line:
# text, lemma, stop-word flag, and syntactic dependency label.
for tok in doc:
    fields = (tok.text, tok.lemma_, tok.is_stop, tok.dep_)
    print(*fields)

Apple apple False nsubj
is be True aux
looking look False ROOT
at at True prep
buying buy False pcomp
U.K. u.k. False compound
startup startup False dobj
for for True prep
$ $ False quantmod
1 1 False compound
billion billion False pobj
. . False punct


## For a pandas dataframe

In [23]:
# Load the CSV whose `abstract` column holds the text to analyze,
# then preview the first three rows.
csv_path = 'resources/fedreg.csv'
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,document_number,abstract
0,testing12345,The quick brown fox jumps over the lazy dog.
1,2018-10583,We are superseding Airworthiness Directive (AD...
2,2018-10902,The Commodity Futures Trading Commission (Comm...


In [25]:
# Run every abstract through the spaCy pipeline and keep the result of each
# call in a new `tokens` column (one processed document per row).
df['tokens'] = df['abstract'].apply(nlp)
df.head(3)

Unnamed: 0,document_number,abstract,tokens
0,testing12345,The quick brown fox jumps over the lazy dog.,"(The, quick, brown, fox, jumps, over, the, laz..."
1,2018-10583,We are superseding Airworthiness Directive (AD...,"(We, are, superseding, Airworthiness, Directiv..."
2,2018-10902,The Commodity Futures Trading Commission (Comm...,"(The, Commodity, Futures, Trading, Commission,..."


In [27]:
# Helper used to build the per-row part-of-speech column.
def get_pos(doc):
    """Return the part-of-speech tag (`pos_`) of each token in `doc`, in order."""
    return [tok.pos_ for tok in doc]

# One list of part-of-speech tags per row, derived from the `tokens` column.
df['pos'] = df['tokens'].apply(get_pos)
df.head(3)

Unnamed: 0,document_number,abstract,tokens,pos
0,testing12345,The quick brown fox jumps over the lazy dog.,"(The, quick, brown, fox, jumps, over, the, laz...","[DET, ADJ, ADJ, NOUN, VERB, ADP, DET, ADJ, NOU..."
1,2018-10583,We are superseding Airworthiness Directive (AD...,"(We, are, superseding, Airworthiness, Directiv...","[PRON, VERB, VERB, PROPN, PROPN, PUNCT, NOUN, ..."
2,2018-10902,The Commodity Futures Trading Commission (Comm...,"(The, Commodity, Futures, Trading, Commission,...","[DET, PROPN, PROPN, PROPN, PROPN, PUNCT, PROPN..."


In [30]:
# Helper used to build the per-row syntactic dependency column.
def get_dep(doc):
    """Return the syntactic dependency label (`dep_`) of each token in `doc`, in order."""
    return [tok.dep_ for tok in doc]

# One list of dependency labels per row, derived from the `tokens` column.
df['dep'] = df['tokens'].apply(get_dep)
df.head(3)

Unnamed: 0,document_number,abstract,tokens,pos,dep
0,testing12345,The quick brown fox jumps over the lazy dog.,"(The, quick, brown, fox, jumps, over, the, laz...","[DET, ADJ, ADJ, NOUN, VERB, ADP, DET, ADJ, NOU...","[det, amod, amod, nsubj, ROOT, prep, det, amod..."
1,2018-10583,We are superseding Airworthiness Directive (AD...,"(We, are, superseding, Airworthiness, Directiv...","[PRON, VERB, VERB, PROPN, PROPN, PUNCT, NOUN, ...","[nsubj, aux, ROOT, compound, dobj, punct, appo..."
2,2018-10902,The Commodity Futures Trading Commission (Comm...,"(The, Commodity, Futures, Trading, Commission,...","[DET, PROPN, PROPN, PROPN, PROPN, PUNCT, PROPN...","[det, compound, compound, compound, nsubj, pun..."


In [31]:
# Helper used to build the per-row stop-word flag column.
def get_stopw(doc):
    """Return the stop-word flag (`is_stop`) of each token in `doc`, in order."""
    return [tok.is_stop for tok in doc]

# One list of stop-word booleans per row, derived from the `tokens` column.
df['stopwords'] = df['tokens'].apply(get_stopw)
df.head(3)

Unnamed: 0,document_number,abstract,tokens,pos,dep,stopwords
0,testing12345,The quick brown fox jumps over the lazy dog.,"(The, quick, brown, fox, jumps, over, the, laz...","[DET, ADJ, ADJ, NOUN, VERB, ADP, DET, ADJ, NOU...","[det, amod, amod, nsubj, ROOT, prep, det, amod...","[False, False, False, False, False, True, True..."
1,2018-10583,We are superseding Airworthiness Directive (AD...,"(We, are, superseding, Airworthiness, Directiv...","[PRON, VERB, VERB, PROPN, PROPN, PUNCT, NOUN, ...","[nsubj, aux, ROOT, compound, dobj, punct, appo...","[False, True, False, False, False, False, Fals..."
2,2018-10902,The Commodity Futures Trading Commission (Comm...,"(The, Commodity, Futures, Trading, Commission,...","[DET, PROPN, PROPN, PROPN, PROPN, PUNCT, PROPN...","[det, compound, compound, compound, nsubj, pun...","[False, False, False, False, False, False, Fal..."
