# Imports

In [1]:
import pathlib

# Constants

In [2]:
# Project root: "~/work" with "~" expanded to the current user's home directory.
PROJECT_DIR = pathlib.Path('~/work').expanduser()
# Datasets live in the "data" directory directly under the project root.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [3]:
# https://rstudio-pubs-static.s3.amazonaws.com/120883_c8123ff272164b2a94be097a6237150b.html
# Directory under DATA_DIR that holds the Yelp dataset files.
YELP_DATA_DIR = DATA_DIR.joinpath('yelp', 'v6', 'yelp_dataset_challenge_academic_dataset')

# Stemming

In [4]:
import nltk

In [5]:
# One Porter stemmer instance, reused by every stemming example in this section.
stemmer = nltk.stem.porter.PorterStemmer()

In [6]:
# Regular plural: the trailing "s" is stripped.
stemmer.stem('flowers')

'flower'

In [7]:
# "-es" plural: reduced to the singular form.
stemmer.stem('zeroes')

'zero'

In [8]:
# This word comes back unchanged.
stemmer.stem('stemmer')

'stemmer'

In [9]:
# The stem ("sixti") is not itself a dictionary word.
stemmer.stem('sixties')

'sixti'

In [10]:
# Yields the same stem as 'sixties'.
stemmer.stem('sixty')

'sixti'

In [11]:
# Stemmed to "goe", not "go": the stemmer does not map this inflection back to its base verb.
stemmer.stem('goes')

'goe'

In [12]:
# The base form comes back unchanged, so it differs from the stem of 'goes'.
stemmer.stem('go')

'go'

# Example 3-2

PoS tagging and chunking

In [13]:
import itertools
import json

import pandas as pd

In [14]:
# Load the first 10 reviews (the file is read as one JSON object per line).
# - The encoding is pinned to UTF-8 so the text decodes the same way on every
#   platform instead of depending on the locale default.
# - islice stops cleanly if the file has fewer than 10 lines; calling
#   readline() past EOF would hand json.loads an empty string and raise.
with open(YELP_DATA_DIR / 'yelp_academic_dataset_review.json', encoding='utf-8') as f:
    review_df = pd.DataFrame([
        json.loads(line) for line in itertools.islice(f, 10)
    ])

In [15]:
# Sanity check: expect 10 rows, one per review line read from the file.
review_df.shape

(10, 8)

## Using spaCy

In [16]:
# First we'll walk through spaCy's functions
import spacy

In [17]:
# One-time setup (uncomment to run): download spaCy's small English model.
# !python -m spacy download en_core_web_sm

In [18]:
# Preload the English pipeline by its full package name. The bare 'en'
# shortcut relied on a shortcut link, which spaCy v3 removed, so
# spacy.load('en') fails on current releases.
nlp = spacy.load('en_core_web_sm')

In [19]:
# Run every review text through the spaCy pipeline; the result is a
# pandas Series holding one parsed document per review.
doc_df = review_df['text'].map(nlp)
type(doc_df)

pandas.core.series.Series

In [20]:
# Each element of the Series is a spaCy Doc.
type(doc_df[4])

spacy.tokens.doc.Doc

In [21]:
# Indexing into a Doc yields an individual Token.
type(doc_df[4][0])

spacy.tokens.token.Token

In [22]:
# spaCy gives us coarse-grained parts of speech using (.pos_)
# and fine-grained parts of speech using (.tag_),
# e.g. 'letter' -> pos_ NOUN, tag_ NN.
for token in doc_df[4]:
    print(token.text, token.pos_, token.tag_)

Got VERB VB
a DET DT
letter NOUN NN
in ADP IN
the DET DT
mail NOUN NN
last ADJ JJ
week NOUN NN
that DET WDT
said VERB VBD
Dr. PROPN NNP
Goldberg PROPN NNP
is AUX VBZ
moving VERB VBG
to ADP IN
Arizona PROPN NNP
to PART TO
take VERB VB
a DET DT
new ADJ JJ
position NOUN NN
there ADV RB
in ADP IN
June PROPN NNP
. PUNCT .
  SPACE _SP
He PRON PRP
will VERB MD
be AUX VB
missed VERB VBN
very ADV RB
much ADV RB
. PUNCT .
 

 SPACE _SP
I PRON PRP
think VERB VBP
finding VERB VBG
a DET DT
new ADJ JJ
doctor NOUN NN
in ADP IN
NYC PROPN NNP
that SCONJ IN
you PRON PRP
actually ADV RB
like VERB VBP
might VERB MD
almost ADV RB
be AUX VB
as ADV RB
awful ADJ JJ
as SCONJ IN
trying VERB VBG
to PART TO
find VERB VB
a DET DT
date NOUN NN
! PUNCT .


In [23]:
# spaCy exposes basic noun chunks on each parsed document; materialise
# them into a list so they print together.
print(list(doc_df[4].noun_chunks))

[a letter, the mail, Dr. Goldberg, Arizona, a new position, June, He, I, a new doctor, NYC, you, a date]


## Using [TextBlob](https://textblob.readthedocs.io/en/dev/)

In [24]:
from textblob import TextBlob

In [25]:
# One-time setup (uncomment to run): download NLTK resources.
# NOTE(review): presumably these are the corpora TextBlob relies on -- confirm.
# import nltk
# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

TextBlob's default tagger is the PatternTagger, the same tagger used by [pattern](https://www.clips.uantwerpen.be/pattern), which is fine for our example. To use the NLTK tagger instead, pass it as the `pos_tagger` argument when constructing the TextBlob. More details [here](http://textblob.readthedocs.io/en/dev/advanced_usage.html#advanced).

In [26]:
# Wrap every review text in a TextBlob, giving a pandas Series of blobs.
# TextBlob's default PatternTagger is OK for our example; the NLTK tagger
# can be specified instead and works better for incomplete sentences.
blob_df = review_df['text'].map(TextBlob)
type(blob_df)

pandas.core.series.Series

In [27]:
# Each element of the Series is a TextBlob.
type(blob_df[4])

textblob.blob.TextBlob

In [28]:
# (word, tag) pairs for the review at index 4; note that punctuation
# tokens do not appear in the result.
blob_df[4].tags

[('Got', 'NNP'),
 ('a', 'DT'),
 ('letter', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mail', 'NN'),
 ('last', 'JJ'),
 ('week', 'NN'),
 ('that', 'WDT'),
 ('said', 'VBD'),
 ('Dr.', 'NNP'),
 ('Goldberg', 'NNP'),
 ('is', 'VBZ'),
 ('moving', 'VBG'),
 ('to', 'TO'),
 ('Arizona', 'NNP'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('position', 'NN'),
 ('there', 'RB'),
 ('in', 'IN'),
 ('June', 'NNP'),
 ('He', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('missed', 'VBN'),
 ('very', 'RB'),
 ('much', 'RB'),
 ('I', 'PRP'),
 ('think', 'VBP'),
 ('finding', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('doctor', 'NN'),
 ('in', 'IN'),
 ('NYC', 'NNP'),
 ('that', 'IN'),
 ('you', 'PRP'),
 ('actually', 'RB'),
 ('like', 'IN'),
 ('might', 'MD'),
 ('almost', 'RB'),
 ('be', 'VB'),
 ('as', 'RB'),
 ('awful', 'JJ'),
 ('as', 'IN'),
 ('trying', 'VBG'),
 ('to', 'TO'),
 ('find', 'VB'),
 ('a', 'DT'),
 ('date', 'NN')]

In [29]:
# TextBlob blobs offer noun phrase extraction as well; list them for the
# same review.
print(list(blob_df[4].noun_phrases))

['got', 'goldberg', 'arizona', 'new position', 'june', 'new doctor', 'nyc']
