In [2]:
import pandas as pd
import json

In [3]:
# Load the first 10 reviews.
# The file is read line by line, with each line parsed as one JSON document,
# so only the lines we need are read.
js = []
with open('data/yelp/v6/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json') as f:
    # range() comes first in zip() so iteration stops after 10 lines without
    # consuming an 11th; if the file is shorter, iteration simply ends early
    # instead of passing an empty string to json.loads.
    for _, line in zip(range(10), f):
        js.append(json.loads(line))
# The `with` block closes the file even if a line fails to parse.
review_df = pd.DataFrame(js)
review_df.shape

(10, 8)

### Using spaCy: [Installation instructions for spaCy](https://spacy.io/docs/usage/)

In [4]:
import spacy

In [5]:
# Print the metadata (name, version, source, license, ...) of the installed
# 'en' model
spacy.info('en')


    [93mInfo about model en[0m

    link               //anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/spacy/data/en
    lang               en             
    name               core_web_sm    
    author             Explosion AI   
    description        Small English model for spaCy. Includes vocabulary, syntax, entities and word vectors (GloVe).
    source             /anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
    email              contact@explosion.ai
    license            CC BY-SA 3.0   
    spacy_version      >=1.7.0,<2.0.0 
    version            1.2.0          
    url                https://explosion.ai



In [6]:
# Preload the 'en' language model once; the resulting `nlp` object is then
# applied to every review below.
nlp = spacy.load('en')

In [7]:
# Apply the spaCy model to each review's text. Despite the `_df` suffix, the
# result is a pandas Series (not a DataFrame) with one parsed document per review.
doc_df = review_df['text'].apply(nlp)

type(doc_df)

pandas.core.series.Series

In [8]:
# Each element of the Series is a spaCy Doc object
type(doc_df[0])

spacy.tokens.doc.Doc

In [17]:
# Display the review at index 4; it is used as the running example below
doc_df[4]

Got a letter in the mail last week that said Dr. Goldberg is moving to Arizona to take a new position there in June.  He will be missed very much.  

I think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!

In [148]:
# spaCy gives you both coarse-grained (.pos_) and fine-grained (.tag_) parts of speech.
# Iterating over a Doc yields its individual tokens, one per printed line.
for token in doc_df[4]:
    print(token.text, token.pos_, token.tag_)

Got VERB VBP
a DET DT
letter NOUN NN
in ADP IN
the DET DT
mail NOUN NN
last ADJ JJ
week NOUN NN
that ADJ WDT
said VERB VBD
Dr. PROPN NNP
Goldberg PROPN NNP
is VERB VBZ
moving VERB VBG
to ADP IN
Arizona PROPN NNP
to PART TO
take VERB VB
a DET DT
new ADJ JJ
position NOUN NN
there ADV RB
in ADP IN
June PROPN NNP
. PUNCT .
  SPACE SP
He PRON PRP
will VERB MD
be VERB VB
missed VERB VBN
very ADV RB
much ADV RB
. PUNCT .
 

 SPACE SP
I PRON PRP
think VERB VBP
finding VERB VBG
a DET DT
new ADJ JJ
doctor NOUN NN
in ADP IN
NYC PROPN NNP
that ADP IN
you PRON PRP
actually ADV RB
like INTJ UH
might VERB MD
almost ADV RB
be VERB VB
as ADV RB
awful ADJ JJ
as ADP IN
trying VERB VBG
to PART TO
find VERB VB
a DET DT
date NOUN NN
! PUNCT .


In [149]:
# spaCy also extracts noun chunks; materialise them into a list so they all
# print on a single line.

print(list(doc_df[4].noun_chunks))

[a letter, the mail, Dr. Goldberg, Arizona, a new position, June, He, I, a new doctor, NYC, you, a date]


### Using [TextBlob](https://textblob.readthedocs.io/en/dev/)

In [1]:
from textblob import TextBlob

By default, TextBlob tags parts of speech with the PatternTagger, the same tagger used by the [pattern](https://www.clips.uantwerpen.be/pattern) library, which is fine for our example. To use the NLTK tagger instead, we can specify the `pos_tagger` argument when we call `TextBlob`. More [here](http://textblob.readthedocs.io/en/dev/advanced_usage.html#advanced).

In [135]:
# Wrap each review's text in a TextBlob. Despite the `_df` suffix, the result
# is a pandas Series (not a DataFrame) with one TextBlob per review.
blob_df = review_df['text'].apply(TextBlob)

type(blob_df)

pandas.core.series.Series

In [150]:
# Each element of the Series is a TextBlob object
type(blob_df[4])

textblob.blob.TextBlob

In [151]:
# (word, part-of-speech tag) pairs for the same review shown earlier; note
# that punctuation tokens do not appear in the output.
blob_df[4].tags

[('Got', 'NNP'),
 ('a', 'DT'),
 ('letter', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mail', 'NN'),
 ('last', 'JJ'),
 ('week', 'NN'),
 ('that', 'WDT'),
 ('said', 'VBD'),
 ('Dr.', 'NNP'),
 ('Goldberg', 'NNP'),
 ('is', 'VBZ'),
 ('moving', 'VBG'),
 ('to', 'TO'),
 ('Arizona', 'NNP'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('position', 'NN'),
 ('there', 'RB'),
 ('in', 'IN'),
 ('June', 'NNP'),
 ('He', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('missed', 'VBN'),
 ('very', 'RB'),
 ('much', 'JJ'),
 ('I', 'PRP'),
 ('think', 'VBP'),
 ('finding', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('doctor', 'NN'),
 ('in', 'IN'),
 ('NYC', 'NNP'),
 ('that', 'IN'),
 ('you', 'PRP'),
 ('actually', 'RB'),
 ('like', 'IN'),
 ('might', 'MD'),
 ('almost', 'RB'),
 ('be', 'VB'),
 ('as', 'RB'),
 ('awful', 'JJ'),
 ('as', 'IN'),
 ('trying', 'VBG'),
 ('to', 'TO'),
 ('find', 'VB'),
 ('a', 'DT'),
 ('date', 'NN')]

In [152]:
# TextBlob objects expose noun phrase extraction as well; copy the phrases
# into a plain list for printing.

print(list(blob_df[4].noun_phrases))

['got', 'goldberg', 'arizona', 'new position', 'june', 'new doctor', 'nyc']
