In [115]:
import pandas as pd
import json

In [116]:
# Load the first 10 reviews.
# Each line of the file is parsed as its own JSON document, so we read line by
# line instead of loading the whole file.
js = []
# `with` guarantees the file is closed even if a line fails to parse;
# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open('data/yelp/v6/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json',
          encoding='utf-8') as f:
    for i in range(10):
        line = f.readline()
        if not line:
            # readline() returns '' at end of file; stop instead of feeding
            # an empty string to json.loads (which would raise).
            break
        js.append(json.loads(line))
review_df = pd.DataFrame(js)
review_df.shape

(10, 8)

### Using spaCy: [Installation instructions for spaCy](https://spacy.io/docs/usage/)

In [117]:
import spacy

In [118]:
# Show the metadata of the 'en' model (name, version, source path, license, ...)
spacy.info('en')


    Info about model en

    url                https://explosion.ai
    source             /anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
    name               core_web_sm    
    version            1.2.0          
    description        Small English model for spaCy. Includes vocabulary, syntax, entities and word vectors (GloVe).
    lang               en             
    link               //anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/spacy/data/en
    email              contact@explosion.ai
    author             Explosion AI   
    spacy_version      >=1.7.0,<2.0.0 
    license            CC BY-SA 3.0   



In [119]:
# Preload the 'en' language model once; the resulting `nlp` object is
# reused by the cells below to parse the review text.
nlp = spacy.load('en')

In [120]:
# Run the spaCy pipeline over every review and keep the parsed results
# alongside the raw text, in a new DataFrame column.
review_texts = review_df['text']
review_df['text_pos'] = review_texts.apply(nlp)

type(review_df['text_pos'])

pandas.core.series.Series

In [121]:
# Inspect a single entry of the new column: the object returned by nlp() for the first review
type(review_df['text_pos'][0])

spacy.tokens.doc.Doc

In [122]:
# spaCy gives you both coarse-grained (.pos_) and fine-grained (.tag_) parts of speech.
# Iterating over a parsed review yields its individual tokens.
for token in review_df['text_pos'][0]:
    print(token.text, token.pos_, token.tag_)

dr INTJ UH
. PUNCT .
goldberg ADJ JJ
offers VERB VBZ
everything NOUN NN
i PRON PRP
look VERB VBP
for ADP IN
in ADP IN
a DET DT
general ADJ JJ
practitioner NOUN NN
. PUNCT .
  SPACE SP
he PRON PRP
's VERB VBZ
nice ADJ JJ
and CCONJ CC
easy ADJ JJ
to PART TO
talk VERB VB
to ADP IN
without ADP IN
being VERB VBG
patronizing ADJ JJ
; PUNCT :
he PRON PRP
's VERB VBZ
always ADV RB
on ADP IN
time NOUN NN
in ADP IN
seeing VERB VBG
his ADJ PRP$
patients NOUN NNS
; PUNCT :
he PRON PRP
's VERB VBZ
affiliated VERB VBN
with ADP IN
a DET DT
top ADJ JJ
- PUNCT HYPH
notch NOUN NN
hospital NOUN NN
( PUNCT -LRB-
nyu NOUN NN
) PUNCT -RRB-
which ADJ WDT
my ADJ PRP$
parents NOUN NNS
have VERB VBP
explained VERB VBN
to ADP IN
me PRON PRP
is VERB VBZ
very ADV RB
important ADJ JJ
in ADP IN
case NOUN NN
something NOUN NN
happens VERB VBZ
and CCONJ CC
you PRON PRP
need VERB VBP
surgery NOUN NN
; PUNCT :
and CCONJ CC
you PRON PRP
can VERB MD
get VERB VB
referrals NOUN NNS
to PART TO
see VERB VB
specialists NOUN NN

In [125]:
# Noun chunking comes with spaCy as well: list the chunks found in the first review
print(list(review_df['text_pos'][0].noun_chunks))

[everything, i, a general practitioner, he, he, time, his patients, he, a top-notch hospital, my parents, me, case, something, you, surgery, you, referrals, specialists, him, what, you, i, any complaints, i, him, i, a blank]


### Using [TextBlob](https://textblob.readthedocs.io/en/dev/)

In [1]:
from textblob import TextBlob

TextBlob's default tagger is the PatternTagger, the same tagger used by the [pattern](https://www.clips.uantwerpen.be/pattern) library, which is fine for our example. To use the NLTK tagger instead, pass it as the `pos_tagger` argument when constructing the TextBlob. More [here](http://textblob.readthedocs.io/en/dev/advanced_usage.html#advanced).

In [111]:
# Wrap each review's raw text in a TextBlob and keep the blobs in their own column
text_column = review_df['text']
review_df['blob'] = text_column.apply(TextBlob)

type(review_df['blob'])

pandas.core.series.Series

In [112]:
# Inspect a single entry of the 'blob' column
type(review_df['blob'][0])

textblob.blob.TextBlob

In [113]:
# Part-of-speech tags for the first review, as (word, tag) pairs
review_df['blob'][0].tags

[('dr.', 'NN'),
 ('goldberg', 'NN'),
 ('offers', 'VBZ'),
 ('everything', 'NN'),
 ('i', 'NN'),
 ('look', 'VBP'),
 ('for', 'IN'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('general', 'JJ'),
 ('practitioner', 'NN'),
 ('he', 'PRP'),
 ("'s", 'VBZ'),
 ('nice', 'JJ'),
 ('and', 'CC'),
 ('easy', 'JJ'),
 ('to', 'TO'),
 ('talk', 'VB'),
 ('to', 'TO'),
 ('without', 'IN'),
 ('being', 'VBG'),
 ('patronizing', 'NN'),
 ('he', 'PRP'),
 ("'s", 'VBZ'),
 ('always', 'RB'),
 ('on', 'IN'),
 ('time', 'NN'),
 ('in', 'IN'),
 ('seeing', 'VBG'),
 ('his', 'PRP$'),
 ('patients', 'NNS'),
 ('he', 'PRP'),
 ("'s", 'VBZ'),
 ('affiliated', 'JJ'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('top-notch', 'JJ'),
 ('hospital', 'NN'),
 ('nyu', 'NN'),
 ('which', 'WDT'),
 ('my', 'PRP$'),
 ('parents', 'NNS'),
 ('have', 'VBP'),
 ('explained', 'VBN'),
 ('to', 'TO'),
 ('me', 'PRP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('important', 'JJ'),
 ('in', 'IN'),
 ('case', 'NN'),
 ('something', 'NN'),
 ('happens', 'NNS'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('need', '

In [114]:
# TextBlob can give us the noun phrases 
def extract_np(x):
    """Return the ``noun_phrases`` attribute of ``x``.

    In this notebook ``x`` is a TextBlob taken from the 'blob' column.
    """
    phrases = x.noun_phrases
    return phrases

# Collect the noun phrases of every review, then show those of the first review
noun_phrase_col = review_df['blob'].apply(extract_np)
review_df['np'] = noun_phrase_col

print(review_df['np'][0])

['dr. goldberg', 'i look', 'general practitioner', 'top-notch hospital', "i 'm", 'complaints i', "i 'm"]
