In [229]:
import re
import json
import spacy
import pandas as pd
import cufflinks as cf
from nltk import ngrams
from itertools import chain
from random import shuffle
from string import punctuation
from collections import Counter
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

In [219]:
init_notebook_mode(connected=True)
cf.set_config_file(theme='white')
cf.go_offline()
colorscale = map(lambda x: x, cf.colors.get_scales('accent'))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [4]:
def extract_data(f):
    with open(f, 'r') as read_file:
        for _ in read_file:
            yield json.loads(_)

In [5]:
reviews = extract_data('review.json')
businesses = extract_data('business.json')

In [6]:
review_df = pd.DataFrame(list(reviews))
business_df = pd.DataFrame(list(businesses))

In [7]:
us_states = json.loads(open('usStates.json').read())
only_us_businesses = business_df[business_df.state.isin(us_states.keys())]

In [8]:
merged_reviews = business_df[['business_id']].merge(review_df[['business_id', 'text']], on='business_id')

In [53]:
print('Total number of Yelp reviews:', len(merged_reviews))

Total number of Yelp reviews: 4736897


In [209]:
def filter_token(token):
    special_characters = re.compile('[\n\t\s]+?')
    punctuation_digits_regex = re.compile("[\d{}]+$".format(re.escape(punctuation)))
    token_filter = not (token.is_punct 
                        or token.is_stop 
                        or special_characters.search(token.text) 
                        or punctuation_digits_regex.search(token.text)
                        or "'" in token.text)
    if token_filter:
        return token

In [179]:
def gather_doc_metrics(docs):
    return [
        {
            'id': i, 
            'text': [filter_token(token).text for token in doc if filter_token(token)],
            'lemma': [filter_token(token).lemma_ for token in doc if filter_token(token)], 
            'doc': doc
        }
        for i,doc in enumerate(docs)
    ]

In [180]:
nlp = spacy.load('en')
review_sample = merged_reviews.sample(n=10000).text.tolist()
docs = list(map(nlp, review_sample))

In [210]:
doc_metrics = gather_doc_metrics(docs)

###Most Frequent Words

In [211]:
all_text = chain.from_iterable([d.get('text') for d in doc_metrics])
all_text = [token.lower().strip() for token in all_text]

In [212]:
len(Counter(all_text))

29984

In [228]:
#top 25 words by count
word_count_df = pd.DataFrame(Counter(all_text).most_common(25), columns=['Word', 'Count']).set_index('Word')
word_count_plot = word_count_df.iplot(kind='bar', 
                                      asFigure=True, 
                                      xTitle='word', 
                                      yTitle='count', 
                                      title='Top 25 Words by Count',
                                      color='blue')

iplot(cf.iplot(word_count_plot, legend=False))

PlotlyError: The `figure_or_data` positional argument must be either `dict`-like or `list`-like.