In [1]:
import re
import json
import spacy
import pandas as pd
import cufflinks as cf
from nltk import ngrams
from itertools import chain
from random import shuffle
from string import punctuation
from collections import Counter
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

In [2]:
# Notebook-wide plotly / cufflinks setup.
init_notebook_mode(connected=True)
cf.set_config_file(theme='white')
cf.go_offline()
# Materialise the 'accent' scale as a list: the previous identity `map(...)` produced a
# one-shot iterator that would be empty after its first traversal.
colorscale = list(cf.colors.get_scales('accent'))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [3]:
def extract_data(f):
    """Lazily yield one parsed JSON value per line of a JSON-lines file.

    Parameters
    ----------
    f : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The value decoded by ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
    with open(f, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            # Blank lines (e.g. a stray newline at EOF) carry no record; skip them
            # rather than letting json.loads fail on an empty string.
            if line.strip():
                yield json.loads(line)

In [4]:
# Both names are lazy generators: nothing is read from disk until they are iterated.
businesses = extract_data('business.json')
reviews = extract_data('review.json')

In [5]:
# Drain each generator into memory and build one DataFrame per input file.
business_df = pd.DataFrame(data=list(businesses))
review_df = pd.DataFrame(data=list(reviews))

In [6]:
# Load the state lookup inside a context manager so the file handle is closed promptly
# (the previous bare open(...).read() never closed it explicitly).
with open('usStates.json', encoding='utf-8') as states_file:
    us_states = json.load(states_file)
# Keep only businesses whose `state` value is one of the lookup's keys.
only_us_businesses = business_df[business_df['state'].isin(us_states.keys())]

In [7]:
# Attach review text to business ids (pandas' default inner join on `business_id`).
# NOTE(review): this joins against `business_df`, not `only_us_businesses` — confirm that
# including non-US businesses is intended.
merged_reviews = pd.merge(
    business_df[['business_id']],
    review_df[['business_id', 'text']],
    on='business_id',
)

In [8]:
# Row count of the merged frame, i.e. the number of reviews carried forward.
print('Total number of Yelp reviews:', merged_reviews.shape[0])

Total number of Yelp reviews: 4736897


In [9]:
# Compiled once at definition time instead of on every call.
# Raw strings keep `\s` / `\d` as regex escapes rather than invalid Python string escapes.
_WHITESPACE_RE = re.compile(r'[\n\t\s]+?')
_TRAILING_DIGITS_PUNCT_RE = re.compile(r"[\d{}]+$".format(re.escape(punctuation)))


def filter_token(token):
    """Return `token` if it should be kept for counting, otherwise ``None``.

    A token is rejected when any of the following holds:

    * ``token.is_punct`` or ``token.is_stop`` is truthy,
    * its text contains a whitespace character,
    * its text ends in a run of digits and/or ``string.punctuation`` characters,
    * its text contains an apostrophe.

    Parameters
    ----------
    token : object
        Any object exposing ``is_punct``, ``is_stop`` and ``text`` attributes
        (the notebook passes tokens obtained by iterating the parsed docs).

    Returns
    -------
    object or None
        The very same `token` object when it passes every check, else ``None``.
    """
    rejected = (
        token.is_punct
        or token.is_stop
        or _WHITESPACE_RE.search(token.text)
        or _TRAILING_DIGITS_PUNCT_RE.search(token.text)
        or "'" in token.text
    )
    return None if rejected else token

In [89]:
def gather_doc_metrics(docs):
    """Build one summary dict per document in `docs`.

    Parameters
    ----------
    docs : iterable
        Documents that can be iterated to obtain tokens exposing ``text`` and
        ``lemma_`` and accepted by ``filter_token``.

    Returns
    -------
    list of dict
        One dict per document, in input order, with the keys:

        * ``'id'``       - position of the document in `docs`
        * ``'text'``     - texts of the tokens kept by ``filter_token``
        * ``'lemma'``    - ``lemma_`` of the same kept tokens
        * ``'doc'``      - the original document object
        * ``'bigrams'``  - lower-cased, space-joined 2-grams over ``'text'``
        * ``'trigrams'`` - lower-cased, space-joined 3-grams over ``'text'``
    """
    all_doc_metrics = []
    for i, doc in enumerate(docs):
        # Filter once per document; the same kept tokens feed both 'text' and 'lemma'
        # (filter_token returns the token itself when it is kept).
        kept_tokens = [token for token in doc if filter_token(token)]
        token_texts = [token.text for token in kept_tokens]
        doc_metrics = {
            'id': i,
            'text': token_texts,
            'lemma': [token.lemma_ for token in kept_tokens],
            'doc': doc,
        }
        # Store real lists, not lazy `map` objects: a map iterator is exhausted after one
        # pass, so reading 'bigrams'/'trigrams' a second time used to yield nothing.
        for metric_key, size in (('bigrams', 2), ('trigrams', 3)):
            doc_metrics[metric_key] = [
                ' '.join(ngram).lower().strip() for ngram in ngrams(token_texts, size)
            ]
        all_doc_metrics.append(doc_metrics)
    return all_doc_metrics

In [82]:
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy releases — confirm
# against the installed version.
nlp = spacy.load('en')
# Fixed seed so the 1000-review sample (and every count/plot derived from it) is
# reproducible across kernel restarts.
review_sample = merged_reviews.sample(n=1000, random_state=42).text.tolist()
# nlp.pipe streams the texts through the pipeline in batches, yielding one Doc per text
# in input order.
docs = list(nlp.pipe(review_sample))

In [133]:
# Per-review token, lemma and n-gram summaries for the sampled docs.
doc_metrics = gather_doc_metrics(docs=docs)

### Most Frequent Words

In [91]:
# Flatten every document's kept token texts into one lower-cased, stripped list.
all_text = [
    token.lower().strip()
    for metrics in doc_metrics
    for token in metrics.get('text')
]

In [92]:
# Number of distinct normalised tokens in the sample.
len(set(all_text))

8773

In [123]:
# Top 25 words by count, indexed by word so the bar chart uses words on the x axis.
word_count_df = (
    pd.DataFrame(Counter(all_text).most_common(25), columns=['Word', 'Count'])
    .set_index('Word')
)
word_count_plot = word_count_df.iplot(
    kind='bar',
    title='Top 25 Words by Count',
    xTitle='word',
    yTitle='count',
    color='blue',
    asFigure=True,
)

cf.iplot(word_count_plot, legend=False)

In [134]:
def make_ngram_list(doc_metrics, key='bigrams'):
    """Concatenate the `key` entry of every dict in `doc_metrics` into one flat list.

    Parameters
    ----------
    doc_metrics : iterable of dict
        Each dict must hold an iterable under `key`.
    key : str, default 'bigrams'
        Which per-document entry to flatten.

    Returns
    -------
    list
        All items from every document's `key` entry, in document order. Entries that
        are one-shot iterators are consumed by this call.
    """
    flattened = []
    for metrics in doc_metrics:
        flattened.extend(metrics.get(key))
    return flattened

In [135]:
# Flat lists of every bigram / trigram across the sampled reviews.
all_trigrams = make_ngram_list(doc_metrics, key='trigrams')
all_bigrams = make_ngram_list(doc_metrics, key='bigrams')

In [138]:
def make_count_df(_ngrams, top_n=25):
    """Count items and return the most frequent ones as a DataFrame.

    Parameters
    ----------
    _ngrams : iterable of hashable
        Items to count (the notebook passes lists of n-gram strings).
    top_n : int or None, default 25
        How many of the most frequent items to keep; ``None`` keeps all of them.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Word'`` with a single ``'Count'`` column, ordered from most to
        least frequent. Empty when `_ngrams` is empty.
    """
    most_frequent = Counter(_ngrams).most_common(top_n)
    ngram_df = pd.DataFrame(most_frequent, columns=['Word', 'Count']).set_index('Word')
    return ngram_df

In [139]:
# Bar chart of the 25 most frequent bigrams.
bigram_df = make_count_df(all_bigrams)
bigram_plot = bigram_df.iplot(
    kind='bar',
    title='Top 25 Bigrams by Count',
    xTitle='bigram',
    yTitle='count',
    color='blue',
    asFigure=True,
)
cf.iplot(bigram_plot, legend=False)

In [146]:
# Bar chart of the 25 most frequent trigrams.
trigram_df = make_count_df(all_trigrams)
trigram_plot = trigram_df.iplot(
    kind='bar',
    title='Top 25 Trigrams by Count',
    xTitle='trigram',
    yTitle='count',
    color='blue',
    asFigure=True,
)
cf.iplot(trigram_plot, legend=False)