In [1]:
import json
import random
import re

import pandas as pd

In [2]:
input_json = "df_by_usr.json"
# The input is parsed as one JSON document per line. Stream the file instead
# of materialising every raw line first, and skip blank lines (for example a
# trailing empty line), which json.loads would otherwise reject.
with open(input_json, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [3]:
# Build a DataFrame from the parsed JSON records and preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,avg_star_delta,reviewer_label,text_agg,user_id
0,-0.318182,med,ummm star rating yelp point come sandwich plac...,--3WaS23LcIXtxyFULJHTA
1,-0.947368,hard,opinion restaurant best food phoenix taste gre...,--4rAAfZnEIAKJE80aIiYg
2,0.333333,med,right world going gawd awful hooter vega getti...,--CIuK7sUpaNzalLAlHJKA
3,0.9,easy,best buffet price quality went saturday night ...,--HCoE1ghaAlcaAfshICgw
4,-0.166667,med,great pizza pasta just quality pizza definitel...,--NIc98RMssgy0mSZL3vpA


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(35051, 4)

In [5]:
# Aggregated review text for every row labelled 'easy', as a plain Python list
# (despite the df_ prefix this is a list, not a DataFrame). Show the first one.
easy_mask = df['reviewer_label'] == 'easy'
df_easy = list(df.loc[easy_mask, "text_agg"])
df_easy[0]

'best buffet price quality went saturday night buffet plus prime rib sushi fresh pasta excellent dessert bar amazing dozen pie lot shooter sorbet cake staff friendly attentive saturday night seated minute know restaurant star rating settebello star went lunch overly crowded appetizer delicious pizza came minute ordering pizza make drool just thinking fresh mozzarella sauce crushed tomato seasoning divine learned authentic pizza aren cut cut anyways pizza big split appetizer going sure bbq heaven food authentic savory want barbecue place got small appetizer sampler party tried tri tip sandwich delicious meat strawberry lemonade refreshing huge glass didn need refill came dessert highly recommend snicker ice cream cake smart share enormous service little slow didn initially ask wanted dessert end food won star review went lunch moderately busy menu isn deep option available sounded delicious got grilled cheese excellent reason restaurant star waiting time lunch plan hour proactive asking

In [6]:
# Distinct reviewer labels present in the data.
# NOTE: despite the "_list" suffix this is a set, so iteration order is not
# guaranteed to be the same from one kernel session to the next.
reviewer_label_list = set(df['reviewer_label'])
reviewer_label_list

{'easy', 'hard', 'med'}

In [7]:
# Map each reviewer label to the list of its aggregated review texts, with
# newline and carriage-return characters replaced by single spaces.
dict_text_agg = {}
for label in reviewer_label_list:
    label_mask = df['reviewer_label'] == label
    dict_text_agg[label] = [
        doc.replace("\n", " ").replace("\r", " ")
        for doc in df.loc[label_mask, "text_agg"]
    ]

In [8]:
# Spot-check the first cleaned text stored under the 'easy' label.
dict_text_agg['easy'][0]

'best buffet price quality went saturday night buffet plus prime rib sushi fresh pasta excellent dessert bar amazing dozen pie lot shooter sorbet cake staff friendly attentive saturday night seated minute know restaurant star rating settebello star went lunch overly crowded appetizer delicious pizza came minute ordering pizza make drool just thinking fresh mozzarella sauce crushed tomato seasoning divine learned authentic pizza aren cut cut anyways pizza big split appetizer going sure bbq heaven food authentic savory want barbecue place got small appetizer sampler party tried tri tip sandwich delicious meat strawberry lemonade refreshing huge glass didn need refill came dessert highly recommend snicker ice cream cake smart share enormous service little slow didn initially ask wanted dessert end food won star review went lunch moderately busy menu isn deep option available sounded delicious got grilled cheese excellent reason restaurant star waiting time lunch plan hour proactive asking

In [9]:
# Confirm that one entry was created per reviewer label.
dict_text_agg.keys()

dict_keys(['med', 'hard', 'easy'])

In [10]:
from pyspark import SparkContext, SparkConf
import nltk
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
import string
from collections import Counter

# Spark Setup
app_name = "most_common_word_review"
conf = SparkConf().setAppName(app_name)
# getOrCreate() returns the already-running context when this cell is
# re-executed; calling SparkContext(conf=conf) a second time in the same
# kernel raises because only one active SparkContext is allowed.
sc = SparkContext.getOrCreate(conf=conf)

In [11]:
# Compiled once when the cell runs instead of being rebuilt from
# string.punctuation on every tokenize() call.
_PUNCT_DIGIT_CTRL_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def tokenize(text):
    """
    Tokenize text and return a non-unique list of stemmed tokens.

    Steps, in order: lowercase the text; replace punctuation, digits and
    \\r/\\t/\\n with spaces; collapse whitespace runs; split into words with
    nltk.word_tokenize; drop words shorter than 3 characters and English
    stop words; stem the remaining words with stemwords().

    :param text: the document to tokenize, as a string.
    :return: list of stemmed tokens, duplicates preserved, in document order.
    """
    text = text.lower()
    cleaned = _PUNCT_DIGIT_CTRL_RE.sub(" ", text)  # strip punctuation + digits
    cleaned = _WHITESPACE_RUN_RE.sub(" ", cleaned)  # collapse repeated whitespace
    words = nltk.word_tokenize(cleaned)
    # Length and stop-word filters are applied before stemming, so the
    # stop-word check sees the unstemmed word.
    stop = stop_words.ENGLISH_STOP_WORDS
    words = [w for w in words if len(w) > 2 and w not in stop]
    return stemwords(words)

def stemwords(words):
    """
    Stem every token in `words` with a PorterStemmer.

    :param words: list of tokens/words.
    :return: a new list holding the stem of each input word, in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, words))

def most_common_n(filename, n):
    """
    Return the n most frequent stemmed tokens across a collection of documents.

    :param filename: despite the name, this is the collection of document
        strings to process (it is handed to sc.parallelize), not a path.
    :param n: how many (token, count) pairs to return.
    :return: list of (token, count) tuples, highest count first.
    """
    rdd = sc.parallelize(filename, 8)
    # countByValue() tallies tokens within each partition and merges those
    # tallies, so the driver receives per-token counts rather than a list of
    # every single token occurrence (which collect() would ship back).
    token_counts = rdd.flatMap(tokenize).countByValue()
    return Counter(token_counts).most_common(n)

# def word_freq(filename):
#     rdd = sc.parallelize(filename, 8)
#     rdd_listed = rdd.flatMap(lambda x: tokenize(x))
#     ctr = Counter(rdd_listed.collect())
#     return ctr

In [12]:
# For every reviewer label, compute the n most frequent tokens in its texts.
n = 50
most_common_n_dict = {
    label: most_common_n(dict_text_agg[label], n)
    for label in reviewer_label_list
}

In [13]:
# Inspect the (token, count) pairs computed for the 'easy' label.
most_common_n_dict['easy']

[('food', 52242),
 ('place', 48173),
 ('great', 44277),
 ('good', 43776),
 ('order', 28145),
 ('time', 28097),
 ('servic', 27680),
 ('love', 24822),
 ('like', 24672),
 ('tri', 21604),
 ('just', 20777),
 ('realli', 17906),
 ('delici', 17290),
 ('come', 16747),
 ('restaur', 16200),
 ('best', 15722),
 ('nice', 14708),
 ('amaz', 14037),
 ('friendli', 14027),
 ('chicken', 14027),
 ('staff', 12549),
 ('price', 12543),
 ('got', 12429),
 ('menu', 12377),
 ('make', 12168),
 ('eat', 12159),
 ('definit', 11846),
 ('wait', 11794),
 ('drink', 11500),
 ('fri', 11385),
 ('pizza', 11184),
 ('fresh', 10813),
 ('littl', 10666),
 ('recommend', 10210),
 ('don', 10138),
 ('look', 10059),
 ('want', 9997),
 ('tast', 9865),
 ('came', 9384),
 ('bar', 8972),
 ('favorit', 8963),
 ('locat', 8945),
 ('burger', 8887),
 ('salad', 8864),
 ('enjoy', 8505),
 ('sauc', 8498),
 ('lunch', 8482),
 ('tabl', 8379),
 ('meal', 8279),
 ('chees', 8277)]

In [16]:
# Stop the SparkContext; the remaining cells only use plain Python objects.
sc.stop()

In [17]:
import pickle

# Persist the per-label top-n token counts to disk. The context manager
# closes the file even if pickle.dump raises part-way through.
filename = 'most_common_n_dict'
with open(filename, 'wb') as outfile:
    pickle.dump(most_common_n_dict, outfile)