In [18]:
import json
import random
import re

import pandas as pd
from textblob import TextBlob as tb

from cust_stop_words import *

In [2]:
# The file is parsed one line at a time, each line being its own JSON document.
input_json = "../df_by_usr.json"
with open(input_json, 'r', encoding='utf-8') as f:
    # Stream the file instead of materialising every raw line first, and skip
    # blank lines (e.g. a trailing empty line) that json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

In [3]:
# Build a DataFrame from the parsed JSON records, then preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,avg_star_delta,reviewer_label,text_agg,user_id
0,-0.318182,med,ummm star ratings yelp point come sandwich pla...,--3WaS23LcIXtxyFULJHTA
1,-0.947368,hard,opinion restaurant best food phoenix tastes gr...,--4rAAfZnEIAKJE80aIiYg
2,0.333333,med,right world going gawd awful hooters vegas get...,--CIuK7sUpaNzalLAlHJKA
3,0.9,easy,best buffets price quality went saturday night...,--HCoE1ghaAlcaAfshICgw
4,-0.166667,med,great pizza pasta just quality pizza definitel...,--NIc98RMssgy0mSZL3vpA


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(35051, 4)

In [5]:
# Aggregated review text of every reviewer labelled 'easy' (a plain list,
# despite the df_ prefix), followed by how many such reviewers there are.
easy_mask = df['reviewer_label'] == 'easy'
df_easy = list(df.loc[easy_mask, "text_agg"])
len(df_easy)

5910

In [6]:
# Distinct reviewer labels present in the data (a set, despite the name).
reviewer_label_list = {label for label in df['reviewer_label']}
reviewer_label_list

{'easy', 'hard', 'med'}

In [7]:
# Map each reviewer label to the list of its users' aggregated review texts,
# with newlines and carriage returns flattened to spaces.
dict_text_agg = {}
for label in reviewer_label_list:
    label_rows = df['reviewer_label'] == label
    dict_text_agg[label] = [
        doc.replace("\n", " ").replace("\r", " ")
        for doc in df.loc[label_rows, "text_agg"]
    ]

In [8]:
from pyspark import SparkContext, SparkConf
import nltk
from nltk.stem.porter import *
# Imported under its own name so it no longer rebinds `stop_words`: the
# tokenizers in this notebook test `w not in stop_words`, which needs a
# collection, whereas `from sklearn.feature_extraction import stop_words`
# bound that name to a module (and that submodule is gone from current
# scikit-learn releases).
# NOTE(review): `stop_words` is presumably supplied by `cust_stop_words` - confirm.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from collections import Counter

# Spark Setup
app_name = "most_common_word_review"
conf = SparkConf().setAppName(app_name)
# getOrCreate() reuses a live context, so re-running this cell does not fail
# because a SparkContext already exists in the kernel.
sc = SparkContext.getOrCreate(conf=conf)

In [19]:
def tokenize(text):
    """
    Split `text` into a list of normalized word tokens (duplicates kept).

    Steps: lowercase the text; blank out ASCII punctuation, digits,
    carriage returns, tabs and newlines; collapse runs of whitespace;
    tokenize with nltk; keep only tokens longer than two characters that
    are not in `stop_words`.
    """
    strip_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    cleaned = re.sub(strip_pattern, " ", text.lower())
    cleaned = re.sub(r"\s+", ' ', cleaned)  # squeeze whitespace runs
    return [
        token
        for token in nltk.word_tokenize(cleaned)
        if len(token) > 2 and token not in stop_words
    ]

# Customized version of tokenize with spell check option.
# Both patterns are compiled once here rather than on every call, and are
# written as raw strings so no invalid string escape (such as "\(") is needed.
# The character class covers ASCII punctuation except the apostrophe and the
# backtick, plus digits, carriage return, tab and newline.
_NORM_PUNCT_RE = re.compile(r'[!"#$%&\\()*+,\-./:;<=>?@\[\]^_{|}~0-9\r\t\n]')
_NORM_NOT_RE = re.compile(r"n't")


def norm_correct(text, spell):
    """
    Normalize `text` into a list of word tokens, optionally spell-corrected.

    The text is lowercased, every "n't" is rewritten to " not", the
    characters matched by `_NORM_PUNCT_RE` are replaced with spaces, and the
    result is split on whitespace. Words of length <= 2 and words found in
    `stop_words` are dropped.

    text: the string to normalize.
    spell: when truthy, each surviving word is passed through TextBlob's
        `correct()`; corrected words are not filtered again.

    Returns the list of remaining words in their original order.
    """
    expanded = _NORM_NOT_RE.sub(' not', text.lower())
    words = _NORM_PUNCT_RE.sub(" ", expanded).split()
    kept = [w for w in words if len(w) > 2 and w not in stop_words]

    if spell:
        return [str(tb(w).correct()) for w in kept]
    return kept

def most_common_n(filename, n, spell):
    """
    Return the `n` most frequent tokens across a collection of documents.

    filename: despite the name, an in-memory collection of document strings;
        it is handed straight to `sc.parallelize`, not opened as a path.
    n: number of (token, count) pairs to return.
    spell: when truthy, documents are tokenized with `norm_correct(doc, spell)`
        so the spell-check option is actually honoured; when falsy,
        `tokenize` is used, as it always was.

    Returns a list of (token, count) tuples, most frequent first. The relative
    order of tokens with equal counts is not guaranteed.
    """
    rdd = sc.parallelize(filename, 8)  # spread the documents over 8 partitions
    if spell:
        rdd_listed = rdd.flatMap(lambda doc: norm_correct(doc, spell))
    else:
        rdd_listed = rdd.flatMap(tokenize)

    # countByValue() tallies tokens per partition and merges the tallies, so
    # only per-token totals reach the driver instead of every single token.
    ctr = Counter(rdd_listed.countByValue())
    return ctr.most_common(n)

In [20]:
# Top-n tokens per reviewer label, computed without spell correction.
n = 50
most_common_n_dict = {
    label: most_common_n(dict_text_agg[label], n, False)
    for label in reviewer_label_list
}

In [21]:
# Show the (token, count) pairs computed for the 'easy' label.
most_common_n_dict['easy']

[('food', 51258),
 ('great', 44206),
 ('place', 43453),
 ('good', 42891),
 ('service', 27476),
 ('like', 21951),
 ('time', 21330),
 ('just', 20769),
 ('love', 18303),
 ('really', 17906),
 ('delicious', 16948),
 ('best', 15713),
 ('nice', 14078),
 ('chicken', 13986),
 ('friendly', 13954),
 ('amazing', 13815),
 ('restaurant', 13343),
 ('try', 13082),
 ('did', 12890),
 ('ordered', 12650),
 ('staff', 12491),
 ('got', 12428),
 ('order', 12282),
 ('menu', 11935),
 ('definitely', 11607),
 ('little', 10660),
 ('fresh', 10599),
 ('pizza', 10279),
 ('come', 9984),
 ('came', 9384),
 ('eat', 8635),
 ('bar', 8483),
 ('wait', 8411),
 ('lunch', 8361),
 ('went', 8165),
 ('awesome', 8048),
 ('cheese', 7995),
 ('make', 7937),
 ('favorite', 7925),
 ('people', 7846),
 ('pretty', 7839),
 ('recommend', 7583),
 ('dinner', 7448),
 ('night', 7443),
 ('salad', 7409),
 ('experience', 7309),
 ('sauce', 7278),
 ('right', 6984),
 ('excellent', 6899),
 ('meal', 6761)]

In [22]:
# Shut down the SparkContext now that the word counts have been computed.
sc.stop()

In [23]:
import pickle

# Dump the per-label top-n word lists to disk.
filename = 'most_common_n_dict'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(most_common_n_dict, outfile)