# Data Cleaning

In [165]:
import json
import pandas as pd
from datetime import datetime
import time
import numpy as np
from collections import defaultdict
import csv
import matplotlib.pyplot as plt
import string
from nltk.stem.porter import *
import nltk
from nltk.corpus import stopwords
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [126]:
# Read the review file line by line, evaluating each line as a Python
# expression, then keep only the object parsed from the first line.
# NOTE(review): eval() runs whatever code the file contains -- acceptable only
# for trusted data; consider ast.literal_eval / json.loads if the format allows.
with open("reviewsCA.json") as f:
    reviewsCA = [eval(line) for line in f]

reviewsCA = reviewsCA[0]

In [127]:
len(reviewsCA)

76254

In [143]:
reviewsCA[4]

{'rating': 5.0,
 'reviewerName': 'Anne Mason',
 'reviewText': "I've been here dozens of times, and I'll keep coming back. I really like the usuzukari and the Bruce roll",
 'categories': ['Sushi Restaurant', 'Asian Restaurant', 'Japanese Restaurant'],
 'gPlusPlaceId': '111971088396054239194',
 'unixReviewTime': 1394848761,
 'reviewTime': 'Mar 14, 2014',
 'gPlusUserId': '100000106576186066497',
 'gps': [[37.961078, -121.748886]],
 'price': ['$$']}

# Text Analysis

In [129]:
from textblob import TextBlob
#from textblob.sentiments import NaiveBayesAnalyzer

In [136]:
TextBlob("I like the food, but service is bad").sentiment[0] > 0

False

In [131]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [132]:
# all English stopwords
stop_words = set(stopwords.words("english"))

In [133]:
wordCount = defaultdict(int)
totalWords = 0

In [134]:
punct = string.punctuation
# stemmer = PorterStemmer()

In [137]:
# Count how often each non-stopword token occurs across all review texts.
# The counters are (re)initialised here so that re-running this cell -- for
# example after an interrupted run -- does not add to counts left over from
# a previous pass.
wordCount = defaultdict(int)
totalWords = 0

# Translation table that deletes every punctuation character in a single pass.
strip_punct = str.maketrans("", "", punct)

for d in reviewsCA:
    # lowercase, drop punctuation, then tokenize on whitespace
    tokens = d["reviewText"].lower().translate(strip_punct).split()
    for w in tokens:
        if w in stop_words:  # skip stopwords
            continue
        totalWords += 1
        wordCount[w] += 1

KeyboardInterrupt: 

In [10]:
totalWords

1623179

In [14]:
# (count, word) pairs ordered from the most to the least frequent word
counts = sorted(((n, w) for w, n in wordCount.items()), reverse=True)

In [18]:
counts[:100]

[(32377, 'food'),
 (23508, 'good'),
 (23220, 'great'),
 (20894, 'place'),
 (18005, 'service'),
 (10467, 'like'),
 (10051, 'best'),
 (9940, 'go'),
 (9939, 'get'),
 (9354, 'one'),
 (8617, 'time'),
 (7938, 'love'),
 (7594, 'really'),
 (7285, 'back'),
 (6937, 'always'),
 (6609, 'would'),
 (6562, 'nice'),
 (6376, 'restaurant'),
 (6261, 'dont'),
 (5970, 'pizza'),
 (5707, 'order'),
 (5493, 'staff'),
 (5491, 'friendly'),
 (5102, 'even'),
 (5011, 'people'),
 (5007, 'delicious'),
 (4943, 'also'),
 (4900, 'never'),
 (4756, 'excellent'),
 (4682, 'ive'),
 (4651, 'well'),
 (4409, 'got'),
 (4394, 'ever'),
 (4391, 'us'),
 (4351, 'chicken'),
 (4267, 'eat'),
 (4264, 'amazing'),
 (4162, 'better'),
 (4158, 'little'),
 (4027, 'try'),
 (4003, 'store'),
 (3939, 'come'),
 (3849, 'went'),
 (3776, 'wait'),
 (3735, 'menu'),
 (3643, 'make'),
 (3636, 'fresh'),
 (3597, 'im'),
 (3596, 'bad'),
 (3575, 'pretty'),
 (3547, 'going'),
 (3509, 'awesome'),
 (3493, 'much'),
 (3375, 'ordered'),
 (3361, 'experience'),
 (3299, 

In [19]:
counts[0]

(32377, 'food')

In [20]:
# vocabulary: the 1000 highest-ranked words from `counts`, in rank order
words = [word for _count, word in counts[:1000]]

In [116]:
words[:100]

['food',
 'good',
 'great',
 'place',
 'service',
 'like',
 'best',
 'go',
 'get',
 'one',
 'time',
 'love',
 'really',
 'back',
 'always',
 'would',
 'nice',
 'restaurant',
 'dont',
 'pizza',
 'order',
 'staff',
 'friendly',
 'even',
 'people',
 'delicious',
 'also',
 'never',
 'excellent',
 'ive',
 'well',
 'got',
 'ever',
 'us',
 'chicken',
 'eat',
 'amazing',
 'better',
 'little',
 'try',
 'store',
 'come',
 'went',
 'wait',
 'menu',
 'make',
 'fresh',
 'im',
 'bad',
 'pretty',
 'going',
 'awesome',
 'much',
 'ordered',
 'experience',
 'price',
 'way',
 'bar',
 'recommend',
 'didnt',
 'atmosphere',
 'know',
 'first',
 'could',
 'want',
 'definitely',
 'customer',
 'lunch',
 'favorite',
 'prices',
 'came',
 'coffee',
 'take',
 'said',
 'sushi',
 'made',
 'location',
 'worth',
 'area',
 'quality',
 'dinner',
 'every',
 'still',
 'everything',
 'long',
 'times',
 'right',
 'drinks',
 'around',
 'bit',
 'two',
 'day',
 'table',
 'minutes',
 'meal',
 'though',
 'night',
 'new',
 'small'

In [22]:
# word -> position in the vocabulary list, plus a set for membership tests
wordId = {word: idx for idx, word in enumerate(words)}
wordSet = set(words)

In [28]:
wordSet

{'professional',
 '9',
 'plate',
 'lot',
 'delicious',
 'found',
 'hour',
 'los',
 'werent',
 'quiet',
 'head',
 'changed',
 'honestly',
 'tastes',
 'immediately',
 'lots',
 'husband',
 'least',
 'want',
 'extremely',
 'poor',
 'stuff',
 'go',
 'seriously',
 'fried',
 'tip',
 'whatever',
 'attitude',
 'taco',
 'korean',
 'rib',
 'basically',
 'burrito',
 'chowder',
 'wait',
 'forgot',
 'star',
 'rice',
 'helpful',
 'valley',
 'hard',
 'mention',
 'dinner',
 'employee',
 'seems',
 'knew',
 'okay',
 'north',
 'rude',
 'dessert',
 'crust',
 'need',
 'world',
 'vegetarian',
 'afternoon',
 'make',
 'flavor',
 'tasty',
 'ahead',
 'popular',
 'mac',
 'hole',
 'hang',
 'completely',
 'waited',
 'dog',
 'little',
 'sitting',
 'soup',
 'amazing',
 'meat',
 'im',
 'area',
 'crowd',
 'dish',
 'ones',
 'affordable',
 'wanted',
 'options',
 'rather',
 'hit',
 'break',
 '50',
 'choice',
 'doesnt',
 'compared',
 'week',
 'cook',
 'veggie',
 'superb',
 'spend',
 'days',
 'santa',
 'wouldnt',
 'products

# Model Building

In [112]:
# Add derived fields to every review: text length, hour of the review
# timestamp (UTC), and a numeric price tier.
for review in reviewsCA:
    review['reviewLength'] = len(review['reviewText'])
    # hour of day (0-23) of the Unix timestamp, interpreted as UTC
    review['reviewHour'] = datetime.utcfromtimestamp(review['unixReviewTime']).hour
    # ['$'] -> 1, ['$$'] -> 2, anything else -> 3
    price = review['price']
    if price == ['$']:
        review['priceRank'] = 1
    elif price == ['$$']:
        review['priceRank'] = 2
    else:
        review['priceRank'] = 3

In [113]:
reviewsCA[3]

{'rating': 5.0,
 'reviewerName': 'Anne Mason',
 'reviewText': 'Great coffee and location! Check out the used book store across the street',
 'categories': ['Espresso Bar', 'Coffee Shop'],
 'gPlusPlaceId': '108196901293702895067',
 'unixReviewTime': 1379187049,
 'reviewTime': 'Sep 14, 2013',
 'gPlusUserId': '100000106576186066497',
 'gps': [[38.030536, -121.884024]],
 'price': ['$$$'],
 'reviewHour': 19,
 'priceRank': 3,
 'reviewLength': 74}

In [114]:
def feature(datum):
    """Build the feature vector for one review.

    Layout: one occurrence count per vocabulary word (slot given by
    ``wordId``), followed by a constant 1, the review hour and the price rank.
    Text is lowercased and stripped of punctuation before tokenizing on
    whitespace; tokens outside ``wordSet`` are ignored.
    """
    word_counts = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            word_counts[wordId[token]] += 1
    return word_counts + [1, datum['reviewHour'], datum['priceRank']]

In [115]:
# feature vectors and target ratings, one entry per review
X = list(map(feature, reviewsCA))
y = [review['rating'] for review in reviewsCA]

In [110]:
# 80% / 10% / 10% split into training, validation and test sets, taken in
# the existing order of the data (no shuffling)
n_reviews = len(reviewsCA)
train_end = int(0.8 * n_reviews)
valid_end = int(0.9 * n_reviews)

Xtrain, Xvalid, Xtest = X[:train_end], X[train_end:valid_end], X[valid_end:]
ytrain, yvalid, ytest = y[:train_end], y[train_end:valid_end], y[valid_end:]

In [102]:
Xtrain[101]

[1,
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [91]:
len(X)

76254

In [67]:
sum([10 in x for x in X])

31

In [41]:
len(X)

76254

In [42]:
len(y)

76254

In [80]:
len(Xtrain[0])

1001

In [111]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No separate intercept is fitted (fit_intercept=False).
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_  # learned weights
pred_train = clf.predict(Xtrain)  # predictions on the training set

In [None]:
# predictions of the fitted model on the validation split
pred_valid = clf.predict(Xvalid)

In [108]:
# one row per review, one column per key of the review dicts
df = pd.DataFrame(reviewsCA)

In [109]:
# Expand the Unix timestamp into calendar parts (all in UTC).
# The timestamp column is parsed once and formatted with the vectorised .dt
# accessor, instead of calling the deprecated datetime.utcfromtimestamp and
# strftime separately for every derived column of every row.
# year / month / day / hour remain zero-padded strings, weekday is the day name.
review_ts = pd.to_datetime(df["unixReviewTime"], unit="s")
df.assign(
    time=review_ts.dt.strftime("%Y-%m-%d %H:%M:%S"),
    year=review_ts.dt.strftime("%Y"),
    month=review_ts.dt.strftime("%m"),
    day=review_ts.dt.strftime("%d"),
    hour=review_ts.dt.strftime("%H"),
    weekday=review_ts.dt.strftime("%A"),
)

Unnamed: 0,rating,reviewerName,reviewText,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId,gps,price,time,year,month,day,hour,weekday
0,4.0,william spindler,Best War Wanton soup in Red Bluff,"[Asian Restaurant, Chinese Restaurant]",106591714648856494903,1394669496,"Mar 12, 2014",100000032416892623125,"[[40.179159, -122.236162]]",[$$$],2014-03-13 00:11:36,2014,03,13,00,Thursday
1,5.0,william spindler,This is a review that is long overdo. I've bee...,"[European Restaurant, Italian Restaurant, Pizz...",109420033090810328045,1394826388,"Mar 14, 2014",100000032416892623125,"[[40.178074, -122.235234]]",[$$],2014-03-14 19:46:28,2014,03,14,19,Friday
2,5.0,william spindler,"Long time favorite Mexican food, always consis...",[Mexican Restaurant],115827996910815192564,1394669713,"Mar 12, 2014",100000032416892623125,"[[40.175064, -122.242574]]",[$$],2014-03-13 00:15:13,2014,03,13,00,Thursday
3,5.0,Anne Mason,Great coffee and location! Check out the used ...,"[Espresso Bar, Coffee Shop]",108196901293702895067,1379187049,"Sep 14, 2013",100000106576186066497,"[[38.030536, -121.884024]]",[$$$],2013-09-14 19:30:49,2013,09,14,19,Saturday
4,5.0,Anne Mason,"I've been here dozens of times, and I'll keep ...","[Sushi Restaurant, Asian Restaurant, Japanese ...",111971088396054239194,1394848761,"Mar 14, 2014",100000106576186066497,"[[37.961078, -121.748886]]",[$$],2014-03-15 01:59:21,2014,03,15,01,Saturday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76249,1.0,Richard Wright,This food is disgusting... I am sat in the can...,"[Mexican Restaurant, Latin American Restaurant]",107540363462823386611,1334257465,"Apr 12, 2012",106390902683872086099,"[[33.344747, -118.326901]]",[$$$],2012-04-12 19:04:25,2012,04,12,19,Thursday
76250,5.0,Richard Wright,I've eaten Indian food all over the world. A g...,[Indian Restaurant],112541221653908428450,1360468376,"Feb 9, 2013",106390902683872086099,"[[38.74622, -121.260129]]",[$$],2013-02-10 03:52:56,2013,02,10,03,Sunday
76251,5.0,Anna Avila,I love this store I can spend hours there and ...,"[Home Improvement Store, Hardware Store, Lumbe...",100569007418359455686,1347304118,"Sep 10, 2012",106390985705853501877,"[[36.717324, -121.664507]]",[$$$],2012-09-10 19:08:38,2012,09,10,19,Monday
76252,3.0,Lizbethh Tovar,This place ive been since I was 5 I love this ...,"[Donut Shop, Dessert Shop]",110664528288564115219,1320593088,"Nov 6, 2011",106391014960118925409,"[[33.97497, -118.274235]]",[$$$],2011-11-06 15:24:48,2011,11,06,15,Sunday


In [9]:
# Top Categories
# Distinct category names in order of first appearance. dict.fromkeys
# de-duplicates in one pass (a list-membership test would rescan the list for
# every category), and `or []` skips reviews whose categories field is None.
categ_list = list(dict.fromkeys(
    c for d in reviewsCA for c in (d["categories"] or [])
))
categ = defaultdict(list)  # not used in the cells visible here; kept in case later cells rely on it
count = 0  # not used in the cells visible here; kept in case later cells rely on it

In [10]:
categ_list[:5]

['Asian Restaurant',
 'Chinese Restaurant',
 'European Restaurant',
 'Italian Restaurant',
 'Pizza Restaurant']

In [11]:
# Number of reviews tagged with each category in categ_list, computed in a
# single pass over the reviews instead of rescanning every review once per
# category.
known_categories = set(categ_list)
categ_num_review = defaultdict(int)
for d in reviewsCA:
    # dict.fromkeys drops repeated tags so a review counts at most once per
    # category; `or []` skips reviews whose categories field is None.
    for c in dict.fromkeys(d["categories"] or []):
        if c in known_categories:
            categ_num_review[c] += 1

In [12]:
categ_num_review

defaultdict(int,
            {'Asian Restaurant': 10423,
             'Chinese Restaurant': 3263,
             'European Restaurant': 7837,
             'Italian Restaurant': 6387,
             'Pizza Restaurant': 5223,
             'Mexican Restaurant': 6923,
             'Espresso Bar': 361,
             'Coffee Shop': 3446,
             'Sushi Restaurant': 2843,
             'Japanese Restaurant': 3744,
             'Latin American Restaurant': 6484,
             'Seafood Restaurant': 3769,
             'Chicken Restaurant': 1027,
             'American Restaurant': 12108,
             'Dessert Restaurant': 264,
             'Vegetarian Restaurant': 775,
             'Eclectic Restaurant': 377,
             'Cafe': 2640,
             'Bakery': 1405,
             'Nightlife': 56,
             'Deli': 1030,
             'Food Products Supplier': 12,
             'Fast Food Restaurant': 7291,
             'Clothing Store': 471,
             "Women's Clothing Store": 212,
             "

In [14]:
# (review count, category) pairs, count first so that sorting orders by count
top_categ = [(n, category) for category, n in categ_num_review.items()]

In [23]:
# order by review count, largest first, and keep the first 20 entries
top_categ = sorted(top_categ, reverse=True)
top20 = top_categ[:20]

(10423, 'Asian Restaurant')

# Regression Model: Sentiment (positive, negative), price, reviewLength, reviewHour

In [184]:
# Attach the regression features to every review: sign of the TextBlob
# polarity, text length, hour of the review timestamp (UTC) and price tier.
for review in reviewsCA:
    # Run the sentiment analysis once per review (the polarity used to be
    # recomputed for the second comparison) and keep only its sign.
    polarity = TextBlob(review["reviewText"]).sentiment[0]
    if polarity > 0:
        review['Sentiment'] = 1
    elif polarity < 0:
        review['Sentiment'] = -1
    else:
        review['Sentiment'] = 0

    review['reviewLength'] = len(review['reviewText'])
    # hour of day (0-23) of the Unix timestamp, interpreted as UTC
    review['reviewHour'] = datetime.utcfromtimestamp(review['unixReviewTime']).hour
    # ['$'] -> 1, ['$$'] -> 2, anything else -> 3
    price = review['price']
    review['priceRank'] = 1 if price == ['$'] else 2 if price == ['$$'] else 3

In [203]:
TextBlob("Have been going to this restaurant for years and always enjoyed fresh sliced turkey dinner with the sides and pie for dessert for a reasonable price.  On this Thanksgiving Day all four in our party ordered the above but received pressed turkey, cold and uncooked yams served in little cut up squares, uncooked veggies.  It was terrible and we sent it back and ordered the fresh cut ham but it was canned ham and the sides were the same as above.  They would not allow us to order from the regular menu and the charge for this terrible dinner was $24.95 each.  The apple pie tasted okay. They have lost mine and my entire family's business.  I know these had to be different owners than the ones in the past.  The staff was all Asian, either Chinese or Japanese so perhaps they just do not know how to run an American restaurant.  Of course I love both Chinese and Japanese food but not when I expect American food in an American restaurant on an American Thanksgiving Day.  Thank you for allowing me to express my opinion and disappointment in this traditional meal that my family and I have enjoyed for years there in the past.").sentiment

Sentiment(polarity=-0.03901515151515152, subjectivity=0.3356643356643356)

In [202]:
reviewsCA[10]

{'rating': 1.0,
 'reviewerName': 'Maryann Manisco',
 'reviewText': "Have been going to this restaurant for years and always enjoyed fresh sliced turkey dinner with the sides and pie for dessert for a reasonable price.  On this Thanksgiving Day all four in our party ordered the above but received pressed turkey, cold and uncooked yams served in little cut up squares, uncooked veggies.  It was terrible and we sent it back and ordered the fresh cut ham but it was canned ham and the sides were the same as above.  They would not allow us to order from the regular menu and the charge for this terrible dinner was $24.95 each.  The apple pie tasted okay. They have lost mine and my entire family's business.  I know these had to be different owners than the ones in the past.  The staff was all Asian, either Chinese or Japanese so perhaps they just do not know how to run an American restaurant.  Of course I love both Chinese and Japanese food but not when I expect American food in an American res

In [186]:
def feature(datum):
    """Return the feature vector for one review.

    Layout: ``[1, Sentiment, reviewLength, reviewHour, priceRank]`` -- a
    constant 1 followed by the four fields read from ``datum``.
    """
    return [
        1,
        datum['Sentiment'],
        datum['reviewLength'],
        datum['reviewHour'],
        datum['priceRank'],
    ]

In [187]:
# feature vectors and target ratings, one entry per review
X = list(map(feature, reviewsCA))
y = [review['rating'] for review in reviewsCA]

In [194]:
X[0]

[1, 2, 33, 0, 3]

In [188]:
# 80% / 10% / 10% split into training, validation and test sets, taken in
# the existing order of the data (no shuffling)
n_reviews = len(reviewsCA)
train_end = int(0.8 * n_reviews)
valid_end = int(0.9 * n_reviews)

Xtrain, Xvalid, Xtest = X[:train_end], X[train_end:valid_end], X[valid_end:]
ytrain, yvalid, ytest = y[:train_end], y[train_end:valid_end], y[valid_end:]

In [189]:
# Ridge regression: squared error plus an L2 penalty of strength 1.0.
# No separate intercept is fitted (fit_intercept=False).
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_  # learned weights

In [190]:
# predictions of the fitted model on the training split
pred_ytrain = clf.predict(Xtrain)

In [191]:
# mean squared error on the training split
MSEtrain = mean_squared_error(y_true=ytrain, y_pred=pred_ytrain)
MSEtrain

1.0207932785372438

In [192]:
# mean squared error on the validation split
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(y_true=yvalid, y_pred=pred_yvalid)
MSEvalid

1.0232904707520944

In [193]:
# mean squared error on the test split
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(y_true=ytest, y_pred=pred_ytest)
MSEtest

1.0181710027696955

# Regression Model: Sentiment (positive, negative, neutral), price (low, medium, high), reviewLength, reviewHour (0~23)
### One Hot Encoding