In [132]:
# Import required packages
import json
import gzip
from collections import defaultdict
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy
import urllib
import scipy.optimize
import random
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import ast
from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

## Data Pre-Processing

In [2]:
# Read places data
# Load every place record: one record per line of the gzip archive.
# NOTE(review): eval() executes whatever text the file contains, so only run this
# against a trusted copy of the data set (ast.literal_eval is the safer parser if
# the lines are plain Python literals -- confirm before switching).
with gzip.open('places.clean.json.gz') as f:
    places_data = list(map(eval, f))
        
len(places_data)

3114353

In [3]:
# Keep only California places with non-empty gps and price
def placesFilter(dataset, lat_range=(32.32, 42), lon_range=(-124.26, -114.8)):
    """Return the places that have a price and fall inside a latitude/longitude box.

    Parameters
    ----------
    dataset : iterable of dict
        Place records; each must provide ``'gps'`` (``None`` or a
        ``[latitude, longitude]`` pair) and ``'price'``.
    lat_range, lon_range : (min, max) tuples, optional
        Inclusive bounds of the box. The defaults are the bounds this notebook
        uses for California.
        NOTE(review): the box is a plain rectangle, so it presumably also admits
        places just outside the state border -- confirm this is acceptable.

    Returns
    -------
    list of dict
        The matching records (same objects, original order).
    """
    lat_min, lat_max = lat_range
    lon_min, lon_max = lon_range

    places_ca = []
    for d in dataset:
        gps = d['gps']
        if gps is None:
            continue
        price = d['price']
        if price is None or price == '':
            continue
        # Chained comparisons replace the bitwise `&` of the original version.
        if lat_min <= gps[0] <= lat_max and lon_min <= gps[1] <= lon_max:
            places_ca.append(d)
    return places_ca

places_ca = placesFilter(places_data)

len(places_ca)

48865

In [4]:
# Read 4 million reviews data
# Parse at most the first 4,000,000 review records from the archive.
# Lines are streamed one at a time: the earlier `[next(f) for ...]` form failed with
# StopIteration when the file held fewer lines and kept every raw line in memory
# next to the parsed records.
reviews_data = []
with gzip.open("reviews.clean.json.gz") as f:
    for line_no, l in enumerate(f):
        if line_no >= 4000000:
            break
        # NOTE(review): eval() executes the file's contents -- trusted data only.
        reviews_data.append(eval(l))

len(reviews_data)

4000000

In [5]:
# Assign gps and price to each place ID
# Index the filtered places by their id. Both lookups map id -> list of values,
# so an id that occurs once maps to a one-element list (e.g. [[lat, lon]] and ['$$']).
placesGPS = defaultdict(list)
placesPrice = defaultdict(list)
for place in places_ca:
    place_id = place['gPlusPlaceId']
    coords = place['gps']
    cost = place['price']
    placesGPS[place_id].append(coords)
    placesPrice[place_id].append(cost)

In [6]:
len(placesGPS)

48865

In [7]:
len(placesPrice)

48865

In [8]:
# Keep the reviews whose place id is in the places index and copy that place's
# gps / price lists onto the review. The review dicts are modified in place.
reviews_ca = []
for review in reviews_data:
    place_id = review['gPlusPlaceId']
    if place_id not in placesGPS:
        continue
    review['gps'] = placesGPS[place_id]
    review['price'] = placesPrice[place_id]
    reviews_ca.append(review)

In [9]:
len(reviews_ca)

125947

In [10]:
# Drop reviews in which any of rating, categories, reviewer name or review text
# is missing (None) or an empty string.
reviewsCA = [
    review
    for review in reviews_ca
    if all(
        review[field] is not None and review[field] != ''
        for field in ('rating', 'categories', 'reviewerName', 'reviewText')
    )
]

In [11]:
len(reviewsCA)

76254

In [12]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$']}

In [None]:
# Cache the filtered reviews: the whole list is serialised as ONE JSON array on a
# single line, UTF-8 encoded and gzip-compressed.
payload = (json.dumps(reviewsCA) + "\n").encode('utf-8')
with gzip.open('reviewsCA.json.gz', 'wb') as fout:
    fout.write(payload)

In [None]:
# Read the cached reviews back, one JSON document per line.
# The cache is produced with json.dumps, so it has to be decoded as JSON: eval()
# fails on JSON-only tokens such as null / true / false and would also execute
# arbitrary text found in the file.
reviews_CA = []
with gzip.open('reviewsCA.json.gz') as f:
    for l in f:
        reviews_CA.append(json.loads(l))

In [None]:
datCA = reviews_CA[0]

In [None]:
len(datCA)

In [None]:
datCA[-5]

In [None]:
# Distinct category labels, in order of first appearance.
# dict.fromkeys de-duplicates in a single pass while keeping insertion order; the
# previous `if w not in catList` test rescanned the growing list for every label.
catList = list(dict.fromkeys(w for d in datCA for w in d['categories']))

In [None]:
catList[0:5]

In [None]:
# Number of reviews per category. A review counts once for a category even if the
# label is repeated in its list (dict.fromkeys removes repeats, keeping order).
# One pass over the reviews replaces a full scan of all reviews for every category.
numRevCat = defaultdict(int)
for d in datCA:
    for c in dict.fromkeys(d['categories']):
        numRevCat[c] += 1

In [None]:
# (review count, category) pairs; sorting them in descending order ranks the
# categories by popularity, ties broken by category name (also descending).
rankCat = [(count, category) for category, count in numRevCat.items()]
top100 = sorted(rankCat, reverse=True)[:100]

In [None]:
top100

In [None]:
dfCA = pd.DataFrame.from_dict(reviewsCA)

In [None]:
dfCA

In [None]:
a = datetime.fromtimestamp(1394826388)

In [None]:
datetime.utcfromtimestamp(1394826388).strftime('%Y-%m-%d %H:%M:%S')[5:7]

In [None]:
datetime.utcfromtimestamp(1394826388).strftime('%A')

In [None]:
# Derive calendar columns from the Unix review timestamp (seconds, read as UTC).
# The timestamp is parsed once for the whole column instead of four times per row
# through datetime.utcfromtimestamp; missing timestamps become missing values in the
# derived columns rather than raising.
review_time = pd.to_datetime(dfCA['unixReviewTime'], unit='s')
dfClean = dfCA.assign(
    Hour=review_time.dt.strftime('%H'),      # zero-padded string, '00'..'23'
    Year=review_time.dt.strftime('%Y'),      # four-digit string
    Month=review_time.dt.strftime('%m'),     # zero-padded string, '01'..'12'
    Weekday=review_time.dt.strftime('%A'),   # full weekday name
)

In [None]:
dfClean.isnull().values.any()

In [None]:
dfClean.to_csv('dfClean.csv')

## Text & Prediction

In [None]:
reviewsCA[0]

In [47]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
stop_words = stopwords.words('english')

In [49]:
len(stop_words)

179

In [50]:
# Count how often each non-stop-word occurs across all review texts.
# Text is lower-cased, punctuation characters are deleted, and the rest is split on
# whitespace.
wordCount = defaultdict(int)
totalWords = 0
punct = string.punctuation

stop_word_set = set(stop_words)             # set lookup instead of scanning the list per token
strip_punct = str.maketrans('', '', punct)  # translation table that deletes punctuation

for d in reviewsCA:
    words = d['reviewText'].lower().translate(strip_punct).split()
    for w in words:
        if w not in stop_word_set:
            totalWords += 1
            wordCount[w] += 1

In [51]:
totalWords

1623179

In [52]:
len(wordCount)

60413

In [None]:
wordCount

In [53]:
counts = [(wordCount[w] , w) for w in wordCount]

In [54]:
counts.sort(reverse=True)
counts

[(32377, 'food'),
 (23508, 'good'),
 (23220, 'great'),
 (20894, 'place'),
 (18005, 'service'),
 (10467, 'like'),
 (10051, 'best'),
 (9940, 'go'),
 (9939, 'get'),
 (9354, 'one'),
 (8617, 'time'),
 (7938, 'love'),
 (7594, 'really'),
 (7285, 'back'),
 (6937, 'always'),
 (6609, 'would'),
 (6562, 'nice'),
 (6376, 'restaurant'),
 (6261, 'dont'),
 (5970, 'pizza'),
 (5707, 'order'),
 (5493, 'staff'),
 (5491, 'friendly'),
 (5102, 'even'),
 (5011, 'people'),
 (5007, 'delicious'),
 (4943, 'also'),
 (4900, 'never'),
 (4756, 'excellent'),
 (4682, 'ive'),
 (4651, 'well'),
 (4409, 'got'),
 (4394, 'ever'),
 (4391, 'us'),
 (4351, 'chicken'),
 (4267, 'eat'),
 (4264, 'amazing'),
 (4162, 'better'),
 (4158, 'little'),
 (4027, 'try'),
 (4003, 'store'),
 (3939, 'come'),
 (3849, 'went'),
 (3776, 'wait'),
 (3735, 'menu'),
 (3643, 'make'),
 (3636, 'fresh'),
 (3597, 'im'),
 (3596, 'bad'),
 (3575, 'pretty'),
 (3547, 'going'),
 (3509, 'awesome'),
 (3493, 'much'),
 (3375, 'ordered'),
 (3361, 'experience'),
 (3299, 

In [55]:
len(counts)

60413

In [56]:
words = [w[1] for w in counts[:1000]]

In [57]:
len(words)

1000

In [58]:
# Vocabulary lookups: word -> column index in the feature vector, plus a set for
# fast membership tests.
wordId = {w: idx for idx, w in enumerate(words)}
wordSet = set(words)

In [None]:
wordId

In [59]:
len(wordId)

1000

In [60]:
len(wordSet)

1000

In [61]:
wordSet

{'tip',
 'lobster',
 'sandwiches',
 'gem',
 'burger',
 'idea',
 'write',
 'las',
 'added',
 'noisy',
 'far',
 'since',
 'watching',
 'small',
 'employees',
 'full',
 'served',
 'purchase',
 'return',
 'today',
 'reservation',
 'orange',
 'best',
 'way',
 'cashier',
 'muy',
 'hang',
 'pie',
 'drinks',
 '10',
 'soup',
 'pad',
 'left',
 'company',
 'group',
 'beat',
 'ramen',
 'dog',
 'cook',
 'something',
 'garlic',
 'outdoor',
 'remember',
 'japanese',
 'stand',
 'show',
 'waiter',
 'online',
 'hard',
 'youll',
 'homemade',
 'ever',
 'date',
 'foods',
 'heard',
 'walking',
 'us',
 'high',
 'loved',
 'fried',
 'affordable',
 'mcdonalds',
 'looks',
 'quick',
 'bland',
 'beautiful',
 'floor',
 'plain',
 'san',
 'beans',
 'warm',
 'vegan',
 'avoid',
 'ate',
 'opinion',
 'impressed',
 'enjoy',
 '3',
 'rare',
 'waste',
 'dining',
 'chili',
 'bacon',
 'combo',
 'rush',
 'honestly',
 'wrong',
 'use',
 'walk',
 'music',
 'arent',
 'weeks',
 'asking',
 'choice',
 'wow',
 'crispy',
 'usual',
 '45'

### Baseline Model  f(word vector) -> rating
(Compare with other advanced models by comparing MSE)

In [111]:
def feature(datum):
    """Build the baseline feature vector for one review.

    The review text is lower-cased, the characters in ``punct`` are removed and the
    rest is split on whitespace. Every token that is in the notebook-level vocabulary
    (``wordSet``) increments the slot given by ``wordId``; other tokens are ignored.
    A constant 1 is appended as the offset term.

    Returns a list of ``len(wordSet) + 1`` numbers.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.append(1)
    return feat

In [112]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [113]:
reviewsCA[0]

{'rating': 4.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Best War Wanton soup in Red Bluff',
 'categories': ['Asian Restaurant', 'Chinese Restaurant'],
 'gPlusPlaceId': '106591714648856494903',
 'unixReviewTime': 1394669496,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.179159, -122.236162]],
 'price': ['$$$'],
 'reviewLength': 33,
 'reviewHour': '00',
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 0.0, 1.0]}

In [None]:
len(X)

In [None]:
sum([2 in x for x in X])

In [None]:
len(y)

In [114]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [None]:
ytrain[0]

In [115]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [130]:
pred_ytrain

array([4.18268486, 4.79245583, 4.44904744, ..., 2.78823577, 1.69340961,
       2.91116113])

In [116]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

0.8449665591826174

In [117]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

0.888903371127503

In [118]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

0.8539993216648606

### Advanced Model 1 (Better than baseline) f(word vector, review length, hour, price) -> rating

In [119]:
# Add the derived fields used by the advanced models (each review dict is modified in place):
#   reviewLength - number of characters in the review text
#   reviewHour   - hour of day of the review timestamp as an int, read as UTC
#   priceRank    - 1 for ['$'], 2 for ['$$'], 3 for any other price value
for review in reviewsCA:
    review['reviewLength'] = len(review['reviewText'])
    review['reviewHour'] = datetime.utcfromtimestamp(review['unixReviewTime']).hour
    tier = review['price']
    review['priceRank'] = 1 if tier == ['$'] else 2 if tier == ['$$'] else 3

In [120]:
len(reviewsCA)

76254

In [121]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': 0,
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0],
 'priceRank': 2}

In [122]:
def feature(datum):
    """Feature vector for Advanced Model 1: word counts plus three review attributes.

    Layout: one count per vocabulary word (``wordSet`` / ``wordId``), then a constant 1,
    then ``reviewLength``, ``reviewHour`` and ``priceRank`` taken from ``datum`` as-is.
    Tokens are produced by lower-casing the text, removing the characters in ``punct``
    and splitting on whitespace; tokens outside the vocabulary are ignored.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.extend([1, datum['reviewLength'], datum['reviewHour'], datum['priceRank']])
    return feat

In [123]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [124]:
X[2][-4:]

[1, 69, 0, 2]

In [None]:
y[2]

In [125]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [126]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [127]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

0.8423027028004455

In [128]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

0.8846847452724249

In [129]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

0.8506844249140023

### Advanced Model 2 (Better than baseline but can't beat Advanced Model 1) f(word vector, hour, price) -> rating

In [None]:
def feature(datum):
    """Feature vector for Advanced Model 2: word counts, hour and price rank.

    Layout: one count per vocabulary word (``wordSet`` / ``wordId``), then a constant 1,
    then ``reviewHour`` and ``priceRank`` from ``datum``. Review length is deliberately
    left out of this variant.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.extend([1, datum['reviewHour'], datum['priceRank']])
    return feat

In [None]:
reviewsCA[2]

In [None]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [None]:
X[2][-5:]

In [None]:
len(X)

In [None]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [None]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [None]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

In [None]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

In [None]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

### Advanced Model 3 (Not better than baseline) f(review length, hour, price) -> rating

In [None]:
def feature(datum):
    """Feature vector for Advanced Model 3: no text features.

    Returns ``[1, reviewLength, reviewHour, priceRank]`` with the three values read
    from ``datum`` unchanged; the leading 1 is the offset term.
    """
    attribute_names = ('reviewLength', 'reviewHour', 'priceRank')
    return [1] + [datum[name] for name in attribute_names]

In [None]:
# Build the design matrix and targets from the filtered California reviews.
# (`dataset` is not defined at notebook level, so the previous version raised a
# NameError; every other model in this notebook is built from reviewsCA.)
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [None]:
X[2]

In [None]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [None]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [None]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

In [None]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

In [None]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

### Advanced Model 4 (Not better than baseline) remove neutral words f(word vector, review length, hour, price) -> rating

In [None]:
from textblob import TextBlob

In [None]:
# Count only the words that are not stop words AND have a non-zero TextBlob polarity.
# The polarity test is cached per distinct word: the previous version built a TextBlob
# for every single token occurrence in the corpus.
wordCount = defaultdict(int)
totalWords = 0
punct = string.punctuation

stop_word_set = set(stop_words)  # set lookup instead of scanning the list per token
has_polarity = {}                # word -> True when TextBlob polarity != 0

for d in reviewsCA:
    t = d['reviewText']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    for w in words:
        if w in stop_word_set:
            continue
        if w not in has_polarity:
            has_polarity[w] = TextBlob(w).sentiment[0] != 0
        if has_polarity[w]:
            totalWords += 1
            wordCount[w] += 1

In [None]:
totalWords

In [None]:
len(wordCount)

In [None]:
wordCount

In [None]:
counts = [(wordCount[w] , w) for w in wordCount]

In [None]:
counts.sort(reverse=True)
len(counts)

In [None]:
counts

In [None]:
words = [w[1] for w in counts]

In [None]:
words

In [None]:
# Rebuild the vocabulary lookups from the current `words` list:
# word -> column index, plus a set for fast membership tests.
wordId = {w: idx for idx, w in enumerate(words)}
wordSet = set(words)

In [None]:
len(wordSet)

In [None]:
def feature(datum):
    """Feature vector for Advanced Model 4: word counts plus three review attributes.

    Same layout as Advanced Model 1 -- one count per word of the current vocabulary
    (``wordSet`` / ``wordId``), a constant 1, then ``reviewLength``, ``reviewHour`` and
    ``priceRank`` -- but evaluated against whatever vocabulary is defined when it runs.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.extend([1, datum['reviewLength'], datum['reviewHour'], datum['priceRank']])
    return feat

In [None]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [None]:
len(X[2])

In [None]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [None]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [None]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

In [None]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

In [None]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

### Only word vector (Not better than baseline)

In [None]:
def feature(datum):
    """Word-count-only feature vector.

    One count per word of the current vocabulary (``wordSet`` / ``wordId``) followed by
    a constant 1. Length, hour and price are not used in this variant.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.append(1)
    return feat

In [None]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [None]:
len(X[2])

In [None]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [None]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [None]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

In [None]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

In [None]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

### Advanced Model 5

In [None]:
len(reviewsCA)

In [134]:
for i in range(0, len(reviewsCA)):
    # Label the review by the sign of its TextBlob polarity. The polarity is computed
    # once per review (the previous one-liner could analyse the same text twice).
    polarity = TextBlob(reviewsCA[i]['reviewText']).sentiment[0]
    if polarity > 0:
        sentiment_label = 'positive'
    elif polarity < 0:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'
    reviewsCA[i].update({'reviewSentiment': sentiment_label})

In [135]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': 0,
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0],
 'priceRank': 2,
 'reviewSentiment': 'positive'}

In [136]:
# Sentiment label of every review, in review order.
sent = [review['reviewSentiment'] for review in reviewsCA]

In [137]:
len(sent)

76254

In [148]:
sent[-10:]

['positive',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative']

In [140]:
valuesS = array(sent)

In [141]:
# One-hot encode the sentiment labels in two steps: labels -> integer codes,
# then integer codes (as a single column) -> dense one-hot rows.
label_encoderS = LabelEncoder()
onehot_encoderS = OneHotEncoder(sparse=False)

integer_encodedS = label_encoderS.fit_transform(valuesS).reshape(-1, 1)
onehot_encodedS = onehot_encoderS.fit_transform(integer_encodedS)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [149]:
onehot_encodedS[-7]

array([0., 1., 0.])

In [150]:
# Store each review's one-hot sentiment row on the review dict as a plain list.
for idx, review in enumerate(reviewsCA):
    review['hotSentiment'] = list(onehot_encodedS[idx])

In [151]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': 0,
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0],
 'priceRank': 2,
 'reviewSentiment': 'positive',
 'hotSentiment': [0.0, 0.0, 1.0]}

In [152]:
def feature(datum):
    """Feature vector for Advanced Model 5: no word counts.

    Returns ``[1, reviewLength]`` followed by the concatenated one-hot lists stored
    under ``hotHour``, ``hotPrice`` and ``hotSentiment`` in ``datum``.
    """
    feat = [1, datum['reviewLength']]
    feat.extend(datum['hotHour'] + datum['hotPrice'] + datum['hotSentiment'])
    return feat

In [153]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [154]:
X[2]

[1,
 69,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0]

In [155]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [156]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [157]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

1.0154684598335826

In [158]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

1.0191316967320767

In [159]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

1.0158644305913098

### Modified Advanced Model 1: one-hot encoding for price and hour

In [None]:
data = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
values = array(data)

In [None]:
values

In [None]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

In [None]:
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

In [None]:
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

In [72]:
# Store the review length and the hour of day on every review (dicts modified in place).
# Here reviewHour is the zero-padded two-character string ('00'..'23', UTC), whereas the
# Advanced Model 1 cell stores it as an int -- whichever cell ran last wins.
for review in reviewsCA:
    review['reviewLength'] = len(review['reviewText'])
    review['reviewHour'] = datetime.utcfromtimestamp(review['unixReviewTime']).strftime('%H')

In [73]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': '00',
 'hotHour': [0.0, 1.0, 0.0],
 'hotPrice': [0.0, 1.0, 0.0]}

In [74]:
# Hour-of-day value of every review, in review order.
hour = [review['reviewHour'] for review in reviewsCA]

In [75]:
len(hour)

76254

In [76]:
hour[:10]

['00', '19', '00', '19', '01', '04', '20', '20', '18', '02']

In [77]:
values = array(hour)

In [78]:
# One-hot encode the hour values in two steps: values -> integer codes,
# then integer codes (as a single column) -> dense one-hot rows.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [79]:
onehot_encoded[76253]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [80]:
# Store each review's one-hot hour row on the review dict as a plain list.
for idx, review in enumerate(reviewsCA):
    review['hotHour'] = list(onehot_encoded[idx])

In [81]:
reviewsCA[5]

{'rating': 3.0,
 'reviewerName': 'William Corcuera',
 'reviewText': 'Finally came back to this place after so many years, atmosphere was ok anyways I ordered two shrimp tacos and a lobster taco. My shrimp tacos had about only 2-3 shrimps the rest was soggy bell peppers and a broken up corn tortilla, my lobster taco was a lil better.',
 'categories': ['Latin American Restaurant',
  'Mexican Restaurant',
  'Seafood Restaurant'],
 'gPlusPlaceId': '101788547508969264434',
 'unixReviewTime': 1347079555,
 'reviewTime': 'Sep 7, 2012',
 'gPlusUserId': '100000122158721897485',
 'gps': [[33.20043, -117.331493]],
 'price': ['$$$'],
 'reviewLength': 265,
 'reviewHour': '04',
 'hotHour': [0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 0.0, 1.0]}

In [82]:
# Price value of every review, in review order (each entry is itself a list, e.g. ['$$']).
price = [review['price'] for review in reviewsCA]

In [83]:
len(price)

76254

In [84]:
price[:10]

[['$$$'],
 ['$$'],
 ['$$'],
 ['$$$'],
 ['$$'],
 ['$$$'],
 ['$$$'],
 ['$$$'],
 ['$$'],
 ['$$']]

In [85]:
valuesP = array(price)

In [86]:
valuesP[:10]

array([['$$$'],
       ['$$'],
       ['$$'],
       ['$$$'],
       ['$$'],
       ['$$$'],
       ['$$$'],
       ['$$$'],
       ['$$'],
       ['$$']], dtype='<U3')

In [87]:
# One-hot encode the price levels: labels -> integer codes -> dense one-hot rows.
label_encoderP = LabelEncoder()
# valuesP is built from one-element lists, so it is a column of shape (n, 1).
# LabelEncoder expects a 1-D array; flattening it here avoids the
# "y = column_or_1d(y, warn=True)" conversion warning and yields the same codes.
integer_encodedP = label_encoderP.fit_transform(valuesP.ravel())

onehot_encoderP = OneHotEncoder(sparse=False)
integer_encodedP = integer_encodedP.reshape(len(integer_encodedP), 1)
onehot_encodedP = onehot_encoderP.fit_transform(integer_encodedP)

  y = column_or_1d(y, warn=True)
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [89]:
len(onehot_encodedP)

76254

In [90]:
onehot_encodedP[:10]

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [91]:
# Store each review's one-hot price row on the review dict as a plain list.
for idx, review in enumerate(reviewsCA):
    review['hotPrice'] = list(onehot_encodedP[idx])

In [92]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': '00',
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0]}

In [98]:
def feature(datum):
    """Feature vector for the one-hot variant of Advanced Model 1.

    Layout: one count per vocabulary word (``wordSet`` / ``wordId``), then a constant 1,
    ``reviewLength``, and the concatenated one-hot lists stored under ``hotHour`` and
    ``hotPrice`` in ``datum``. Tokens come from the lower-cased text with the characters
    in ``punct`` removed, split on whitespace.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.extend([1, datum['reviewLength']])
    feat.extend(datum['hotHour'] + datum['hotPrice'])
    return feat

In [99]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [102]:
X[2][-50:]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 69,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0]

In [103]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [107]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [108]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

0.8414252409564961

In [109]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

0.8856883012161867

In [110]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

0.8514769098811863

### Advanced Model 6 f(word vector, review length, hour, price) -> rating one hot for hour, continuous for price

In [160]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': 0,
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0],
 'priceRank': 2,
 'reviewSentiment': 'positive',
 'hotSentiment': [0.0, 0.0, 1.0]}

In [161]:
len(wordSet)

1000

In [164]:
def feature(datum):
    """Feature vector for Advanced Model 6: one-hot hour, numeric price rank.

    Layout: one count per vocabulary word (``wordSet`` / ``wordId``), then a constant 1,
    ``reviewLength``, ``priceRank``, and the one-hot list stored under ``hotHour``.
    Tokens come from the lower-cased text with the characters in ``punct`` removed,
    split on whitespace.
    """
    feat = [0] * len(wordSet)
    cleaned = ''.join(ch for ch in datum['reviewText'].lower() if ch not in punct)
    for token in cleaned.strip().split():
        if token in wordSet:
            feat[wordId[token]] += 1
    feat.extend([1, datum['reviewLength'], datum['priceRank']])
    feat.extend(datum['hotHour'])
    return feat

In [165]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [169]:
X[2][-50:]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 69,
 2,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [170]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [171]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [172]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

0.8416214555170666

In [173]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

0.8857318968965592

In [174]:
pred_ytest = clf.predict(Xtest)
MSEtest = mean_squared_error(ytest, pred_ytest)
MSEtest

0.8518571481258606

### Sentiment

In [175]:
def feature(datum):
    """Feature vector for the sentiment model: no word counts.

    Returns ``[1, reviewLength, priceRank]`` followed by the concatenated one-hot lists
    stored under ``hotHour`` and ``hotSentiment`` in ``datum``.
    """
    feat = [1, datum['reviewLength'], datum['priceRank']]
    feat.extend(datum['hotHour'] + datum['hotSentiment'])
    return feat

In [176]:
X = [feature(d) for d in reviewsCA]
y = [d['rating'] for d in reviewsCA]

In [178]:
reviewsCA[2]

{'rating': 5.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Long time favorite Mexican food, always consistent and great tasting.',
 'categories': ['Mexican Restaurant'],
 'gPlusPlaceId': '115827996910815192564',
 'unixReviewTime': 1394669713,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125',
 'gps': [[40.175064, -122.242574]],
 'price': ['$$'],
 'reviewLength': 69,
 'reviewHour': 0,
 'hotHour': [1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'hotPrice': [0.0, 1.0, 0.0],
 'priceRank': 2,
 'reviewSentiment': 'positive',
 'hotSentiment': [0.0, 0.0, 1.0]}

In [177]:
X[2]

[1,
 69,
 2,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0]

In [179]:
Xtrain = X[:int(0.8*len(X))]
Xvalid = X[int(0.8*len(X)):int(0.9*len(X))]
Xtest = X[int(0.9*len(X)):]

ytrain = y[:int(0.8*len(y))]
yvalid = y[int(0.8*len(y)):int(0.9*len(y))]
ytest = y[int(0.9*len(y)):]

In [180]:
clf = linear_model.Ridge(1, fit_intercept=False)
clf.fit(Xtrain, ytrain)
theta = clf.coef_
pred_ytrain = clf.predict(Xtrain)

In [181]:
MSEtrain = mean_squared_error(ytrain, pred_ytrain)
MSEtrain

1.015706330170869

In [182]:
pred_yvalid = clf.predict(Xvalid)
MSEvalid = mean_squared_error(yvalid, pred_yvalid)
MSEvalid

1.0194616205795126