In [4]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import math
import nltk
import string
from nltk.stem.porter import *
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import mean_squared_error

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\szbdr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def readData(fname, startLine, endLine):
    count = 0
    for l in open(fname, encoding="UTF-8"):
        if count < startLine:
            count += 1
            continue
        elif count < endLine:
            l=l.replace(":null",'""')
            yield eval(l)
            count += 1
        else:
            break

In [6]:
review = list(readData("yelp_academic_dataset_review.json",0,100000))

In [7]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
stopwordSet = set()
#Process stopwords set
for sw in stopwords.words('english'):
    nonPunSW = ''.join([c for c in sw if not c in punctuation])
    stopwordSet.add(stemmer.stem(nonPunSW))

for d in review:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        if w not in stopwordSet:
            wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [8]:
words = [x[1] for x in counts[:2000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [9]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        if w not in stopwordSet:
            if w in wordSet:
                feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [10]:
# Run a linear model using the 2000 most popular words,
# stemmed, with stopwords removed, with no preclassification by category
X = [feature(d) for d in review]
y = [d['stars'] for d in review]

valReview = list(readData("yelp_academic_dataset_review.json",100000,125000))
valX = [feature(d) for d in valReview]
valy = [d['stars'] for d in valReview]

In [12]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
#Run MSE the trainning set with the model 1.0 alpha 
trainRMSE = math.sqrt(mean_squared_error(y, predictions))

valPredictions = clf.predict(valX)
#Run MSE the validation set with the model 1.0 alpha 
valRMSE = math.sqrt(mean_squared_error(valy, valPredictions))
trainRMSE, valRMSE

(1.0007913432419648, 1.0416184658743517)

In [13]:
clf10 = linear_model.Ridge(10.0, fit_intercept=False)
clf10.fit(X, y)
theta10 = clf10.coef_
predictions10 = clf10.predict(X)
#Run MSE the training set with the model 10.0 alpha 
trainRMSE10 =math.sqrt(mean_squared_error(y, predictions10))

valPredictions10 = clf10.predict(valX)
#Run MSE the validation set with the model 10.0 alpha 
valRMSE10 = math.sqrt(mean_squared_error(valy, valPredictions10))
trainRMSE10, valRMSE10

(1.0008011767214173, 1.0411741847591955)

In [14]:
clf100 = linear_model.Ridge(100.0, fit_intercept=False)
clf100.fit(X, y)
theta100 = clf100.coef_
predictions100 = clf100.predict(X)
#Run MSE the trainning set with the model 100.0 alpha 
trainRMSE100 = math.sqrt(mean_squared_error(y, predictions100))

valPredictions100 = clf100.predict(valX)
#Run MSE the validation set with the model 100.0 alpha 
valRMSE100 = math.sqrt(mean_squared_error(valy, valPredictions100))
trainRMSE100, valRMSE100

(1.0015330332357493, 1.0381008345891514)

In [15]:
clf01 = linear_model.Ridge(0.1, fit_intercept=False)
clf01.fit(X, y)
theta01 = clf01.coef_
predictions01 = clf01.predict(X)
#Run MSE the trainning set with the model 0.1 alpha 
trainRMSE01 = math.sqrt(mean_squared_error(y, predictions01))

valPredictions01 = clf01.predict(valX)
#Run MSE the validation set with the model 0.1 alpha 
valRMSE01 = math.sqrt(mean_squared_error(valy, valPredictions01))
trainRMSE01, valRMSE01

(1.0007912414330795, 1.0416646901545525)

In [16]:
clf001 = linear_model.Ridge(0.01, fit_intercept=False)
clf001.fit(X, y)
theta001 = clf001.coef_
predictions001 = clf001.predict(X)
#Run MSE the trainning set with the model 0.1 alpha 
trainRMSE001 = math.sqrt(mean_squared_error(y, predictions001))

valPredictions001 = clf001.predict(valX)
#Run MSE the validation set with the model 0.1 alpha 
valRMSE001 = math.sqrt(mean_squared_error(valy, valPredictions001))
trainRMSE001, valRMSE001

(1.0007912404113715, 1.0416693311105685)

In [17]:
def readUSrestaurants(fname):
    for l in open(fname):
        for rest in eval(l):
            yield rest

USrestaurants = set(readUSrestaurants('C:/Study/CSE 158/assignment2/CSE158-assignment2/USrestaurants.txt'))

In [29]:
import json
f = open('popularWords.json', 'w')
f.write(json.dumps(wordId))
f.close()

In [19]:
def readReviewsUSRest(fname):
    count = 0
    for l in open(fname, encoding="UTF-8"):
        if count == 100000:
            break
        l=l.replace(":null",'""')
        count += 1
        r = eval(l)
        if r['business_id'] in USrestaurants:
            yield r

USrestaurants = list(readReviewsUSRest("yelp_academic_dataset_review.json"))

In [37]:
def checkStar(datum, star):
    if datum['stars'] == star:
        return True
    else:
        return False
Stary = []
StarValy = []
StarSVM = []
for i in range(0,5):
    Stary.append( [checkStar(d, i+1) for d in review])
    StarValy.append([checkStar(d, i+1) for d in valReview])
    StarSVM.append(svm.LinearSVC(C=1.0, max_iter=100))
    StarSVM[i].fit(X, Stary[i])

In [39]:
StarTrainP = []
StarValP = []
for i in range(0,5):
    StarTrainP.append(StarSVM[i].predict(X))
    StarValP.append(StarSVM[i].predict(valX))

In [46]:
# Iterate through the confidence scores and find the predicted score with max confidence
maxConfPredictions = []
for i in range(0,len(y)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarTrainP[j][i] > maxConf:
            maxConf = StarTrainP[j][i]
            maxIndex = j
    maxConfPredictions.append(maxIndex+1)

In [47]:
#Do the same for validation set
valMaxConfPredictions = []
for i in range(0,len(valy)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarValP[j][i] > maxConf:
            maxConf = StarValP[j][i]
            maxIndex = j
    valMaxConfPredictions.append(maxIndex+1)

In [48]:
SVMRMSE1 = math.sqrt(mean_squared_error(y, maxConfPredictions))
valSVMRMSE1 = math.sqrt(mean_squared_error(valy, valMaxConfPredictions))
SVMRMSE1,valSVMRMSE1

(1.4765364878661142, 1.5100993344810136)

In [49]:
valMaxConfPredictions[:10], valy[:10]

([1, 1, 1, 5, 1, 3, 5, 5, 1, 1], [2, 5, 2, 5, 4, 4, 5, 5, 1, 5])

In [50]:
# Run 500 iterations
StarSVM500 = []
for i in range(0,5):
    StarSVM500.append(svm.LinearSVC(C=1.0, max_iter=500))
    StarSVM500[i].fit(X, Stary[i])

In [51]:
StarTrainP500 = []
StarValP500 = []
for i in range(0,5):
    StarTrainP500.append(StarSVM500[i].predict(X))
    StarValP500.append(StarSVM500[i].predict(valX))

In [52]:
# Iterate through the confidence scores and find the predicted score with max confidence
maxConfPredictions500 = []
for i in range(0,len(y)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarTrainP500[j][i] > maxConf:
            maxConf = StarTrainP500[j][i]
            maxIndex = j
    maxConfPredictions500.append(maxIndex+1)

In [53]:
#Do the same for validation set
valMaxConfPredictions500 = []
for i in range(0,len(valy)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarValP500[j][i] > maxConf:
            maxConf = StarValP500[j][i]
            maxIndex = j
    valMaxConfPredictions500.append(maxIndex+1)

In [54]:
SVMRMSE1500 = math.sqrt(mean_squared_error(y, maxConfPredictions500))
valSVMRMSE1500 = math.sqrt(mean_squared_error(valy, valMaxConfPredictions500))
SVMRMSE1500,valSVMRMSE1500

(1.5107018236568062, 1.537231277329472)

In [55]:
# Run 50 iterations
StarSVM50 = []
for i in range(0,5):
    StarSVM50.append(svm.LinearSVC(C=1.0, max_iter=50))
    StarSVM50[i].fit(X, Stary[i])

In [56]:
StarTrainP50 = []
StarValP50 = []
for i in range(0,5):
    StarTrainP50.append(StarSVM50[i].predict(X))
    StarValP50.append(StarSVM50[i].predict(valX))

In [60]:
# Iterate through the confidence scores and find the predicted score with max confidence
maxConfPredictions50 = []
for i in range(0,len(y)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarTrainP50[j][i] > maxConf:
            maxConf = StarTrainP50[j][i]
            maxIndex = j
    maxConfPredictions50.append(maxIndex+1)

In [61]:
#Do the same for validation set
valMaxConfPredictions50 = []
for i in range(0,len(valy)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarValP50[j][i] > maxConf:
            maxConf = StarValP50[j][i]
            maxIndex = j
    valMaxConfPredictions50.append(maxIndex+1)

In [62]:
SVMRMSE150 = math.sqrt(mean_squared_error(y, maxConfPredictions50))
valSVMRMSE150 = math.sqrt(mean_squared_error(valy, valMaxConfPredictions50))
SVMRMSE150,valSVMRMSE150

(1.3208292849569925, 1.3664552682030977)

In [63]:
# Run 30 iterations
StarSVM30 = []
for i in range(0,5):
    StarSVM30.append(svm.LinearSVC(C=1.0, max_iter=30))
    StarSVM30[i].fit(X, Stary[i])

In [64]:
StarTrainP30 = []
StarValP30 = []
for i in range(0,5):
    StarTrainP30.append(StarSVM30[i].predict(X))
    StarValP30.append(StarSVM30[i].predict(valX))

In [65]:
# Iterate through the confidence scores and find the predicted score with max confidence
maxConfPredictions30 = []
for i in range(0,len(y)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarTrainP30[j][i] > maxConf:
            maxConf = StarTrainP30[j][i]
            maxIndex = j
    maxConfPredictions30.append(maxIndex+1)

In [66]:
#Do the same for validation set
valMaxConfPredictions30 = []
for i in range(0,len(valy)):
    maxConf = -1.0
    maxIndex = 0
    for j in range(0,5):
        if StarValP30[j][i] > maxConf:
            maxConf = StarValP30[j][i]
            maxIndex = j
    valMaxConfPredictions30.append(maxIndex+1)

In [67]:
SVMRMSE130 = math.sqrt(mean_squared_error(y, maxConfPredictions30))
valSVMRMSE130 = math.sqrt(mean_squared_error(valy, valMaxConfPredictions30))
SVMRMSE130,valSVMRMSE130

(1.4986093553691704, 1.5170497684650954)