In [124]:
import gzip
import csv
import numpy
import math
from urllib.request import urlopen
import scipy.optimize
import random
from collections import defaultdict
import string

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn.naive_bayes import MultinomialNB

In [125]:
path = "datasets/review.json"
f = open(path, 'r', encoding='utf8')

In [126]:
dataset = []
i = 0
for line in f:
    if i >= 50000: break
    dataset.append(eval(line))
    i+=1

In [127]:
random.shuffle(dataset)
train = dataset[:25000]
valid = dataset[25000:37500]
test = dataset[37500:]

In [128]:
dataset[0]

{'review_id': 'LqmEjmxRDGqupCN2dPhODA',
 'user_id': 'X7mVSZxStPxD4GbN_29pYQ',
 'business_id': 'MfgewgMdqCx_JWk--6PgfA',
 'stars': 3.0,
 'useful': 2,
 'funny': 0,
 'cool': 0,
 'text': "Kretzler's Tavern has been part of the North Hills dining and drinking scene for as long as I can remember (40+ years).  The place has continued to evolve over time but maintained their German slant on beer and food.  I enjoyed a nice cold Hacker Pschorr Munich Gold draft on tap tonight with my fish sandwich and perfectly seasoned potato cake with applesauce.  They feature more than 27 beers on tap which is nice and have a full bar area with high top tables and some games. \n\nTheir fish batter is a tad bland but the fish was white, fresh, and meaty so that sort of made up for the batter.  The potato pancakes were exactly like my dear Mother used to make: shredded potatoes with minced onion and egg that is formed into a cake and pan fried on high heat to give the outside that crispy texture and keep the i

In [129]:
def MSE(pred, y):
    return numpy.square(numpy.subtract(y, pred)).mean() 

In [130]:
def calc_acc(pred,y):
    total = 0
    for i in range(0,len(y)):
        if pred[i] == y[i]:
            total += 1
    return total / len(y)

In [247]:
numTop = 5000

### Baseline

In [248]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        wordCount[w] += 1

In [249]:
len(wordCount)

157773

In [250]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [251]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [252]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [253]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [254]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)

In [255]:
MSE(pred, yvalid)

1.10032

In [256]:
calc_acc(pred, yvalid)

0.60272

### Filtering Punctuation

In [257]:
punctuation = set(string.punctuation)

In [258]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [259]:
len(wordCount)

76461

In [260]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [261]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [262]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [263]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [264]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [265]:
MSE(pred, yvalid)

1.09272

In [266]:
calc_acc(pred, yvalid)

0.5988

### Filtering Punctuation (treating punctuation marks as separate words)

In [151]:
punctuation = set(string.punctuation)

In [152]:
def handlePunc(d):
    r = ''
    for c in d['text'].lower():
        if not c in punctuation:
            r += c
        else:
            r += " " + c + " "
    return r

In [267]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        wordCount[w] += 1

In [268]:
len(wordCount)

56337

In [269]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [270]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [271]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [272]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [273]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [274]:
MSE(pred, yvalid)

1.06224

In [275]:
calc_acc(pred, yvalid)

0.60664

### Filtering Stopwords

In [162]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [276]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [277]:
len(wordCount)

157611

In [278]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [279]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [280]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [281]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [282]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [283]:
MSE(pred, yvalid)

1.05832

In [284]:
calc_acc(pred, yvalid)

0.60288

### Filtering Stemming

In [285]:
stemmer = PorterStemmer()

In [286]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [287]:
len(wordCount)

142783

In [288]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [289]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [290]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [291]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [292]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [293]:
MSE(pred, yvalid)

1.12224

In [294]:
calc_acc(pred, yvalid)

0.59728

### Punctuation and Stopwords

In [295]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [296]:
len(wordCount)

76320

In [297]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [298]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [299]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [300]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [301]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [302]:
MSE(pred, yvalid)

1.0664

In [303]:
calc_acc(pred, yvalid)

0.60008

### Punctuation (separate) and Stopwords

In [304]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [305]:
len(wordCount)

56186

In [306]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [307]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [308]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [309]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [310]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [311]:
MSE(pred, yvalid)

1.06288

In [312]:
calc_acc(pred, yvalid)

0.60632

### Punctuation and Stemming

In [313]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [314]:
len(wordCount)

59030

In [315]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [316]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [317]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [318]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [319]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [320]:
MSE(pred, yvalid)

1.11776

In [321]:
calc_acc(pred, yvalid)

0.59248

### Punctuation (separate) and Stemming

In [322]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [323]:
len(wordCount)

40053

In [324]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [325]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [326]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [327]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [328]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [329]:
MSE(pred, yvalid)

1.10944

In [330]:
calc_acc(pred, yvalid)

0.59944

### Stopwords and Stemming

In [331]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [332]:
len(wordCount)

142692

In [333]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [334]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [335]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [336]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [337]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [338]:
MSE(pred, yvalid)

1.11408

In [339]:
calc_acc(pred, yvalid)

0.5968

### Punctuation, Stopwords, and Stemming

In [340]:
stemmer = PorterStemmer()

In [341]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [342]:
len(wordCount)

58968

In [343]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [344]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [345]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [346]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [347]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [348]:
MSE(pred, yvalid)

1.12392

In [349]:
calc_acc(pred, yvalid)

0.59112

### Punctuation (separate), Stopwords, and Stemming

In [350]:
stemmer = PorterStemmer()

In [351]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [352]:
len(wordCount)

39978

In [353]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [354]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [355]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [356]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [357]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [358]:
MSE(pred, yvalid)

1.12184

In [359]:
calc_acc(pred, yvalid)

0.59728