In [1]:
import csv
import gzip
import json
import math
import random
import string
from collections import defaultdict
from urllib.request import urlopen

import nltk
import numpy
import scipy.optimize
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn import svm
from sklearn import linear_model
from sklearn import metrics

In [2]:
# Relative location of the review dump; `f` is a UTF-8 text handle on it.
# NOTE(review): the handle is opened without a `with` block, so nothing in this
# cell closes it.
path = "datasets/review.json"
f = open(path, 'r', encoding='utf8')

In [3]:
# Parse up to 50,000 reviews from the open handle `f`, one JSON object per line.
# json.loads replaces eval(): eval executes whatever expression the file
# contains and cannot parse JSON literals such as true/false/null.
dataset = []
for line in f:
    if len(dataset) >= 50000:
        break
    if not line.strip():
        continue  # tolerate blank lines instead of failing to parse them
    dataset.append(json.loads(line))
# Release the handle now that loading is finished; re-run the cell that opens
# `f` before re-running this one.
f.close()

In [4]:
# Shuffle in place with a dedicated fixed-seed generator so the split is the
# same on every Restart & Run All (the global `random` state is left untouched).
random.Random(0).shuffle(dataset)
# Split by position: first 25,000 for training, next 12,500 for validation,
# everything after index 37,500 for testing.
train = dataset[:25000]
valid = dataset[25000:37500]
test = dataset[37500:]

In [149]:
dataset[0]

{'review_id': 'avd_cNedr88c1TpiqrlnqQ',
 'user_id': '4NE2J0tiy6ATs1zJrNBZOg',
 'business_id': 'gH3w0VDb1pTapVLAFqV28g',
 'stars': 1.0,
 'useful': 0,
 'funny': 2,
 'cool': 0,
 'text': 'The long haired dude that served me looked and acted like he wanted to chop me into little pieces with a machete, so I just left. \nThanks for nothing :)',
 'date': '2016-09-18 16:17:48'}

In [151]:
def MSE(pred, y):
    """Return the mean squared error between predictions and targets.

    Args:
        pred: predicted values (anything numpy.subtract accepts).
        y: target values, broadcast-compatible with ``pred``.

    Returns:
        The mean of the element-wise squared differences ``(y - pred) ** 2``.
    """
    residuals = numpy.subtract(y, pred)
    squared = numpy.square(residuals)
    return squared.mean()

In [152]:
def calc_acc(pred,y):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        pred: indexable predictions, at least as long as ``y``.
        y: true labels; an empty ``y`` raises ZeroDivisionError.

    Returns:
        Number of exact matches divided by ``len(y)``.
    """
    n = len(y)
    hits = sum(1 for idx in range(n) if pred[idx] == y[idx])
    return hits / n

In [713]:
numTop = 1000

### Baseline

In [714]:
# Tally every lowercase, whitespace-delimited token across the reviews.
# NOTE(review): the counts are taken over all of `dataset`, not only `train`,
# so the vocabulary also reflects validation/test text — confirm this is intended.
wordCount = defaultdict(int)
for review in dataset:
    tokens = review['text'].lower().split()
    for token in tokens:
        wordCount[token] += 1

In [715]:
len(wordCount)

157773

In [716]:
# (frequency, token) pairs ordered from most to least frequent; equal
# frequencies fall back to descending token order via tuple comparison.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [717]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [718]:
def wordbag(d):
    """Return the bag-of-words count vector for one review.

    Tokens are the lowercase, whitespace-delimited pieces of ``d['text']``.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order, giving how often that token occurs in the review.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        # Hash lookup on the dict; testing against the `top_unigrams` list
        # would be a linear scan for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [719]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [720]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [721]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [722]:
pred = mod.predict(Xvalid)

In [723]:
MSE(pred, yvalid)

1.09688

In [724]:
calc_acc(pred, yvalid)

0.63008

### Filtering Punctuation

In [725]:
punctuation = set(string.punctuation)

In [726]:
# Tally tokens after lowercasing and deleting every character in `punctuation`.
wordCount = defaultdict(int)
for review in dataset:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        wordCount[token] += 1

In [727]:
len(wordCount)

76461

In [728]:
# Rank tokens by descending frequency as (frequency, token) pairs.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [729]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [730]:
def wordbag(d):
    """Return the bag-of-words count vector for one review, ignoring punctuation.

    The text is lowercased and every character found in the global
    ``punctuation`` set is deleted before splitting on whitespace.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if c not in punctuation])
    for w in r.split():
        # Dict membership is a hash lookup; the list scan it replaces was
        # linear in the vocabulary size for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [731]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [732]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [733]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [734]:
pred = mod.predict(Xvalid)

In [735]:
MSE(pred, yvalid)

0.99656

In [736]:
calc_acc(pred, yvalid)

0.6372

### Separating Punctuation (treating punctuation marks as separate words)

In [737]:
punctuation = set(string.punctuation)

In [738]:
def handlePunc(d):
    """Lowercase a review's text and pad each punctuation mark with spaces.

    Every character of ``d['text'].lower()`` that is in the global
    ``punctuation`` set is replaced by itself surrounded by one space on each
    side, so a later ``split()`` yields it as its own token; all other
    characters are copied unchanged.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        The transformed string.
    """
    pieces = [" " + ch + " " if ch in punctuation else ch for ch in d['text'].lower()]
    return ''.join(pieces)

In [739]:
# Tally tokens after handlePunc has isolated each punctuation mark, so the
# marks themselves are counted as tokens.
wordCount = defaultdict(int)
for review in dataset:
    for token in handlePunc(review).split():
        wordCount[token] += 1

In [740]:
len(wordCount)

56337

In [741]:
# Most frequent tokens first, stored as (frequency, token).
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [742]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [743]:
def wordbag(d):
    """Return the bag-of-words count vector with punctuation marks as tokens.

    The review is tokenised by splitting the output of ``handlePunc(d)`` on
    whitespace.

    Args:
        d: review record, passed through to ``handlePunc``.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in handlePunc(d).split():
        # Look the token up in the dict (O(1)) instead of scanning the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [744]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [745]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [746]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [747]:
pred = mod.predict(Xvalid)

In [748]:
MSE(pred, yvalid)

0.96056

In [749]:
calc_acc(pred, yvalid)

0.64032

### Filtering Stopwords

In [750]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [751]:
# Tally lowercase whitespace tokens, leaving out anything in `stop_words`.
wordCount = defaultdict(int)
for review in dataset:
    for token in review['text'].lower().split():
        if token not in stop_words:
            wordCount[token] += 1

In [752]:
len(wordCount)

157611

In [753]:
# (frequency, token) pairs in descending order of frequency.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [754]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [755]:
def wordbag(d):
    """Return the bag-of-words count vector for one review.

    Tokens are the lowercase, whitespace-delimited pieces of ``d['text']``.
    No stop-word check is made here: a token is counted exactly when it is one
    of the entries of the global ``top_unigrams``.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of ``top_unigrams`` and in the same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        # Hash lookup on the dict rather than a linear scan of the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [756]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [757]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [758]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [759]:
pred = mod.predict(Xvalid)

In [760]:
MSE(pred, yvalid)

1.19192

In [761]:
calc_acc(pred, yvalid)

0.61736

### Stemming

In [762]:
stemmer = PorterStemmer()

In [763]:
# Tally the stemmed form of every lowercase whitespace token.
wordCount = defaultdict(int)
for review in dataset:
    for token in review['text'].lower().split():
        wordCount[stemmer.stem(token)] += 1

In [764]:
len(wordCount)

142783

In [765]:
# Stems ranked from most to least frequent as (frequency, stem) pairs.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [766]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [767]:
def wordbag(d):
    """Return the bag-of-words count vector over stemmed tokens.

    Each lowercase, whitespace-delimited token of ``d['text']`` is passed
    through the global ``stemmer`` before being counted.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        # Dict membership avoids scanning the whole vocabulary list per token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [768]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [769]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [770]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [771]:
pred = mod.predict(Xvalid)

In [772]:
MSE(pred, yvalid)

1.11544

In [773]:
calc_acc(pred, yvalid)

0.62656

### Punctuation and Stopwords

In [774]:
# Tally tokens after deleting punctuation characters and skipping stop words.
wordCount = defaultdict(int)
for review in dataset:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        if token not in stop_words:
            wordCount[token] += 1

In [775]:
len(wordCount)

76320

In [776]:
# Descending frequency ranking of the surviving tokens.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [777]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [778]:
def wordbag(d):
    """Return the bag-of-words count vector without punctuation or stop words.

    The text is lowercased, characters in the global ``punctuation`` set are
    deleted, and tokens found in the global ``stop_words`` set are skipped.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if c not in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        # Hash lookup on the dict instead of a linear scan of the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [779]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [780]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [781]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [782]:
pred = mod.predict(Xvalid)

In [783]:
MSE(pred, yvalid)

1.04576

In [784]:
calc_acc(pred, yvalid)

0.63272

### Punctuation (separate) and Stopwords

In [785]:
# Tally tokens (punctuation marks included, via handlePunc), skipping stop words.
wordCount = defaultdict(int)
for review in dataset:
    for token in handlePunc(review).split():
        if token not in stop_words:
            wordCount[token] += 1

In [786]:
len(wordCount)

56186

In [787]:
# Frequency-ranked (frequency, token) pairs, highest count first.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [788]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [789]:
def wordbag(d):
    """Return the count vector with punctuation as tokens and stop words skipped.

    Tokens come from splitting ``handlePunc(d)`` on whitespace; any token in
    the global ``stop_words`` set is ignored.

    Args:
        d: review record, passed through to ``handlePunc``.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in handlePunc(d).split():
        if w in stop_words:
            continue
        # Dict membership is O(1); the list it replaces was scanned per token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [790]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [791]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [792]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [793]:
pred = mod.predict(Xvalid)

In [794]:
MSE(pred, yvalid)

1.032

In [795]:
calc_acc(pred, yvalid)

0.632

### Punctuation and Stemming

In [796]:
# Tally stems of the tokens left after deleting punctuation characters.
wordCount = defaultdict(int)
for review in dataset:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        wordCount[stemmer.stem(token)] += 1

In [797]:
len(wordCount)

59030

In [798]:
# Stems with their frequencies, sorted so the most common come first.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [799]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [800]:
def wordbag(d):
    """Return the count vector over stemmed tokens with punctuation deleted.

    The text is lowercased, characters in the global ``punctuation`` set are
    deleted, and each whitespace token is stemmed with the global ``stemmer``.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if c not in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        # Hash lookup on the dict rather than a linear scan of the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [801]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [802]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [803]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [804]:
pred = mod.predict(Xvalid)

In [805]:
MSE(pred, yvalid)

0.98864

In [806]:
calc_acc(pred, yvalid)

0.64168

### Punctuation (separate) and Stemming

In [807]:
# Tally stems of the tokens produced by handlePunc (punctuation marks included).
wordCount = defaultdict(int)
for review in dataset:
    for token in handlePunc(review).split():
        wordCount[stemmer.stem(token)] += 1

In [808]:
len(wordCount)

40053

In [809]:
# Highest-frequency stems first, as (frequency, stem).
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [810]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [811]:
def wordbag(d):
    """Return the count vector over stemmed tokens, punctuation kept as tokens.

    Tokens come from splitting ``handlePunc(d)`` on whitespace and are stemmed
    with the global ``stemmer`` before counting.

    Args:
        d: review record, passed through to ``handlePunc``.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in handlePunc(d).split():
        w = stemmer.stem(w)
        # Dict membership avoids scanning the vocabulary list per token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [812]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [813]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [814]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [815]:
pred = mod.predict(Xvalid)

In [816]:
MSE(pred, yvalid)

0.97168

In [817]:
calc_acc(pred, yvalid)

0.63872

### Stopwords and Stemming

In [818]:
# Tally stems of lowercase whitespace tokens; stop words are dropped before stemming.
wordCount = defaultdict(int)
for review in dataset:
    for token in review['text'].lower().split():
        if token not in stop_words:
            wordCount[stemmer.stem(token)] += 1

In [819]:
len(wordCount)

142692

In [820]:
# Sort (frequency, stem) pairs so the most frequent stems lead the list.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [821]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [822]:
def wordbag(d):
    """Return the count vector over stemmed tokens with stop words skipped.

    Lowercase whitespace tokens found in the global ``stop_words`` set are
    skipped; the rest are stemmed with the global ``stemmer`` and counted.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        # Hash lookup on the dict instead of a linear scan of the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [823]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [824]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [825]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [826]:
pred = mod.predict(Xvalid)

In [827]:
MSE(pred, yvalid)

1.2224

In [828]:
calc_acc(pred, yvalid)

0.61712

### Punctuation, Stopwords, and Stemming

In [829]:
stemmer = PorterStemmer()

In [830]:
# Tally stems after deleting punctuation characters and dropping stop words.
wordCount = defaultdict(int)
for review in dataset:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        if token not in stop_words:
            wordCount[stemmer.stem(token)] += 1

In [831]:
len(wordCount)

58968

In [832]:
# Descending-frequency list of (frequency, stem) pairs.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [833]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [834]:
def wordbag(d):
    """Return the count vector with punctuation deleted, stop words skipped, tokens stemmed.

    The text is lowercased and stripped of characters in the global
    ``punctuation`` set; tokens in the global ``stop_words`` set are skipped
    and the remainder are stemmed with the global ``stemmer``.

    Args:
        d: review record; only ``d['text']`` is read.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if c not in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        # Dict membership is O(1); the list it replaces was scanned per token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [835]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [836]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [837]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [838]:
pred = mod.predict(Xvalid)

In [839]:
MSE(pred, yvalid)

1.07232

In [840]:
calc_acc(pred, yvalid)

0.63064

### Punctuation (separate), Stopwords, and Stemming

In [841]:
stemmer = PorterStemmer()

In [842]:
# Tally stems of handlePunc tokens (punctuation marks included), dropping stop words.
wordCount = defaultdict(int)
for review in dataset:
    for token in handlePunc(review).split():
        if token not in stop_words:
            wordCount[stemmer.stem(token)] += 1

In [843]:
len(wordCount)

39978

In [844]:
# Rank the stems: (frequency, stem) pairs from most to least common.
unigram_counts = sorted(((freq, token) for token, freq in wordCount.items()), reverse=True)

In [845]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [846]:
def wordbag(d):
    """Return the count vector with punctuation as tokens, stop words skipped, tokens stemmed.

    Tokens come from splitting ``handlePunc(d)`` on whitespace; those in the
    global ``stop_words`` set are skipped and the rest are stemmed with the
    global ``stemmer`` before counting.

    Args:
        d: review record, passed through to ``handlePunc``.

    Returns:
        List of ints, one per entry of the global ``top_unigrams`` and in the
        same order.
    """
    count = dict.fromkeys(top_unigrams, 0)
    for w in handlePunc(d).split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        # Hash lookup on the dict instead of a linear scan of the list.
        if w in count:
            count[w] += 1
    return list(count.values())

In [847]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [848]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [849]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [850]:
pred = mod.predict(Xvalid)

In [851]:
MSE(pred, yvalid)

1.04824

In [852]:
calc_acc(pred, yvalid)

0.63144