In [1]:
import gzip
import csv
import numpy
import math
from urllib.request import urlopen
import scipy.optimize
import random
from collections import defaultdict
import string

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn import svm
from sklearn import linear_model
from sklearn import metrics

In [2]:
path = "datasets/review.json"
f = open(path, 'r', encoding='utf8')

In [3]:
dataset = []
i = 0
for line in f:
    if i >= 50000: break
    dataset.append(eval(line))
    i+=1

In [4]:
random.shuffle(dataset)
train = dataset[:25000]
valid = dataset[25000:37500]
test = dataset[37500:]

In [5]:
dataset[0]

{'review_id': 'TNLYR1-4DOpEeZyyZUsAWg',
 'user_id': 'lPrSEcTDdL6t96KQFZJm0Q',
 'business_id': 'Nzr_bZqTZtP2XRrBNfj5Xg',
 'stars': 5.0,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': 'This was our first Escape Room experience. The staff were the best. My kids are a little claustrophobic, so at times they didn\'t want to play the game with us. Nicky & Vanessa kept my kids company and gave them games to play, while the rest of us completed the activity.\nWe took longer than the hour, but they were still very patient with us and helped us out whenever we got stuck. If anything, Nicky & Vanessa made our experience a superb one. My only critique is about the "cryptex" because the letters would always move. I think a better cryptex would be needed so that it would be easier to move on. Other than, we thoroughly enjoyed the experience and would definitely do it again.\nThanks Nicky, Vanessa and Captive Kids for making our first experience a memorable one.',
 'date': '2018-03-24 22:27:18'}

### Baseline

In [6]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        wordCount[w] += 1

In [7]:
len(wordCount)

157773

In [8]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [9]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [10]:
top_unigrams[:5]

['the', 'and', 'i', 'a', 'to']

In [11]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [12]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [13]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [15]:
pred = mod.predict(Xvalid)

In [16]:
def calc_acc(pred,y):
    total = 0
    for i in range(0,len(y)):
        if pred[i] == y[i]:
            total += 1
    return total / len(y)

In [17]:
calc_acc(pred, yvalid)

0.62896

### Filtering Punctuation

In [18]:
punctuation = set(string.punctuation)

In [19]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [20]:
len(wordCount)

76461

In [21]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [22]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [23]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [24]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [25]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [27]:
pred = mod.predict(Xvalid)

In [28]:
calc_acc(pred, yvalid)

0.64096

### Filtering Punctuation (treating punctuation marks as words)

In [92]:
punctuation = set(string.punctuation)

In [95]:
def handlePunc(d):
    r = ''
    for c in d['text'].lower():
        if not c in punctuation:
            r += c
        else:
            r += " " + c + " "
    return r

In [96]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        wordCount[w] += 1

In [97]:
len(wordCount)

56337

In [98]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [99]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [100]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [101]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [102]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [103]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [104]:
pred = mod.predict(Xvalid)

In [105]:
calc_acc(pred, yvalid)

0.6452

### Filtering Stopwords

In [29]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [31]:
len(wordCount)

157611

In [32]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [33]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [34]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [35]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [36]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [38]:
pred = mod.predict(Xvalid)

In [39]:
calc_acc(pred, yvalid)

0.616

### Filtering Stemming

In [40]:
stemmer = PorterStemmer()

In [41]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [42]:
len(wordCount)

142783

In [43]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [44]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [45]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [46]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [47]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [49]:
pred = mod.predict(Xvalid)

In [50]:
calc_acc(pred, yvalid)

0.62528

### Punctuation and Stopwords

In [51]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [52]:
len(wordCount)

76320

In [53]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [54]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [55]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [56]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [57]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [58]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [59]:
pred = mod.predict(Xvalid)

In [60]:
calc_acc(pred, yvalid)

0.63288

### Punctuation and Stemming

In [61]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [62]:
len(wordCount)

59030

In [63]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [64]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [65]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [66]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [67]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [69]:
pred = mod.predict(Xvalid)

In [70]:
calc_acc(pred, yvalid)

0.63976

### Stopwords and Stemming

In [71]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [72]:
len(wordCount)

142692

In [73]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [74]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [75]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [76]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [77]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [79]:
pred = mod.predict(Xvalid)

In [80]:
calc_acc(pred, yvalid)

0.61472

### Punctuation, Stopwords, and Stemming

In [81]:
stemmer = PorterStemmer()

In [82]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [83]:
len(wordCount)

58968

In [84]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [85]:
top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [86]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [87]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]

In [88]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [89]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]

In [90]:
pred = mod.predict(Xvalid)

In [91]:
calc_acc(pred, yvalid)

0.63008