In [1]:
import csv
import gzip
import json
import math
import random
import string
from collections import defaultdict
from urllib.request import urlopen

import numpy
import scipy.optimize

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn import svm
from sklearn import linear_model
from sklearn import metrics

In [2]:
# Location of the raw review dump; the handle is left open so that the
# loading loop in the next cell can stream it line by line.
path = "datasets/review.json"
f = open(path, mode='r', encoding='utf8')

In [3]:
# Load at most the first 200,000 records from the open handle `f`.
# Presumably the file is JSON Lines (one object per line), judging by the
# file name and the sample record shown below -- TODO confirm.
dataset = []
with f:  # closes the handle when loading finishes (it was never closed before)
    for i, line in enumerate(f):
        if i >= 200000: break
        # json.loads replaces eval: eval executes whatever expression the
        # data file contains and cannot parse JSON literals such as
        # true/false/null.
        dataset.append(json.loads(line))

In [4]:
# Fix the RNG seed so the shuffle -- and with it the membership of the
# train/valid/test splits and every accuracy computed from them -- is the
# same on each Restart & Run All.
random.seed(0)
random.shuffle(dataset)
# 100k training, 50k validation, remainder (up to 50k) held out for testing.
train = dataset[:100000]
valid = dataset[100000:150000]
test = dataset[150000:]

In [5]:
# Peek at the first (shuffled) record to see which fields a review carries.
dataset[0]

{'review_id': 'LgJ0__SojXxzkr33eKQdhQ',
 'user_id': 'awN3wYcHrAtaqFahZhMC6g',
 'business_id': 'Q62lGVKqoKJ09lZNt2qpfw',
 'stars': 5.0,
 'useful': 1,
 'funny': 0,
 'cool': 0,
 'text': 'Chilli chicken and veggie spring rolls  yummmm been coming here for years and it is always amazing ! Fast efficient service and tasty food ..what more could anyone ask for',
 'date': '2016-10-31 01:37:24'}

### Baseline

In [6]:
# Count every lower-cased, whitespace-separated token in the training reviews.
wordCount = defaultdict(int)
for review in train:
    tokens = review['text'].lower().split()
    for token in tokens:
        wordCount[token] += 1

In [7]:
# Number of distinct raw tokens seen in the training reviews.
len(wordCount)

242996

In [8]:
# (count, word) pairs, most frequent first; equal counts are ordered by
# word in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [9]:
    top_unigrams = [x[1] for x in unigram_counts[:1000]]

In [10]:
def wordbag(d):
    """Return the bag-of-words count vector for one review.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased
        and split on whitespace.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
        Tokens outside the vocabulary are ignored.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        # Membership is tested on the dict (constant time) instead of
        # scanning the 1000-entry top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [11]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [12]:
# Fit a logistic-regression classifier (C=1.0) that predicts the star rating
# from the word counts; the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [14]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [15]:
def calc_acc(pred,y):
    """Return the fraction of predictions that exactly equal the labels.

    Parameters
    ----------
    pred : sequence
        Predicted labels; must hold at least ``len(y)`` entries.
    y : sequence
        Ground-truth labels.

    Returns
    -------
    float
        Count of positions ``i`` with ``pred[i] == y[i]`` divided by
        ``len(y)``, or ``0.0`` when ``y`` is empty.

    Raises
    ------
    IndexError
        If ``pred`` is shorter than ``y``.
    """
    n = len(y)
    if n == 0:
        # An empty evaluation set used to raise ZeroDivisionError.
        return 0.0
    correct = sum(1 for i in range(n) if pred[i] == y[i])
    return correct / n

In [16]:
# Validation accuracy of the baseline (raw-token) model.
calc_acc(pred, yvalid)

0.63512

### Filtering Punctuation

In [17]:
# Set of ASCII punctuation characters, for constant-time membership tests.
punctuation = {ch for ch in string.punctuation}

In [18]:
# Token counts over the training reviews after lower-casing the text and
# deleting every punctuation character.
wordCount = defaultdict(int)
for review in train:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        wordCount[token] += 1

In [19]:
# Number of distinct tokens once punctuation has been removed.
len(wordCount)

114249

In [20]:
# (count, word) pairs, most frequent first; equal counts are ordered by
# word in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [21]:
# Vocabulary: the 1000 most frequent punctuation-free tokens.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [22]:
def wordbag(d):
    """Return the bag-of-words count vector for one review, ignoring punctuation.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased,
        every character found in the global ``punctuation`` is deleted, and
        the remainder is split on whitespace.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join(c for c in d['text'].lower() if c not in punctuation)
    for w in r.split():
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [23]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [24]:
# Refit the logistic-regression classifier (C=1.0) on the punctuation-free
# features; the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [26]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [27]:
# Validation accuracy with punctuation removed.
calc_acc(pred, yvalid)

0.64604

### Filtering Stopwords

In [28]:
# Make sure NLTK's stop-word corpus is available locally, then keep the
# English stop words in a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Token counts over the training reviews, leaving out stop words.
wordCount = defaultdict(int)
for review in train:
    for token in review['text'].lower().split():
        if token not in stop_words:
            wordCount[token] += 1

In [30]:
# Number of distinct tokens once stop words have been dropped.
len(wordCount)

242829

In [31]:
# (count, word) pairs, most frequent first; equal counts are ordered by
# word in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [32]:
# Vocabulary: the 1000 most frequent non-stop-word tokens.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [33]:
def wordbag(d):
    """Return the bag-of-words count vector for one review.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased
        and split on whitespace.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
        This function does not consult ``stop_words`` itself; a token is
        counted only if it appears in ``top_unigrams``.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [34]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [35]:
# Refit the logistic-regression classifier (C=1.0) on the stop-word-free
# features; the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [37]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [38]:
# Validation accuracy with stop words excluded from the vocabulary.
calc_acc(pred, yvalid)

0.62546

### Applying Stemming

In [39]:
# Porter stemmer instance used to reduce each token to its stem.
stemmer = PorterStemmer()

In [40]:
# Counts of stemmed tokens over the training reviews.
wordCount = defaultdict(int)
for review in train:
    for token in review['text'].lower().split():
        wordCount[stemmer.stem(token)] += 1

In [41]:
# Number of distinct stems seen in the training reviews.
len(wordCount)

222340

In [42]:
# (count, stem) pairs, most frequent first; equal counts are ordered by
# stem in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [43]:
# Vocabulary: the 1000 most frequent stems.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [44]:
def wordbag(d):
    """Return the bag-of-stems count vector for one review.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased
        and split on whitespace, and each token is passed through the
        global ``stemmer`` before lookup.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [45]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [46]:
# Refit the logistic-regression classifier (C=1.0) on the stemmed features;
# the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [48]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [49]:
# Validation accuracy with stemmed tokens.
calc_acc(pred, yvalid)

0.63564

### Punctuation and Stopwords

In [50]:
# Token counts over the training reviews with punctuation deleted and stop
# words left out.
# NOTE(review): punctuation is stripped before the stop-word test, so any
# stop-word entry that itself contains an apostrophe (e.g. "don't", if the
# NLTK list has such entries -- confirm) can no longer match here.
wordCount = defaultdict(int)
for review in train:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        if token not in stop_words:
            wordCount[token] += 1

In [51]:
# Number of distinct tokens after removing punctuation and stop words.
len(wordCount)

114106

In [52]:
# (count, word) pairs, most frequent first; equal counts are ordered by
# word in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [53]:
# Vocabulary: the 1000 most frequent punctuation-free, non-stop-word tokens.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [54]:
def wordbag(d):
    """Return the bag-of-words count vector for one review.

    Punctuation is removed and stop words are skipped before counting.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased,
        every character found in the global ``punctuation`` is deleted, and
        the remainder is split on whitespace. Tokens found in the global
        ``stop_words`` are skipped.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join(c for c in d['text'].lower() if c not in punctuation)
    for w in r.split():
        if w in stop_words:
            continue
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [55]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [56]:
# Refit the logistic-regression classifier (C=1.0) on the current features;
# the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [58]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [59]:
# Validation accuracy with punctuation removed and stop words skipped.
calc_acc(pred, yvalid)

0.64048

### Punctuation and Stemming

In [None]:
# Counts of stemmed tokens over the training reviews, with punctuation
# deleted before tokenising.
wordCount = defaultdict(int)
for review in train:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        wordCount[stemmer.stem(token)] += 1

In [None]:
# Number of distinct stems after punctuation removal.
len(wordCount)

In [None]:
# (count, stem) pairs, most frequent first; equal counts are ordered by
# stem in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [None]:
# Vocabulary: the 1000 most frequent punctuation-free stems.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [None]:
def wordbag(d):
    """Return the bag-of-stems count vector for one review, ignoring punctuation.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased,
        every character found in the global ``punctuation`` is deleted, the
        remainder is split on whitespace, and each token is passed through
        the global ``stemmer`` before lookup.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join(c for c in d['text'].lower() if c not in punctuation)
    for w in r.split():
        w = stemmer.stem(w)
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [None]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [None]:
# Refit the logistic-regression classifier (C=1.0) on the current features;
# the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

In [None]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [None]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [None]:
# Validation accuracy with punctuation removed and tokens stemmed.
calc_acc(pred, yvalid)

### Stopwords and Stemming

In [None]:
# Counts of stemmed tokens over the training reviews; stop words are
# discarded before stemming.
wordCount = defaultdict(int)
for review in train:
    for token in review['text'].lower().split():
        if token not in stop_words:
            wordCount[stemmer.stem(token)] += 1

In [None]:
# Number of distinct stems after dropping stop words.
len(wordCount)

In [None]:
# (count, stem) pairs, most frequent first; equal counts are ordered by
# stem in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [None]:
# Vocabulary: the 1000 most frequent stems of non-stop-word tokens.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [None]:
def wordbag(d):
    """Return the bag-of-stems count vector for one review, skipping stop words.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased
        and split on whitespace. Tokens found in the global ``stop_words``
        are skipped; the rest are passed through the global ``stemmer``
        before lookup.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        # The stop-word test runs on the raw token, before stemming.
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [None]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [None]:
# Refit the logistic-regression classifier (C=1.0) on the current features;
# the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

In [None]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [None]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [None]:
# Validation accuracy with stop words skipped and tokens stemmed.
calc_acc(pred, yvalid)

### Punctuation, Stopwords, and Stemming

In [None]:
# Porter stemmer used to reduce each token to its stem (rebinds the
# `stemmer` name already created in the stemming section above).
stemmer = PorterStemmer()

In [None]:
# Counts of stemmed tokens over the training reviews: punctuation is deleted
# first, stop words are discarded next, and the survivors are stemmed.
wordCount = defaultdict(int)
for review in train:
    stripped = ''.join(ch for ch in review['text'].lower() if ch not in punctuation)
    for token in stripped.split():
        if token not in stop_words:
            wordCount[stemmer.stem(token)] += 1

In [None]:
# Number of distinct stems after removing punctuation and stop words.
len(wordCount)

In [None]:
# (count, stem) pairs, most frequent first; equal counts are ordered by
# stem in descending order because whole tuples are compared.
unigram_counts = sorted(
    ((count, word) for word, count in wordCount.items()),
    reverse=True,
)

In [None]:
# Vocabulary: the 1000 most frequent stems after all three filters.
top_unigrams = [word for _, word in unigram_counts[:1000]]

In [None]:
def wordbag(d):
    """Return the bag-of-stems count vector for one review.

    Punctuation is removed, stop words are skipped, and the remaining
    tokens are stemmed before counting.

    Parameters
    ----------
    d : dict
        Review record; only ``d['text']`` is read. The text is lower-cased,
        every character found in the global ``punctuation`` is deleted, and
        the remainder is split on whitespace. Tokens found in the global
        ``stop_words`` are skipped; the rest are passed through the global
        ``stemmer`` before lookup.

    Returns
    -------
    list of int
        One count per entry of the global ``top_unigrams``, in that order.
    """
    # dicts keep insertion order, so the values stay aligned with top_unigrams.
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join(c for c in d['text'].lower() if c not in punctuation)
    for w in r.split():
        # The stop-word test runs on the raw token, before stemming.
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        # Dict membership (constant time) replaces a scan of the
        # top_unigrams list for every token.
        if w in count:
            count[w] += 1
    return list(count.values())

In [None]:
# Feature vectors and star-rating labels for the training split.
Xtrain = list(map(wordbag, train))
ytrain = [review['stars'] for review in train]

In [None]:
# Refit the logistic-regression classifier (C=1.0) on the current features;
# the trailing expression displays the fitted estimator.
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)

In [None]:
# Featurise the validation split with the same wordbag and collect its labels.
Xvalid = list(map(wordbag, valid))
yvalid = [review['stars'] for review in valid]

In [None]:
# Predicted star rating for every validation review.
pred = mod.predict(Xvalid)

In [None]:
# Validation accuracy with punctuation removal, stop-word skipping and stemming combined.
calc_acc(pred, yvalid)