In [1]:
import gzip
import csv
import numpy
import math
from urllib.request import urlopen
import scipy.optimize
import random
from collections import defaultdict
import string

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn.naive_bayes import MultinomialNB

In [2]:
path = "datasets/review.json"
f = open(path, 'r', encoding='utf8')

In [3]:
dataset = []
i = 0
for line in f:
    if i >= 50000: break
    dataset.append(eval(line))
    i+=1

In [4]:
random.shuffle(dataset)
train = dataset[:25000]
valid = dataset[25000:37500]
test = dataset[37500:]

In [5]:
dataset[0]

{'review_id': 'AUUf7lpLgVbfP-GpaMnusA',
 'user_id': 'ylNoap5vLFZCPIjwBIdnYA',
 'business_id': 'KfVGvpJLvR2ID2dZIlnpvw',
 'stars': 4.0,
 'useful': 1,
 'funny': 0,
 'cool': 0,
 'text': "If you come in here and buy in bulk, such ask candies, they have a really good selections of brand names for cheap, other than that, don't come here, because it can get expensive",
 'date': '2014-01-18 05:18:01'}

In [6]:
def MSE(pred, y):
    return numpy.square(numpy.subtract(y, pred)).mean() 

In [7]:
def calc_acc(pred,y):
    total = 0
    for i in range(0,len(y)):
        if pred[i] == y[i]:
            total += 1
    return total / len(y)

In [8]:
numTop = 1000

### Baseline

In [9]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        wordCount[w] += 1

In [10]:
len(wordCount)

157773

In [11]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [12]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [13]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [14]:
Xtrain = [wordbag(d) for d in train]
ytrain = [d['stars'] for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [15]:
Xvalid = [wordbag(d) for d in valid]
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)

In [16]:
MSE(pred, yvalid)

1.38576

In [17]:
calc_acc(pred, yvalid)

0.58544

### Filtering Punctuation

In [18]:
punctuation = set(string.punctuation)

In [19]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [20]:
len(wordCount)

76461

In [21]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [22]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [23]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [24]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [25]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [26]:
MSE(pred, yvalid)

1.26368

In [27]:
calc_acc(pred, yvalid)

0.59288

### Filtering Punctuation (treating punctuation marks as separate words)

In [28]:
punctuation = set(string.punctuation)

In [29]:
def handlePunc(d):
    r = ''
    for c in d['text'].lower():
        if not c in punctuation:
            r += c
        else:
            r += " " + c + " "
    return r

In [30]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        wordCount[w] += 1

In [31]:
len(wordCount)

56337

In [32]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [33]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [34]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [35]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [36]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [37]:
MSE(pred, yvalid)

1.21568

In [38]:
calc_acc(pred, yvalid)

0.59656

### Filtering Stopwords

In [39]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [41]:
len(wordCount)

157611

In [42]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [43]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [44]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [45]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [46]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [47]:
MSE(pred, yvalid)

1.34952

In [48]:
calc_acc(pred, yvalid)

0.58688

### Filtering Stemming

In [49]:
stemmer = PorterStemmer()

In [50]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [51]:
len(wordCount)

142783

In [52]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [53]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [54]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [55]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [56]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [57]:
MSE(pred, yvalid)

1.40016

In [58]:
calc_acc(pred, yvalid)

0.5812

### Punctuation and Stopwords

In [59]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [60]:
len(wordCount)

76320

In [61]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [62]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [63]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [64]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [65]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [66]:
MSE(pred, yvalid)

1.21232

In [67]:
calc_acc(pred, yvalid)

0.596

### Punctuation (separate) and Stopwords

In [68]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

In [69]:
len(wordCount)

56186

In [70]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [71]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [72]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [73]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [74]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [75]:
MSE(pred, yvalid)

1.16176

In [76]:
calc_acc(pred, yvalid)

0.60256

### Punctuation and Stemming

In [77]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [78]:
len(wordCount)

59030

In [79]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [80]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [81]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [82]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [83]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [84]:
MSE(pred, yvalid)

1.28664

In [85]:
calc_acc(pred, yvalid)

0.5856

### Punctuation (separate) and Stemming

In [86]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [87]:
len(wordCount)

40053

In [88]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [89]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [90]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [91]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [None]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [None]:
MSE(pred, yvalid)

In [None]:
calc_acc(pred, yvalid)

### Stopwords and Stemming

In [None]:
wordCount = defaultdict(int)
for d in dataset:
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [None]:
len(wordCount)

In [None]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [None]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [None]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    for w in d['text'].lower().split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [None]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [None]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [None]:
MSE(pred, yvalid)

In [None]:
calc_acc(pred, yvalid)

### Punctuation, Stopwords, and Stemming

In [None]:
stemmer = PorterStemmer()

In [None]:
wordCount = defaultdict(int)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [None]:
len(wordCount)

In [None]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [None]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [None]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [None]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [None]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [None]:
MSE(pred, yvalid)

In [None]:
calc_acc(pred, yvalid)

### Punctuation (separate), Stopwords, and Stemming

In [None]:
stemmer = PorterStemmer()

In [None]:
wordCount = defaultdict(int)
for d in dataset:
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        wordCount[w] += 1

In [None]:
len(wordCount)

In [None]:
unigram_counts = [(wordCount[w], w) for w in wordCount]
unigram_counts.sort()
unigram_counts.reverse()

In [None]:
top_unigrams = [x[1] for x in unigram_counts[:numTop]]

In [None]:
def wordbag(d):
    count = dict.fromkeys(top_unigrams, 0)
    r = handlePunc(d)
    for w in r.split():
        if w in stop_words:
            continue
        w = stemmer.stem(w)
        if w in top_unigrams:
            count[w] += 1
    res = [count[w] for w in count]
    return res

In [None]:
Xtrain = [wordbag(d) for d in train]
mod = MultinomialNB().fit(Xtrain, ytrain)

In [None]:
Xvalid = [wordbag(d) for d in valid]
pred = mod.predict(Xvalid)

In [None]:
MSE(pred, yvalid)

In [None]:
calc_acc(pred, yvalid)