In [1]:
import gzip
import csv
import numpy
import math
from urllib.request import urlopen
import scipy.optimize
import random
from collections import defaultdict
import string

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn import svm
from sklearn import linear_model
from sklearn import metrics

In [2]:
path = "../datasets/review.json"
f = open(path, 'r', encoding='utf8')

In [70]:
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
punctuation.add('\n')
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
numpy.warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/coraxyc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
dataset = []
for i in range(50000):
    dataset.append(eval(f.readline()))

In [4]:
random.shuffle(dataset)
train = dataset[:25000]
valid = dataset[25000:37500]
test = dataset[37500:]

In [None]:
## contained punctuation, unstemmed words, top 1000 words from train

In [74]:
stemmed = True
stopped = True
Xtrain = getX(train, 'unigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.39264

In [None]:
## contained punctuation, unstemmed words, top 1000 words from dataset

In [65]:
stemmed = True
Xtrain = getX(train, 'unigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.61608

In [None]:
## separated punctuation, unstemmed words, top 1000 words from train

In [75]:
Xtrain = getX(train, 'unigrams', 'keep_punct_separate', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_separate', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.41504

In [None]:
## separated punctuation, unstemmed words, top 1000 words from dataset

In [68]:
Xtrain = getX(train, 'unigrams', 'keep_punct_separate', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_separate', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.63992

In [None]:
## removed punctuation, unstemmed words, unstopped, top 1000 words from dataset

In [78]:
Xtrain = getX(train, 'unigrams', 'remove_punct', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'remove_punct', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.63576

In [None]:
## separated punctuation, unstemmed words, stopped, top 1000 words from dataset

In [77]:
Xtrain = getX(train, 'unigrams', 'keep_punct_separate', not stemmed, stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_separate', not stemmed, stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.63464

In [None]:
## separated punctuation, stemmed words, top 1000 words from dataset

In [71]:
Xtrain = getX(train, 'unigrams', 'keep_punct_separate', stemmed, stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'keep_punct_separate', stemmed, stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.63688

In [None]:
## removed punctuation, stemmed words, top 1000 words from dataset

In [72]:
Xtrain = getX(train, 'unigrams', 'remove_punct', stemmed, stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'unigrams', 'remove_punct', stemmed, stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.63224

In [79]:
Xtrain = getX(train, 'bigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'bigrams', 'keep_punct_contained', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.56656

In [80]:
Xtrain = getX(train, 'bigrams', 'keep_punct_separated', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'bigrams', 'keep_punct_separated', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.44784

In [81]:
Xtrain = getX(train, 'bigrams', 'remove_punct', not stemmed, not stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'bigrams', 'remove_punct', not stemmed, not stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

0.58392

In [None]:
Xtrain = getX(train, 'bigrams', 'keep_punct_contained', not stemmed, stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'bigrams', 'keep_punct_contained', not stemmed, stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

In [None]:
Xtrain = getX(train, 'bigrams', 'remove_punct', stemmed, stopped, 'word_count')
ytrain = [d['stars'] for d in train]
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(Xtrain,ytrain)
Xvalid = getX(valid, 'bigrams', 'remove_punct', stemmed, stopped, 'word_count')
yvalid = [d['stars'] for d in valid]
pred = mod.predict(Xvalid)
get_accuracy(pred, yvalid)

# Baseline

# Tf-idf

In [53]:
def get_accuracy(pred, y):
    correct_pred = 0
    for i in range(len(pred)):
        if pred[i] == y[i]:
            correct_pred += 1
    return correct_pred / len(pred)

In [63]:
def getX(reviews, i, j, stemmed, stopped, k):
    if k == 'word_count':
        return getX_word_count(reviews, i, j, stemmed, stopped, k)
    elif k == 'tf_idf':
        return getX_tf_idf(reviews, i, j, stemmed, stopped, k)

In [76]:
def getX_word_count(reviews, i, j, stemmed, stopped, k):
    wordCounts = get_wordcounts(dataset, i, j, stemmed, stopped, k)
    words = get_n_most_common_words(wordCounts, 1000)
    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    X = []
    for review in reviews:
        feat = [0] * len(words)
        r_set = get_review_words(review, j)
        prev = ''
        for w in r_set:
            if stemmed:
                w = stemmer.stem(w)
            if stopped and w in stop_words:
                continue
            if i == 'unigrams' and w in wordSet:
                feat[wordId[w]] += 1
            elif i == 'bigrams' and prev != '' and (prev + ' ' + w) in wordSet:
                feat[wordId[prev + ' ' + w]] += 1
            prev = w
        
        feat.append(1) #offset
        X.append(feat)
    return X

In [56]:
def getX_tf_idf(reviews, i, j, stemmed, stopped, k):
    wordCounts = get_wordcounts(reviews, i, j, stemmed, stopped, k)
    words = get_n_most_common_words(wordCounts, 1000)
    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    tf_scores = [0] * len(words)
    df_scores = get_df_scores(reviews, words, wordId, j)
    tf_idf_scores = [0] * len(words)
    
    idf_scores = []
    for score in df_scores:
        if score == 0:
            idf_scores.append(0)
        else:
            idf_scores.append(math.log10(len(words) / score))
    
    X = []
    for review in reviews:
        r_set = get_review_words(review, j)
        for w in r_set:
            if stemmed:
                w = stemmer.stem(w)
            if stopped and w in stop_words:
                continue
            if w in wordSet:
                tf_scores[wordId[w]] += 1

        feat = [i * j for (i,j) in zip(tf_scores,idf_scores)]
        X.append(feat)
    return X

In [57]:
def get_wordcounts(reviews, i, j, stemmed, stopped, k):
    wordCounts = defaultdict(int)

    for review in reviews:
        r_set = get_review_words(review, j)
        prev = ''
        for w in r_set:
            if stemmed:
                w = stemmer.stem(w)
            if stopped and w in stop_words:
                continue
            if i == 'unigrams':
                wordCounts[w] += 1
            elif prev != '' and i == 'bigrams':
                wordCounts[prev + ' ' + w] += 1
            prev = w
    return wordCounts

In [58]:
def get_n_most_common_words(wordCounts, n):
    counts = [(wordCounts[w], w) for w in wordCounts]
    counts.sort()
    counts.reverse()
    words = [x[1] for x in counts[:n]]
    return words

In [59]:
def get_review_words(review, j):
    r = ''
    if j == 'remove_punct':
        r = ''.join([c for c in review['text'].lower() if not c in punctuation])
    elif j == 'keep_punct_contained':
        r = review['text']
    elif j == 'keep_punct_separate':
        for c in review['text'].lower():
            if not c in punctuation:
                r += c
            else:
                r += (' ' + c + ' ')
    return r.split()

In [60]:
def get_df_scores(reviews, words, wordId, j):
    df_scores = [0] * len(words)
    for review in reviews:
        r_set = get_review_words(review, j)
        for w in words:
            if w in r_set:
                df_scores[wordId[w]] += 1
    return df_scores