In [2]:
import gzip
import sklearn
from collections import defaultdict
import random
import math
import numpy as np
from sklearn.metrics import jaccard_score as jaccard
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import scipy.optimize
import string
from nltk.stem.porter import *
from scipy.sparse import lil_matrix


In [3]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:    
        yield json.loads(l)

In [4]:
data = list(parse("data/Video_Games_5.json.gz"))

In [4]:
df = pd.DataFrame(data)

In [5]:
df['overall'].value_counts()

5.0    299759
4.0     93654
3.0     49146
1.0     30883
2.0     24135
Name: overall, dtype: int64

In [5]:
X = [d for d in data]
y = [d['overall'] for d in data]

In [6]:
#shuffle data
Xy = list(zip(X,y))
random.shuffle(Xy)
X = np.array([d[0] for d in Xy])
y = np.array([d[1] for d in Xy])

In [8]:
# df['len_rev'] = df['reviewText'].str.len()

In [9]:
data[0]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '10 17, 2015',
 'reviewerID': 'A1HP7NVNPFMA4N',
 'asin': '0700026657',
 'reviewerName': 'Ambrosia075',
 'reviewText': "This game is a bit hard to get the hang of, but when you do it's great.",
 'summary': "but when you do it's great.",
 'unixReviewTime': 1445040000}

In [10]:
# df.groupby(['reviewerID']).size()
# ['overall'].value_counts()

reviewerID
A0059486XI1Z0P98KP35     5
A0220159ZRNBTRKLG08H     6
A0266076X6KPZ6CCHGVS    14
A0277912HT4JSJKVSL3E    10
A02836981FYG9912C66F     7
                        ..
AZZNK89PXD006            7
AZZQCK9ZAKMFR           11
AZZT1ERHBSNQ8            7
AZZTC2OYVNE2Q            6
AZZTOUKVTUMVM            6
Length: 55223, dtype: int64

In [11]:
# df[df['reviewerID'].isnull()]

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image


In [6]:
Xtrain = X[:30000]
Xvalid = X[30000:40000]
# Xtest = X[40000:50000]

ytrain = y[:30000]
yvalid = y[30000:40000]
# ytest = y[40000:50000]

In [13]:
C = [.001,.01, .1, 1, 10, 100]

In [7]:
#Unigrams, keep punc, tfidf
#training data
unigrams = defaultdict(int)
for d in Xtrain:
    #not all data has a review
    if 'reviewText' in d:
#     token = nltk.word_tokenize(d['text'])
#     unigram = list(ngrams(token, 1))
        t = d['reviewText']
        text = " ".join(t.splitlines())
        unigram = text.strip().split()
        for u in unigram:
            unigrams[u] += 1

#1000 most common from training set
mostCommonUni =sorted(unigrams.items(),key=lambda v: v[1],reverse=True)[:1000]
unigram_words = [u[0] for u in mostCommonUni]
unigramId = dict(zip(unigram_words, range(len(unigram_words))))
unigramSet = set(unigram_words)

In [15]:
#docFreq and tf
#training data
docFreq = defaultdict(set)
for d in Xtrain:
    if 'reviewText' in d: 
        t = d['reviewText']
        text = " ".join(t.splitlines())
        unigram = text.strip().split()
        for u in unigram:
            docFreq[u].add(d['reviewerID'])

#term freq
tf = unigrams


In [16]:
def feature_uni_punc_tfidf(datum):
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()
    
        for u in unigram_words:
            if not (u in unigramSet): continue
            tf_idf_word = np.log(len(Xtrain)/ len(docFreq[u])) * tf[u]
            feat[unigramId[u]] = tf_idf_word

    feat.append(1)
    return feat

In [17]:
Xtrain_1 = [feature_uni_punc_tfidf(d) for d in Xtrain]
Xvalid_1 = [feature_uni_punc_tfidf(d) for d in Xvalid]

In [12]:
punctuation = set(string.punctuation)

In [19]:
#unigrams, discard punc, tfidf
def feature_uni_nopunc_tfidf(datum):
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum:
        t = datum['reviewText']
        t = ''.join([c for c in t.lower() if not c in punctuation])

        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))

        for u in unigram_words:
            if not (u in unigramSet): continue
            tf_idf_word = np.log(len(Xtrain)/ len(docFreq[u])) * tf[u]
            feat[unigramId[u]] = tf_idf_word

    feat.append(1)
    return feat

In [20]:
Xtrain_2 = [feature_uni_nopunc_tfidf(d) for d in Xtrain]
Xvalid_2 = [feature_uni_nopunc_tfidf(d) for d in Xvalid]

In [21]:
#unigrams, keep punc, counts
def feature_uni_punc_wc(datum):
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))
        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()

        for u in unigram_words:
            if not (u in unigramSet): continue
            feat[unigramId[u]] += 1

    feat.append(1)
    return feat

In [22]:
Xtrain_3 = [feature_uni_punc_wc(d) for d in Xtrain]
Xvalid_3 = [feature_uni_punc_wc(d) for d in Xvalid]

In [23]:
#unigrams, discard punc, counts
def feature_uni_nopunc_wc(datum):
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
        t = ''.join([c for c in t.lower() if not c in punctuation])

        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))
        for u in unigram_words:
            if not (u in unigramSet): continue
            feat[unigramId[u]] += 1

    feat.append(1)
    return feat

In [24]:
Xtrain_4 = [feature_uni_nopunc_wc(d) for d in Xtrain]
Xvalid_4 = [feature_uni_nopunc_wc(d) for d in Xvalid]

In [25]:
#start of bigram models
bigrams = defaultdict(int)

for d in Xtrain:
#     token = nltk.word_tokenize(d['text'])
#     bigram = list(ngrams(token, 2)) 
    if 'reviewText' in d: 
        text = " ".join(d['reviewText'].splitlines())
        bigram = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]
        for b in bigram:
            bigrams[b] += 1
        
#1000 most common from training set
mostCommonBi =sorted(bigrams.items(),key=lambda v: v[1],reverse=True)[:1000]
bigram_words = [u[0] for u in mostCommonBi]
bigramId = dict(zip(bigram_words, range(len(bigram_words))))
bigramSet = set(bigram_words)

In [26]:
#docFreq and tf
#training data
docFreq = defaultdict(set)
for d in Xtrain:
#     token = nltk.word_tokenize(d['text'])
#     bigram = list(ngrams(token, 2)) 
    if 'reviewText' in d: 
        text = " ".join(d['reviewText'].splitlines())
        bigram = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]
        for b in bigram:
            docFreq[b].add(d['reviewerID'])

#term freq
tf = bigrams

In [27]:
#bigrams, keep punc, tfidf
def feature_bi_punc_tfidf(datum):
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
    #     token = nltk.word_tokenize(t)
    #     bigram_words = list(ngrams(token, 2))
        text = " ".join(t.splitlines())
        bigram_words = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]

        for b in bigram_words:
            if not (b in bigramSet): continue
            tf_idf_word = np.log(len(Xtrain)/ len(docFreq[b])) * tf[b]
            feat[bigramId[b]] = tf_idf_word

    feat.append(1)
    return feat

In [28]:
Xtrain_5 = [feature_bi_punc_tfidf(d) for d in Xtrain]
Xvalid_5 = [feature_bi_punc_tfidf(d) for d in Xvalid]

In [29]:
#bigrams, discard punc, tfidf
def feature_bi_nopunc_tfidf(datum):
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
    #     token = nltk.word_tokenize(t)
    #     bigram_words = list(ngrams(token, 2))
        t = ''.join([c for c in t.lower() if not c in punctuation])
        text = " ".join(t.splitlines())
        bigram_words = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]

        for b in bigram_words:
            if not (b in bigramSet): continue
            tf_idf_word = np.log(len(Xtrain)/ len(docFreq[b])) * tf[b]
            feat[bigramId[b]] = tf_idf_word

    feat.append(1)
    return feat

In [30]:
Xtrain_6 = [feature_bi_nopunc_tfidf(d) for d in Xtrain]
Xvalid_6 = [feature_bi_nopunc_tfidf(d) for d in Xvalid]

In [31]:
#bigrams, keep punc, counts
def feature_bi_punc_wc(datum):
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']

    #     token = nltk.word_tokenize(t)
    #     bigram_words = list(ngrams(token, 2))
        text = " ".join(t.splitlines())
        bigram_words = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]

        for b in bigram_words:
            if not (b in bigramSet): continue
            feat[bigramId[b]] += 1

    feat.append(1)
    return feat

In [32]:
Xtrain_7 = [feature_bi_punc_wc(d) for d in Xtrain]
Xvalid_7 = [feature_bi_punc_wc(d) for d in Xvalid]

In [33]:
#bigrams, discard punc, counts
def feature_bi_nopunc_wc(datum):
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']

    #     token = nltk.word_tokenize(t)
    #     bigram_words = list(ngrams(token, 2))

        t = ''.join([c for c in t.lower() if not c in punctuation])
        text = " ".join(t.splitlines())
        bigram_words = [b for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]

        for b in bigram_words:
            if not (b in bigramSet): continue
            feat[bigramId[b]] += 1

    feat.append(1)
    return feat

In [34]:
Xtrain_8 = [feature_bi_nopunc_wc(d) for d in Xtrain]
Xvalid_8 = [feature_bi_nopunc_wc(d) for d in Xvalid]

In [35]:
from tqdm import tqdm

In [39]:
to_fit = [Xtrain_1, Xtrain_2, Xtrain_3, Xtrain_4, Xtrain_5, Xtrain_6, Xtrain_7, Xtrain_8]
to_pred = [Xvalid_1, Xvalid_2, Xvalid_3, Xvalid_4, Xvalid_5, Xvalid_6, Xvalid_7, Xvalid_8]

In [40]:
# pipeline
# to_fit = [Xtrain_1, Xtrain_2, Xtrain_3, Xtrain_4, Xtrain_5, Xtrain_6, Xtrain_7, Xtrain_8]
# to_pred = [Xvalid_1, Xvalid_2, Xvalid_3, Xvalid_4, Xvalid_5, Xvalid_6, Xvalid_7, Xvalid_8]
model_performances = []
for i in tqdm(range(len(to_fit))):
    for c in C:
        clf = LogisticRegression(C = c, fit_intercept=False, max_iter = 100000) 
        clf.fit(to_fit[i], ytrain)
        theta = clf.coef_
        predictions = clf.predict(to_pred[i])
        correct = predictions == yvalid
        acc = sum(correct) / len(correct)
        model_performances.append(acc)

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation

In [42]:
model_names = ["unigrams, keep punc, tfidf",
"unigrams, discard punc, tfidf",
"unigrams, keep punc, counts",
"unigrams, discard punc, counts",
"bigrams, keep punc, tfidf",
"bigrams, discard punc, tfidf",
"bigrams, keep punc, counts",
"bigrams, discard punc, counts"]

index_names = []
for model in model_names:
    for c in C:
        index_names.append((model,c))

In [43]:
index = pd.MultiIndex.from_tuples(index_names, names=['model','regularization param'])

In [44]:
df = pd.DataFrame(data = model_performances, index = index, columns = ['accuracy'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy
model,regularization param,Unnamed: 2_level_1
"unigrams, keep punc, tfidf",0.001,0.6171
"unigrams, keep punc, tfidf",0.01,0.6167
"unigrams, keep punc, tfidf",0.1,0.617
"unigrams, keep punc, tfidf",1.0,0.6171
"unigrams, keep punc, tfidf",10.0,0.6168
"unigrams, keep punc, tfidf",100.0,0.617
"unigrams, discard punc, tfidf",0.001,0.6377
"unigrams, discard punc, tfidf",0.01,0.6381
"unigrams, discard punc, tfidf",0.1,0.638
"unigrams, discard punc, tfidf",1.0,0.637


In [50]:
#accuracy with model trained on subset of data
df.sort_values(by = 'accuracy', ascending = False).iloc[0]

accuracy    0.6496
Name: (unigrams, discard punc, counts, 0.1), dtype: float64

In [8]:
# Xtrain = X[:447819]
# Xvalid = X[447819:]
# # Xtest = X[40000:50000]

# ytrain = y[:447819]
# yvalid = y[447819:]
# # ytest = y[40000:50000]

Xtrain = X[:30000]
Xvalid = X[30000:40000]
# Xtest = X[40000:50000]

ytrain = y[:30000]
yvalid = y[30000:40000]
# ytest = y[40000:50000]

In [9]:
#hypertuning
#increase dict size 
#acc for dict size 5000: 0.6603
#acc for dict size 8000: 0.6624
#acc for dict size 10000: 0.6621

unigrams = defaultdict(int)
for d in Xtrain:
    #not all data has a review
    if 'reviewText' in d:
#     token = nltk.word_tokenize(d['text'])
#     unigram = list(ngrams(token, 1))
        t = d['reviewText']
        text = " ".join(t.splitlines())
        unigram = text.strip().split()
        for u in unigram:
            unigrams[u] += 1

#most common from training set
mostCommonUni =sorted(unigrams.items(),key=lambda v: v[1],reverse=True)[:8000]
unigram_words = [u[0] for u in mostCommonUni]
unigramId = dict(zip(unigram_words, range(len(unigram_words))))
unigramSet = set(unigram_words)

In [66]:
#optimal model with larger dict size
#acc for dict size 5000: 0.6603
#acc for dict size 8000: 0.6624
#acc for dict size 10000: 0.6621

#unigrams, discard punc, counts
def feature_uni_nopunc_wc(datum):
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum: 
        t = datum['reviewText']
        t = ''.join([c for c in t.lower() if not c in punctuation])

        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))
        for u in unigram_words:
            if not (u in unigramSet): continue
            feat[unigramId[u]] += 1

    feat.append(1)
    return feat

In [67]:
Xtrain_4 = [feature_uni_nopunc_wc(d) for d in Xtrain]
Xvalid_4 = [feature_uni_nopunc_wc(d) for d in Xvalid]

In [None]:
Xtrain_4_ = lil_matrix(Xtrain_4)
Xvalid_4_ = lil_matrix(Xvalid_4)

In [68]:
clf = LogisticRegression(C = 0.1, fit_intercept=False, max_iter = 100000) 
clf.fit(Xtrain_4, ytrain)
# theta = clf.coef_
predictions = clf.predict(Xvalid_4)
correct = predictions == yvalid
acc = sum(correct) / len(correct)
acc

0.6621

In [7]:
Xtrain = X[:300000]
Xvalid = X[300000:400000]

ytrain = y[:300000]
yvalid = y[300000:400000]

In [8]:
unigrams = defaultdict(int)
for d in Xtrain:
    #not all data has a review
    if 'reviewText' in d:
#     token = nltk.word_tokenize(d['text'])
#     unigram = list(ngrams(token, 1))
        t = d['reviewText']
        text = " ".join(t.splitlines())
        unigram = text.strip().split()
        for u in unigram:
            unigrams[u] += 1

#most common from training set
mostCommonUni =sorted(unigrams.items(),key=lambda v: v[1],reverse=True)[:8000]
unigram_words = [u[0] for u in mostCommonUni]
unigramId = dict(zip(unigram_words, range(len(unigram_words))))
unigramSet = set(unigram_words)

In [9]:
#30 most common words in 'summary' acc: 0.6519
#50 most common: 0.6525
unigramsSumm = defaultdict(int)
for d in Xtrain:
    #not all data has a review
    if 'summary' in d:
#     token = nltk.word_tokenize(d['text'])
#     unigram = list(ngrams(token, 1))
        t = d['summary']
        text = " ".join(t.splitlines())
        unigram = text.strip().split()
        for u in unigram:
            unigramsSumm[u] += 1

#50 most common from training set, summary
mostCommonUniSumm =sorted(unigramsSumm.items(),key=lambda v: v[1],reverse=True)[:50]
unigram_wordsSumm = [u[0] for u in mostCommonUniSumm]
#dictsize 5051 (5000 reviews + 50 summary)
unigramIdSumm = dict(zip(unigram_wordsSumm, np.arange(8000,8051)))
unigramSetSumm = set(unigram_wordsSumm)

In [14]:
punctuation = set(string.punctuation)

In [13]:
#features including summaries and review length
def feature_uni_nopunc_wc_withSumm(datum):
    feat = [0]*(len(unigramSet)+len(unigramSetSumm) + 1)
    if 'reviewText' in datum: 
        t = datum['reviewText']
        t = ''.join([c for c in t.lower() if not c in punctuation])

        text = " ".join(t.splitlines())
        unigram_words = text.strip().split()
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))
        for u in unigram_words:
            if not (u in unigramSet): continue
            feat[unigramId[u]] += 1
            
    if 'summary' in datum: 
        t = datum['summary']
        t = ''.join([c for c in t.lower() if not c in punctuation])

        text = " ".join(t.splitlines())
        unigram_wordsSumm = text.strip().split()
    #     token = nltk.word_tokenize(t)
    #     unigram_words = list(ngrams(token, 1))
        for u in unigram_wordsSumm:
            if not (u in unigramSetSumm): continue
            feat[unigramIdSumm[u]] += 1
        
        
    if 'reviewText' in datum:
        feat[-1] = len(datum['reviewText'])

    feat.append(1)
    return feat

In [15]:
Xtrain_4_ = [feature_uni_nopunc_wc_withSumm(d) for d in Xtrain]
Xvalid_4_ = [feature_uni_nopunc_wc_withSumm(d) for d in Xvalid]

In [16]:
len(Xtrain_4_)

300000

In [17]:
Xtrain_4_ = lil_matrix(Xtrain_4_)

In [18]:
Xvalid_4_ = lil_matrix(Xvalid_4_)

In [19]:
#0.68589 dict size 8000, 300000/100000 train/test split
clf = LogisticRegression(C = 0.1, fit_intercept=False, max_iter = 50000) 
clf.fit(Xtrain_4_, ytrain)
# theta = clf.coef_
predictions = clf.predict(Xvalid_4_)
correct = predictions == yvalid
acc = sum(correct) / len(correct)
acc

0.68589