In [1]:
import csv
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

In [2]:
# convert csv to list of dicts
with open('final_df.csv') as f:
    data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

In [3]:
data[0]

{'uid_x': '255938',
 'profile': 'DesolatePsyche',
 'anime_uid': '34096',
 'text': 'First things first My reviews system is explained on a blog entry Which can be found through my profile   Im going to keep this review more of a opinion of Gintamas overall and then this season specific  Anyhow What I have always loved regarding Gintama is its content of everything I love the comedy its absurd random can be vile dirty sweet anyhow everything Have laughed countless times in this franchise Also the humor they have also is heavily reference based aka parodies of different anime shows manga live stuff real world anime production and so on Anyhow comedyparody side of this franchise i absolutely love  Now nd side of this show is the serious dramas epic battle shounens and so on There are arcs that are fully comedy arcs that are fully serious and mixtures of both Serious side is usually quite dramatic and managed to somewhat tear me up now and then Whilst the action sequences are absolute bliss

In [4]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=0)

In [5]:
# remove capitalization (and punctuation)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    for w in ws:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [6]:
# 1000 most popular words (bag of words)
words = [x[1] for x in counts[:1000]]

In [7]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [36]:
# unigram
def featureUni(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(int(datum['text len']))
    feat.append(1) # offset
    return feat

In [37]:
X = [featureUni(d) for d in data]
y = [d['score_x'] for d in data]

In [38]:
# 70/30 train/test split. convert y_data (scores) into ints
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [63]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [64]:
predictions = clf.predict(X_test)

In [65]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [66]:
MSE(predictions, y_test)

3.2777022317165483

In [67]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [68]:
# most "negative" unigrams
wordSort[:5]

[(-0.5129548798418291, 'worst'),
 (-0.4714228589482305, 'horrible'),
 (-0.3575173624069023, 'terrible'),
 (-0.3573067177497681, 'poor'),
 (-0.30815733250162625, 'mediocre')]

In [69]:
# most "positive" unigrams
wordSort[-5:]

[(0.19308189677302412, 'highly'),
 (0.2150944217225201, 'awesome'),
 (0.2297299810648875, 'amazing'),
 (0.2831379563659605, 'outstanding'),
 (0.4075131819544006, 'masterpiece')]

In [70]:
def accuracy(predictions, labels):
    correct = 0
    for i in range(len(predictions)):
        if predictions[i] == labels[i]:
            correct += 1
    return correct/len(labels)

In [71]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.20456634998467668

## Regularization Pipeline

no significant MSE and accuracy changes when experimenting with lambda values. similar results when adding review length as feature

In [48]:
bestModel = None
bestVal = None
bestLamb = None
bestAcc = None

ls = [0.01, 0.1, 1, 10, 100, 1000, 10000]
errorTrain = []
errorTest = []

for l in ls:
    model = sklearn.linear_model.Ridge(l)
    model.fit(X_train, y_train)
    predictTrain = model.predict(X_train)
    MSEtrain = sum((y_train - predictTrain)**2)/len(y_train)
    errorTrain.append(MSEtrain)
    predictTest = model.predict(X_test)
    MSEtest = sum((y_test - predictTest)**2)/len(y_test)
    predictTest = list(map(int, predictTest))
    accTest = accuracy(predictTest, y_test)
    errorTest.append(MSEtest)
    print("l = " + str(l) + ", test MSE = " + str(MSEtest) + ", test Acc = " + str(accTest))
    if bestVal == None or MSEtest < bestVal:
        bestVal = MSEtest
        bestModel = model
        bestLamb = l
    if bestAcc == None or accTest > bestAcc:
        bestAcc = accTest

l = 0.01, test MSE = 3.2777159243483625, test Acc = 0.20459188885483706
l = 0.1, test MSE = 3.2777152730179, test Acc = 0.20459188885483706
l = 1, test MSE = 3.277708763785827, test Acc = 0.20459188885483706
l = 10, test MSE = 3.277644077742966, test Acc = 0.20451527224435592
l = 100, test MSE = 3.277036901311336, test Acc = 0.204387577893554
l = 1000, test MSE = 3.2741638520471628, test Acc = 0.2042343446725917
l = 10000, test MSE = 3.3308302315776435, test Acc = 0.2013229134743079


In [72]:
bestAcc

0.20459188885483706

## tfidf

In [77]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

In [78]:
def text_process(reviewText):
    nopunc = [i for i in reviewText if i not in string.punctuation]
    nopunc_text = ''.join(nopunc)
    return [i for i in nopunc_text.split() if i.lower() not in stopwords.words('english')]

In [80]:
# Benchmark model - multinomial logistic regression
pipeline = Pipeline([
    ('Tf-Idf', TfidfVectorizer(ngram_range=(1,2), analyzer=text_process)),
    ('classifier', linear_model.LogisticRegression(solver='newton-cg', multi_class='multinomial'))
])
X1 = [d['text'] for d in data]
y1 = [d['score_x'] for d in data]
review_train1, review_test1, label_train1, label_test1 = train_test_split(X1, y1, test_size=0.3, random_state=0)
pipeline.fit(review_train1, label_train1)
pip_pred1 = pipeline.predict(review_test1)
print(metrics.classification_report(label_test1,pip_pred1))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.42      0.28      0.34       882
          10       0.52      0.63      0.57      7319
           2       0.25      0.06      0.09       975
           3       0.25      0.21      0.23      1733
           4       0.25      0.10      0.14      1836
           5       0.22      0.16      0.19      2584
           6       0.25      0.21      0.23      3767
           7       0.29      0.32      0.31      5738
           8       0.31      0.38      0.34      6949
           9       0.35      0.36      0.36      7372

    accuracy                           0.35     39156
   macro avg       0.28      0.25      0.25     39156
weighted avg       0.34      0.35      0.34     39156



  _warn_prf(average, modifier, msg_start, len(result))


In [89]:
pred = list(map(int, pip_pred1))
accuracy(pred, y_test)

0.35220655838185716