In [91]:
import csv
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

In [92]:
# convert csv to list of dicts
with open('final_df.csv') as f:
    data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

In [93]:
data[0]

{'uid_x': '255938',
 'profile': 'DesolatePsyche',
 'anime_uid': '34096',
 'text': 'First things first My reviews system is explained on a blog entry Which can be found through my profile   Im going to keep this review more of a opinion of Gintamas overall and then this season specific  Anyhow What I have always loved regarding Gintama is its content of everything I love the comedy its absurd random can be vile dirty sweet anyhow everything Have laughed countless times in this franchise Also the humor they have also is heavily reference based aka parodies of different anime shows manga live stuff real world anime production and so on Anyhow comedyparody side of this franchise i absolutely love  Now nd side of this show is the serious dramas epic battle shounens and so on There are arcs that are fully comedy arcs that are fully serious and mixtures of both Serious side is usually quite dramatic and managed to somewhat tear me up now and then Whilst the action sequences are absolute bliss

In [110]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=0)

In [111]:
# remove capitalization (and punctuation)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    for w in ws:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [112]:
# 1000 most popular words (bag of words)
words = [x[1] for x in counts[:1000]]

In [113]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

# Predictive Task

## unigram

In [157]:
# unigram
def featureUni(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    #feat.append(int(datum['text len']))
    feat.append(1) # offset
    return feat

In [158]:
X = [featureUni(d) for d in data]
y = [d['score_x'] for d in data]

In [159]:
# 70/30 train/test split. convert y_data (scores) into ints
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [160]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [161]:
predictions = clf.predict(X_test)

In [162]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [163]:
MSE(predictions, y_test)

3.276294261479062

In [164]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [165]:
# most "negative" unigrams
wordSort[:5]

[(-0.5133115985652523, 'worst'),
 (-0.4725533892790652, 'horrible'),
 (-0.3583991983390645, 'poor'),
 (-0.35832972524246315, 'terrible'),
 (-0.309473410861833, 'mediocre')]

In [166]:
# most "positive" unigrams
wordSort[-5:]

[(0.19057358697789564, 'highly'),
 (0.21364818723597845, 'awesome'),
 (0.22965326082466814, 'amazing'),
 (0.28205484682399, 'outstanding'),
 (0.40580663178884646, 'masterpiece')]

In [167]:
def accuracy(predictions, labels):
    correct = 0
    for i in range(len(predictions)):
        if predictions[i] == labels[i]:
            correct += 1
    return correct/len(labels)

In [168]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.2042343446725917

In [169]:
#0.20456634998467668

## bigram

In [20]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws2:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [21]:
words = [x[1] for x in counts[:1000]]

In [22]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [23]:
def featureBi(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws2:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [24]:
X = [featureBi(d) for d in data]

In [25]:
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

In [26]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [27]:
predictions = clf.predict(X_test)

In [28]:
MSE(predictions, y_test)

3.7780989482847462

In [29]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [30]:
# most "negative" bigrams
wordSort[:5]

[(-0.9575741892596256, 'the worst'),
 (-0.3796670740328128, 'not even'),
 (-0.32019151770049364, 'with no'),
 (-0.2695281476757664, 'nothing to'),
 (-0.2521810385095527, 'have no')]

In [31]:
# most "positive" bigrams
wordSort[-5:]

[(0.2479479087536673, 'makes you'),
 (0.24795018471943683, 'really enjoyed'),
 (0.2524566776933574, 'the best'),
 (0.2880505334989174, 'is great'),
 (0.33360213766528596, 'i loved')]

In [32]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.1881703953417101

## trigrams

In [33]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    for w in ws3:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [34]:
words = [x[1] for x in counts[:1000]]

In [35]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [36]:
def featureTri(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    ws = r.split()
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    for w in ws3:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [37]:
X = [featureTri(d) for d in data]

In [38]:
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

In [39]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [40]:
predictions = clf.predict(X_test)

In [41]:
MSE(predictions, y_test)

4.204311426628811

In [42]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [43]:
# most "negative" trigrams
wordSort[:5]

[(-1.9031303626592342, 'of the worst'),
 (-0.9482532436736707, 'dont get me'),
 (-0.7057139648198819, 'the only reason'),
 (-0.5948573378628006, 'little to no'),
 (-0.5018927378487092, 'at all the')]

In [44]:
# most "positive" trigrams
wordSort[-5:]

[(0.6256082556262265, 'my favorite anime'),
 (0.6316597443377329, 'the edge of'),
 (0.7440137029718403, 'i highly recommend'),
 (0.804054129001378, 'get me wrong'),
 (0.9671008291837964, 'a must watch')]

In [45]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.1741240167534988

## unigrams, bigrams, and trigrams

In [46]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    for w in ws + ws2 + ws3:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [47]:
words = [x[1] for x in counts[:1000]]

In [48]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [49]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    for w in ws + ws2 + ws3:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [50]:
X = [feature(d) for d in data]

In [51]:
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

In [52]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [53]:
predictions = clf.predict(X_test)

In [54]:
MSE(predictions, y_test)

3.403258555167285

In [55]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [56]:
# most "negative" grams
wordSort[:5]

[(-0.6245017427296606, 'worst'),
 (-0.4506875589319873, 'rest of'),
 (-0.2699369118530492, 'boring'),
 (-0.24194845572389395, 'minutes'),
 (-0.22054516501332042, 'the rest')]

In [57]:
# most "positive" grams
wordSort[-5:]

[(0.21221299064279853, 'perfect'),
 (0.22715181275870538, 'awesome'),
 (0.24536187085884112, 'amazing'),
 (0.2968503375489294, 'the fact that'),
 (0.5587612099356443, 'the rest of')]

In [58]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.19866687097762795

## unigrams and bigrams

In [59]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws + ws2:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [60]:
words = [x[1] for x in counts[:1000]]

In [61]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [62]:
def featureUniBi(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws + ws2:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [63]:
X = [featureUniBi(d) for d in data]

In [64]:
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

In [65]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [66]:
predictions = clf.predict(X_test)

In [67]:
MSE(predictions, y_test)

3.3917200642667686

In [68]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [69]:
# most "negative" grams
wordSort[:5]

[(-0.6139128992208037, 'worst'),
 (-0.2644882555416461, 'boring'),
 (-0.23825871308714458, 'minutes'),
 (-0.21827240759636252, 'potential'),
 (-0.2113503356590675, 'decent')]

In [70]:
# most "positive" grams
wordSort[-5:]

[(0.19129056746096607, 'loved'),
 (0.20548504037874207, 'highly'),
 (0.2119069081395148, 'perfect'),
 (0.22620660707692405, 'awesome'),
 (0.24307973214546408, 'amazing')]

In [71]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.19961180917356217