In [1]:
import csv
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

In [2]:
# convert csv to list of dicts
with open('final_df.csv') as f:
    data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

In [3]:
data[0]

{'uid_x': '255938',
 'profile': 'DesolatePsyche',
 'anime_uid': '34096',
 'text': 'First things first My reviews system is explained on a blog entry Which can be found through my profile   Im going to keep this review more of a opinion of Gintamas overall and then this season specific  Anyhow What I have always loved regarding Gintama is its content of everything I love the comedy its absurd random can be vile dirty sweet anyhow everything Have laughed countless times in this franchise Also the humor they have also is heavily reference based aka parodies of different anime shows manga live stuff real world anime production and so on Anyhow comedyparody side of this franchise i absolutely love  Now nd side of this show is the serious dramas epic battle shounens and so on There are arcs that are fully comedy arcs that are fully serious and mixtures of both Serious side is usually quite dramatic and managed to somewhat tear me up now and then Whilst the action sequences are absolute bliss

In [4]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=0)

In [5]:
# remove capitalization (and punctuation)
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    for w in ws:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [6]:
# 1000 most popular words (bag of words)
words = [x[1] for x in counts[:1000]]

In [7]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

# Predictive Task

## unigram

In [8]:
# unigram
def featureUni(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [9]:
X = [featureUni(d) for d in data]
y = [d['score_x'] for d in data]

In [10]:
# 70/30 train/test split. convert y_data (scores) into ints
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [11]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

In [12]:
predictions = clf.predict(X_test)

In [13]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [14]:
MSE(predictions, y_test)

3.2027948034964315

In [15]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

In [16]:
# most "negative" unigrams
wordSort[:5]

[(-0.4955341867392178, 'worst'),
 (-0.48117306630400886, 'horrible'),
 (-0.3820938818256158, 'terrible'),
 (-0.34760319080767005, 'poor'),
 (-0.33220931810463566, 'mediocre')]

In [17]:
# most "positive" unigrams
wordSort[-5:]

[(0.1978931893829899, 'highly'),
 (0.2174602882849021, 'awesome'),
 (0.22330198231093915, 'amazing'),
 (0.2829962038957392, 'outstanding'),
 (0.40739338700174405, 'masterpiece')]

In [18]:
def accuracy(predictions, labels):
    correct = 0
    for i in range(len(predictions)):
        if predictions[i] == labels[i]:
            correct += 1
    return correct/len(labels)

In [19]:
# convert predictions (floats) to int and check accuracy
predictions = list(map(int, predictions))
accuracy(predictions, y_test)

0.2061242210644601

## bigram

In [20]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws2:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [21]:
words = [x[1] for x in counts[:1000]]

In [22]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [23]:
def featureBi(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws2:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat