In [1]:
import csv
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
# convert csv to list of dicts
with open('final_df.csv') as f:
    data = [{k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)]

In [3]:
data[0]

{'uid_x': '255938',
 'profile': 'DesolatePsyche',
 'anime_uid': '34096',
 'text': 'First things first My reviews system is explained on a blog entry Which can be found through my profile   Im going to keep this review more of a opinion of Gintamas overall and then this season specific  Anyhow What I have always loved regarding Gintama is its content of everything I love the comedy its absurd random can be vile dirty sweet anyhow everything Have laughed countless times in this franchise Also the humor they have also is heavily reference based aka parodies of different anime shows manga live stuff real world anime production and so on Anyhow comedyparody side of this franchise i absolutely love  Now nd side of this show is the serious dramas epic battle shounens and so on There are arcs that are fully comedy arcs that are fully serious and mixtures of both Serious side is usually quite dramatic and managed to somewhat tear me up now and then Whilst the action sequences are absolute bliss

In [4]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=0)

In [5]:
# remove capitalization (and punctuation) and stopwords
stopwordSet = set(stopwords.words('english'))
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
    for w in ws:
        if not w in stopwordSet:
            wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [6]:
# 1000 most popular words (bag of words)
words = [x[1] for x in counts[:1000]]

In [7]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [8]:
# unigram
def featureUni(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in wordSet and not w in stopwordSet:
            feat[wordId[w]] += 1
    #feat.append(int(datum['text len']))
    feat.append(1) # offset
    return feat

In [9]:
X = [featureUni(d) for d in data]
y = [d['score_x'] for d in data]

In [10]:
# 70/30 train/test split. convert y_data (scores) into ints
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [11]:
# Regularized regression
clf = linear_model.LogisticRegression(max_iter=100, multi_class='multinomial', fit_intercept=False, n_jobs=-1, verbose=2) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        10010     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.10366D+05    |proj g|=  3.15509D+04


 This problem is unconstrained.



At iterate    1    f=  1.98308D+05    |proj g|=  1.73587D+04

At iterate    2    f=  1.95484D+05    |proj g|=  1.43009D+04

At iterate    3    f=  1.93704D+05    |proj g|=  7.42569D+03

At iterate    4    f=  1.91773D+05    |proj g|=  9.00963D+03

At iterate    5    f=  1.89343D+05    |proj g|=  1.24049D+04

At iterate    6    f=  1.84950D+05    |proj g|=  1.39385D+04

At iterate    7    f=  1.83499D+05    |proj g|=  4.99426D+04

At iterate    8    f=  1.78823D+05    |proj g|=  1.45305D+04

At iterate    9    f=  1.77188D+05    |proj g|=  1.22543D+04

At iterate   10    f=  1.75668D+05    |proj g|=  1.90635D+04

At iterate   11    f=  1.72548D+05    |proj g|=  2.54817D+04

At iterate   12    f=  1.69573D+05    |proj g|=  1.48383D+04

At iterate   13    f=  1.66957D+05    |proj g|=  1.65417D+04

At iterate   14    f=  1.65412D+05    |proj g|=  5.65123D+03

At iterate   15    f=  1.64727D+05    |proj g|=  6.37295D+03

At iterate   16    f=  1.63529D+05    |proj g|=  9.82644D+03

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.7min finished


In [12]:
predictions = clf.predict(X_test)

In [13]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [16]:
wordSort = list(zip(theta[:-1], words))
#wordSort.sort()

In [None]:
# most "negative" unigrams
wordSort[:5]

In [None]:
# most "positive" unigrams
wordSort[-5:]

In [15]:
def accuracy(predictions, labels):
    correct = 0
    for i in range(len(predictions)):
        if predictions[i] == labels[i]:
            correct += 1
    return correct/len(labels)

In [17]:
# convert predictions (floats) to int and check accuracy
pred = clf.predict(X_train)
predictions = list(map(int, predictions))
print('train: ', accuracy(pred, y_train), MSE(pred, y_test))
print('test:', accuracy(predictions, y_test), MSE(predictions, y_test))

train:  0.3892689440789834 10.29259883542752
test: 0.34291040964347735 3.373327204004495


In [30]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.34      0.27      0.30       907
           2       0.21      0.09      0.12       944
           3       0.24      0.22      0.23      1744
           4       0.20      0.10      0.14      1792
           5       0.23      0.16      0.19      2599
           6       0.25      0.22      0.24      3706
           7       0.30      0.31      0.30      5731
           8       0.31      0.34      0.32      7047
           9       0.34      0.33      0.33      7343
          10       0.48      0.65      0.55      7343

    accuracy                           0.34     39156
   macro avg       0.29      0.27      0.27     39156
weighted avg       0.33      0.34      0.33     39156



## naive baseline: predicting average irrespective of review text

In [34]:
mean_rating = numpy.mean(y_train)
mean_labels = [mean_rating]*len(y_train)
mean_labels_int = list(map(round, mean_labels))
print('train: ', accuracy(mean_labels_int, y_train), MSE(mean_labels, y_train))
mean_labels = [mean_rating]*len(y_test)
mean_labels_int = list(map(round, mean_labels))
print('test: ', accuracy(mean_labels_int, y_test), MSE(mean_labels, y_test))

train:  0.14593754446645724 5.3658626704794035
test:  0.1463632648891613 5.440222819871951


## ridge

In [40]:
m = linear_model.Ridge(1.0, fit_intercept=False)
m.fit(X_train, y_train)
p = m.predict(X_test)
print('test:', MSE(p, y_test))
p = list(map(round, p))
print(metrics.classification_report(y_test, p))
print(accuracy(p, y_test))

test: 3.3030332774471654
              precision    recall  f1-score   support

         -31       0.00      0.00      0.00         0
         -20       0.00      0.00      0.00         0
         -15       0.00      0.00      0.00         0
         -12       0.00      0.00      0.00         0
          -9       0.00      0.00      0.00         0
          -8       0.00      0.00      0.00         0
          -7       0.00      0.00      0.00         0
          -6       0.00      0.00      0.00         0
          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00         0
           1       0.20      0.02      0.03       907
           2       0.17      0.03      0.05       944
           3       0.24      0.06      0.09      1744
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## baseline: using review length

In [40]:
def get_len(d):
    feat = []
    feat.append(int(d['text len']))
    return feat
X_lengths = [get_len(d) for d in data]
X_len_train, X_len_test = train_test_split(X_lengths, test_size=0.3, random_state=0)
model = linear_model.LogisticRegression(n_jobs=-1)
model.fit(X_len_train, y_train)
y_len_train_pred = model.predict(X_len_train)
y_len_test_pred = model.predict(X_len_test)
print('train: ', accuracy(y_len_train_pred, y_train), MSE(y_len_train_pred, y_train))
print('test: ', accuracy(y_len_test_pred, y_test), MSE(y_len_test_pred, y_test))

train:  0.20423375400882215 10.065629754490429
test:  0.20875472469097966 10.199254264991318


In [39]:
y_len_train_pred

array([ 9, 10, 10, ...,  8, 10, 10])

## using tfidf instead of bag of words

In [22]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [23]:
def text_process(reviewText):
    nopunc = [i for i in reviewText if i not in punctuation]
    nopunc_text = ''.join(nopunc)
    return [i.lower() for i in nopunc_text.split() if i.lower() not in stopwordSet]

In [24]:
X1 = [d['text'] for d in data]
y1 = [d['score_x'] for d in data]
y1 = list(map(int, y1))

In [28]:
# Benchmark model - multinomial logistic regression
pipeline = Pipeline([
    ('Tf-Idf', TfidfVectorizer(analyzer=text_process)),
    ('classifier', linear_model.LogisticRegression(max_iter=1000, n_jobs=-1, solver='newton-cg', class_weight='balanced'))
], verbose=True)
review_train1, review_test1, label_train1, label_test1 = train_test_split(X1, y1, test_size=0.3, random_state=0)
pipeline.fit(review_train1, label_train1)
pip_pred1 = pipeline.predict(review_test1)
print(metrics.classification_report(label_test1,pip_pred1))

[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  32.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 2.1min
              precision    recall  f1-score   support

           1       0.28      0.44      0.34       907
           2       0.16      0.23      0.19       944
           3       0.21      0.25      0.23      1744
           4       0.17      0.21      0.19      1792
           5       0.21      0.22      0.21      2599
           6       0.25      0.25      0.25      3706
           7       0.31      0.28      0.30      5731
           8       0.35      0.29      0.32      7047
           9       0.38      0.34      0.36      7343
          10       0.55      0.61      0.58      7343

    accuracy                           0.35     39156
   macro avg       0.29      0.31      0.30     39156
weighted avg       0.35      0.35      0.35     39156



In [29]:
pip_pred1_train = pipeline.predict(review_train1)
print(metrics.classification_report(label_train1, pip_pred1_train))
print('train MSE: ', MSE(pip_pred1_train, label_train1))
print('test MSE: ', MSE(pip_pred1, label_test1))

              precision    recall  f1-score   support

           1       0.61      0.97      0.75      2009
           2       0.65      0.95      0.78      2205
           3       0.66      0.82      0.73      3982
           4       0.62      0.78      0.69      4088
           5       0.61      0.67      0.64      5949
           6       0.60      0.59      0.59      8740
           7       0.59      0.53      0.56     13333
           8       0.60      0.50      0.55     16680
           9       0.61      0.53      0.57     17200
          10       0.66      0.73      0.69     17175

    accuracy                           0.62     91361
   macro avg       0.62      0.71      0.65     91361
weighted avg       0.62      0.62      0.62     91361

train MSE:  2.225741837326649
test MSE:  3.3149708856880173


In [43]:
scores = cross_val_score(pipeline, X1, y1, n_jobs=-1, verbose=5)
print(scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.2min remaining:  7.7min


0.2930958586478101


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.0min finished


[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  41.9s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 4.1min
[CV] END ................................ score: (test=0.322) total time= 5.0min
[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  43.9s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 4.2min
[CV] END ................................ score: (test=0.313) total time= 5.1min
[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  43.9s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 4.3min
[CV] END ................................ score: (test=0.284) total time= 5.2min
[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  48.9s
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 4.9min
[CV] END ................................ score: (test=0.272) total time= 5.8min
[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  44.0s
[Pipeline] ........ (step 2 of 2) Pro

In [34]:
# First model try - Naive bayes
pipeline = Pipeline([
    ('Tf-Idf', TfidfVectorizer(analyzer=text_process)),
    ('classifier', ComplementNB())
], verbose=True)
review_train2, review_test2, label_train2, label_test2 = train_test_split(X1, y1, test_size=0.3, random_state=0)
pipeline.fit(review_train2, label_train2)
pip_pred2 = pipeline.predict(review_test2)
print(metrics.classification_report(label_test2, pip_pred2))

[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  29.7s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.3s
              precision    recall  f1-score   support

           1       0.54      0.07      0.12       907
           2       0.10      0.00      0.01       944
           3       0.23      0.02      0.04      1744
           4       0.12      0.00      0.01      1792
           5       0.10      0.01      0.01      2599
           6       0.12      0.02      0.03      3706
           7       0.16      0.13      0.14      5731
           8       0.22      0.37      0.28      7047
           9       0.27      0.34      0.30      7343
          10       0.40      0.67      0.50      7343

    accuracy                           0.28     39156
   macro avg       0.23      0.16      0.14     39156
weighted avg       0.24      0.28      0.23     39156



In [35]:
MSE(pip_pred2, label_test2)

6.1427878230667075

In [25]:
scores = cross_val_score(pipeline, X1, y1, n_jobs=-1, verbose=5)
print(scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   58.9s remaining:  1.5min


0.20309703576635849


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.1min finished


In [31]:
# DecisionTreeClassifier
pipeline = Pipeline([
    ('Tf-Idf', TfidfVectorizer(analyzer=text_process)),
    ('classifier', DecisionTreeClassifier())
], verbose=True)
review_train3, review_test3, label_train3, label_test3 = train_test_split(X1, y1, test_size=0.3, random_state=0)
pipeline.fit(review_train3, label_train3)
pip_pred3 = pipeline.predict(review_test3)
print(metrics.classification_report(label_test3, pip_pred3))

[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  30.7s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=10.5min
              precision    recall  f1-score   support

           1       0.14      0.12      0.13       907
           2       0.07      0.06      0.06       944
           3       0.11      0.09      0.10      1744
           4       0.09      0.09      0.09      1792
           5       0.11      0.10      0.10      2599
           6       0.12      0.11      0.12      3706
           7       0.19      0.19      0.19      5731
           8       0.20      0.22      0.21      7047
           9       0.24      0.25      0.24      7343
          10       0.35      0.36      0.36      7343

    accuracy                           0.21     39156
   macro avg       0.16      0.16      0.16     39156
weighted avg       0.21      0.21      0.21     39156



In [32]:
MSE(pip_pred3, label_test3)

7.109536214117887

In [27]:
scores = cross_val_score(pipeline, X1, y1, n_jobs=-1, verbose=5)
print(scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 16.6min remaining: 24.8min


0.2121070000525629


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 19.3min finished


In [36]:
# RandomForest
pipeline = Pipeline([
    ('Tf-Idf', TfidfVectorizer(analyzer=text_process)),
    ('classifier', RandomForestClassifier(verbose=100, n_jobs=-1))
], verbose=True)
review_train4, review_test4, label_train4, label_test4 = train_test_split(X1, y1, test_size=0.3, random_state=0)
pipeline.fit(review_train4, label_train4)
pip_pred4 = pipeline.predict(review_test4)
print(metrics.classification_report(label_test4, pip_pred4))

[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  29.9s
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   12.3s

building tree 10 of 100[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.4s

building tree 11 of 100[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   12.4s

building tree 12 of 100[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   12.8s

building tree 13 of 100[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.0s

building tree 14 of 100[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   13.1s

building tree 15 of 100[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   13.2s

building tree 16 of 100[Parallel(n_jobs=-1)]: Done   8 tasks   

[Parallel(n_jobs=-1)]: Done  97 out of 100 | elapsed:  2.6min remaining:    4.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.6min finished
[Pipeline] ........ (step 2 of 2) Processing classifier, total= 2.6min
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done

In [37]:
MSE(pip_pred4, label_test4)

6.758836449075493

In [29]:
scores = cross_val_score(pipeline, X1, y1, n_jobs=-1, verbose=5)
print(scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  46.9s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=18.3min
[CV] END ................................ score: (test=0.206) total time=19.2min
[Pipeline] ............ (step 1 of 2) Processing Tf-Idf, total=  41.6s
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.5min
building tree 10 of 100
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.5min
building tree 11 of 100
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  2.5min
building tree 12 of 100
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  2.6min
building tree 13 of 100
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.6min
building tre

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 49.0min remaining: 73.5min


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

## removing pipeline