In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob
from IPython import display
from coloredweighteddoc import ColoredWeightedDoc

In [2]:
# Helper: read one pickled object back from disk.

def open_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Pickle can execute arbitrary code while loading, so only use this on
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [3]:
# Load the preprocessed IMDB splits (whole corpus): train/test documents
# followed by train/test labels, read in that order.

(X_train_original,
 X_test_original,
 y_train_original,
 y_test_original) = map(open_pickle, (
    '../data/imdb/imdb_original_preprocessed_xtrain.pickle',
    '../data/imdb/imdb_original_preprocessed_xtest.pickle',
    '../data/imdb/imdb_original_preprocessed_ytrain.pickle',
    '../data/imdb/imdb_original_preprocessed_ytest.pickle',
))

In [4]:
# Vectorize: unigram bag-of-words counts. The vocabulary is learned from the
# training documents only; the test documents are mapped onto that same
# vocabulary.
# NOTE(review): `tp` is assigned but never handed to CountVectorizer (there is
# no token_pattern=tp), so tokenization uses the vectorizer's default pattern.
# Confirm whether the customized pattern was meant to be applied.
tp = r"(?u)\b[\w\'/]+\b"  # customized
cv = CountVectorizer(
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
)
X = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

In [5]:
# Shape of the training count matrix produced by the vectorizer:
# (number of training documents, number of vocabulary terms).
X.shape

(25000, 71975)

In [6]:
# Vocabulary terms as a plain list; words[i] is used alongside freq[i] and
# weights[i] elsewhere in this notebook.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back to the old
# method on older installs. .tolist() keeps `words` a list of str either way.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()

In [7]:
# L2-regularised logistic regression (C=0.5, fixed seed).
# The solver is pinned explicitly: the recorded run used 'liblinear' (the
# library default at the time), while scikit-learn >= 0.22 defaults to
# 'lbfgs'. Without the pin, re-running on a newer install fits a different
# model than the one whose outputs are shown in this notebook.
clf = LogisticRegression(
    random_state=42,
    penalty='l2',
    C=0.5,
    solver='liblinear',
)

In [8]:
# Fit the classifier on the training count matrix and its labels; the fitted
# estimator is the cell's displayed value.
clf.fit(X=X, y=y_train_original)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Shape of the fitted coefficient matrix; the recorded output shows a single
# row with one coefficient per vocabulary term.
clf.coef_.shape

(1, 71975)

In [10]:
# Despite the name, y_pred holds class probabilities (one row per test
# document, one column per class), not hard label predictions.
y_pred = clf.predict_proba(X=X_test)

In [11]:
# coef_ has a single row here, so take that row as the flat per-term weight
# vector; likewise take the single intercept value as the bias.
weights, bias = clf.coef_[0], clf.intercept_[0]

In [12]:
# Total count of every vocabulary term over all training documents: sum the
# count matrix down its rows, then flatten the resulting 1 x n matrix into a
# 1-D array (.A1).
freq = X.sum(axis=0).A1

In [13]:
# Mean accuracy on the training data the model was fitted on.
clf.score(X=X, y=y_train_original)

0.9948

In [14]:
# Mean accuracy on the held-out test split.
clf.score(X=X_test, y=y_test_original)

0.8694

In [15]:
# Confirm the weight vector is 1-D with one entry per vocabulary term.
weights.shape

(71975,)

In [16]:
# Sanity-check preview of (term, corpus frequency, learned weight) triples in
# vocabulary order. Only the first 30 are shown: the vocabulary has tens of
# thousands of terms, and rendering every triple floods the notebook output.
# Slicing before zip also keeps this safe if there are fewer than 30 terms.
list(zip(words[:30], freq[:30], weights[:30]))

[('00', 96, -0.17417490362308757),
 ('000', 300, -0.045161415592860285),
 ('0000000000001', 1, -0.02263542046040202),
 ('00001', 2, -0.020331528504687776),
 ('00015', 1, 7.221085959529314e-06),
 ('000s', 1, 0.004050922355972968),
 ('001', 3, -0.003332709343116061),
 ('003830', 1, 0.03938180159843666),
 ('006', 1, 0.0007712521305029623),
 ('007', 19, -0.02444063476105876),
 ('0079', 1, 0.008582219156722233),
 ('0080', 4, 0.051955124027358766),
 ('0083', 2, 0.03592721819426782),
 ('0093638', 1, 0.15201895480189143),
 ('00am', 4, 0.04143568083670975),
 ('00pm', 4, 0.049527777566567295),
 ('00s', 3, -0.004080030741603855),
 ('01', 29, 0.09052299671480904),
 ('01pm', 1, -0.00026849205599714226),
 ('02', 16, 0.06598637190475663),
 ('020410', 1, -1.601836544503561e-05),
 ('029', 1, -0.045448770139430536),
 ('02i', 1, -2.01417893799304e-05),
 ('03', 7, 0.01204711559576822),
 ('04', 7, -0.27307694947735117),
 ('041', 1, -2.401657843674914e-06),
 ('05', 13, -0.08169000489941473),
 ('050', 1, -4.

In [17]:
# Feature indices ordered from the largest weight to the smallest. The full
# ordering is kept in `indices` so other cells can slice it as needed.
indices = np.argsort(weights)[::-1]

# Display only the 30 highest-weighted terms rather than every vocabulary
# entry, which would dump tens of thousands of tuples into the output.
# Use indices[-30:] to look at the lowest-weighted end instead.
[(words[i], freq[i], weights[i]) for i in indices[:30]]

[('refreshing', 206, 1.4335637848999863),
 ('excellent', 2068, 1.1133727916783),
 ('perfect', 1598, 1.0910967688411772),
 ('superb', 671, 1.090122883314822),
 ('funniest', 358, 1.068696112638053),
 ('wonderfully', 324, 1.053038720409875),
 ('flawless', 125, 1.0296819761169769),
 ('erotic', 201, 1.0148285820439569),
 ('surprisingly', 466, 1.0059420802803873),
 ('carrey', 131, 0.9995462486447931),
 ('breed', 17, 0.9734227590219303),
 ('rare', 442, 0.9731657308902714),
 ('flight', 188, 0.9659340181090025),
 ('incredible', 566, 0.9551387306289176),
 ('appreciated', 196, 0.947037286724019),
 ('highly', 1147, 0.9394639202954798),
 ('favorite', 1404, 0.9376685339593781),
 ('underrated', 235, 0.9277914203534103),
 ('enjoyable', 842, 0.9203571369132675),
 ('kurosawa', 83, 0.9004317935847911),
 ('vengeance', 94, 0.8922671860407939),
 ('delightful', 274, 0.892135466387031),
 ('hooked', 139, 0.8819226865069285),
 ('amazing', 1320, 0.8731674775354588),
 ('kitty', 74, 0.8713153137547143),
 ('whoopi'

In [34]:
# Inspect one hand-picked test review: print its index, its original label
# and the model's class probabilities, then display the document through
# ColoredWeightedDoc (presumably the text coloured by per-word weight --
# confirm against that class).
# Alternative pick: i = np.argmax(y_pred[:, 1]) selects the review with the
# highest second-column probability.
i = 20235
print(i)
print('original label', y_test_original[i])
print(y_pred[i])
display.display(
    ColoredWeightedDoc(X_test_original[i], words, weights, binary=True)
)

20235
original label 0
[0.62592848 0.37407152]


In [54]:
# Inspect one hand-picked test review: print its index, its original label
# and the model's class probabilities, then display the document through
# ColoredWeightedDoc (presumably the text coloured by per-word weight --
# confirm against that class).
# Alternative pick: i = np.argmax(y_pred[:, 1]) selects the review with the
# highest second-column probability.
i = 20278
print(i)
print('original label', y_test_original[i])
print(y_pred[i])
display.display(
    ColoredWeightedDoc(X_test_original[i], words, weights, binary=True)
)

20278
original label 0
[0.9977774 0.0022226]


In [52]:
# Inspect one hand-picked test review: print its index, its original label
# and the model's class probabilities, then display the document through
# ColoredWeightedDoc (presumably the text coloured by per-word weight --
# confirm against that class).
# Alternative pick: i = np.argmax(y_pred[:, 1]) selects the review with the
# highest second-column probability.
i = 20286
print(i)
print('original label', y_test_original[i])
print(y_pred[i])
display.display(
    ColoredWeightedDoc(X_test_original[i], words, weights, binary=True)
)

20286
original label 1
[3.24834651e-05 9.99967517e-01]
