In [276]:
import glob
import numpy as np
import pickle
import os, os.path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from textblob import TextBlob

In [28]:
def load_pickle_object(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [29]:
# Load the four pickled pieces of the preprocessed IMDB data set in one pass:
# train/test review texts followed by the train/test labels.
X_train_corp, X_test_corp, y_train, y_test = map(
    load_pickle_object,
    (
        '../data/imdb_original_preprocessed_xtrain.pickle',
        '../data/imdb_original_preprocessed_xtest.pickle',
        '../data/imdb_original_preprocessed_ytrain.pickle',
        '../data/imdb_original_preprocessed_ytest.pickle',
    ),
)

In [30]:
# Sanity check: number of training reviews loaded.
len(X_train_corp)

25000

In [31]:
# Sanity check: number of test reviews loaded.
len(X_test_corp)

25000

In [32]:
# Token pattern handed to the vectorizers below. A token is a run of word
# characters, apostrophes and forward slashes delimited by word boundaries, so
# strings such as "1/10" are kept as a single token instead of being split.
tp = r"(?u)\b[\w\'/]+\b"

In [131]:
# L2-normalised TF-IDF over lower-cased unigrams, using the custom token
# pattern, English stop-word removal and a minimum document frequency of 5.
# (A binary CountVectorizer with the same settings was the earlier alternative.)
vectorizer = TfidfVectorizer(
    token_pattern=tp,
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    min_df=5,
    norm='l2',
)

In [132]:
# Learn the vocabulary and IDF weights from the training reviews only, then
# project the test reviews onto that same vocabulary.
X_train = vectorizer.fit_transform(X_train_corp)
X_test = vectorizer.transform(X_test_corp)

In [133]:
# Vocabulary terms as a NumPy string array, so that column indices of the
# TF-IDF matrices can be mapped back to their tokens with fancy indexing.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (available since 1.0); use the new accessor when it
# exists and fall back to the old one so the cell also runs on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# dtype=str keeps the same unicode array type regardless of which accessor ran.
terms = np.asarray(terms, dtype=str)

In [134]:
# (number of training reviews, vocabulary size)
X_train.shape

(25000, 25962)

In [135]:
# (number of test reviews, vocabulary size) -- same columns as X_train.
X_test.shape

(25000, 25962)

Let's test it on a few examples.

In [170]:
# Baseline: L1-regularised logistic regression trained on whole-review TF-IDF
# vectors, evaluated on the whole-review test vectors.
# The solver is stated explicitly: since scikit-learn 0.22 the default solver is
# 'lbfgs', which rejects penalty='l1'. 'liblinear' supports L1 and was the
# default on older versions, so results there are unchanged.
clf = LogisticRegression(penalty='l1', C=1, solver='liblinear')
clf.fit(X_train, y_train)
# Per-term coefficients of the single decision function (first row of coef_).
weights = clf.coef_[0]
clf.score(X_test, y_test)

0.8798

In [171]:
# Split one test review (index 3) into its raw sentences with TextBlob and
# display them.
i=3
sentences = TextBlob(X_test_corp[i]).raw_sentences
sentences

['i have yet to watch the first entry in this series, however, fortunately, i was still able to follow the complex and intricate plot, with all its unexpected twist and turn, and i applaud them for the utter originality of the concept herein.',
 'in case there is any confusion, let me leave no doubt as to the fact that everything i have just said is coated in pure, carefully nurtured sarcasm, the kind that flourish and grow exponentially when exposed to crap like this flick.',
 'a clear sign that this is unimpressive is that it was directed by a visual effect creator, whose only other credit in that field is a henry rooker film that was not well received.',
 'the acting is average at best, and i defy anyone to not find... scottish computer-woman(come on, seriously, what is with that last name?',
 ")'s russian accent laughable and/or irritating.",
 'there is an attempt or two at stylization in this, and they are reasonable.',
 'the action is not terrible.',
 'cinematography and editing 

In [172]:
# True label of the test review selected above.
y_test[i]

0

In [173]:
# Vectorise each sentence separately: one row per sentence, same columns as X_train.
x=vectorizer.transform(sentences)

In [176]:
#terms[x[11].indices]

In [177]:
#np.sum(np.power(x[11].data,2))
#np.sum(x[11].data)

In [180]:
# Class probabilities for every sentence: one row per sentence, one column per class.
probs=clf.predict_proba(x)
probs

array([[2.16112984e-01, 7.83887016e-01],
       [8.10963197e-01, 1.89036803e-01],
       [4.36866785e-01, 5.63133215e-01],
       [4.10479521e-01, 5.89520479e-01],
       [9.96556729e-01, 3.44327121e-03],
       [7.89693079e-01, 2.10306921e-01],
       [9.92969314e-01, 7.03068600e-03],
       [5.47470488e-01, 4.52529512e-01],
       [1.71173435e-01, 8.28826565e-01],
       [4.28057938e-01, 5.71942062e-01],
       [4.86023380e-01, 5.13976620e-01],
       [9.76287088e-01, 2.37129125e-02],
       [9.99994411e-01, 5.58944172e-06]])

In [181]:
# argmax over axis 0 gives, for each class column, the row (sentence) with the
# highest probability for that class; indexing with the review's true label then
# selects the sentence that most strongly supports that label.
# NOTE(review): assumes the labels are 0/1 and coincide with the column order of
# predict_proba (clf.classes_) -- confirm.
chosen_sentence_index = np.argmax(probs, axis=0)[y_test[i]]

In [182]:
# The sentence selected for this review.
sentences[chosen_sentence_index]

'1/10'

In [183]:
# List every vocabulary term present in the chosen sentence together with its
# logistic-regression coefficient.
active_columns = x[chosen_sentence_index].indices
for term, weight in zip(terms[active_columns], weights[active_columns]):
    print("%s\t%0.3f" %(term, weight))

1/10	-12.151


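The real deal: for every training review, pick its most label-indicative sentence using classifiers trained with 3-fold cross-validation.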

In [184]:
# 3-fold split with shuffling; the fixed random_state makes the folds reproducible.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

In [187]:
# Maps training-review index -> the sentence chosen for that review.
chosen_sentences = {}

In [188]:
# For every training review, pick the sentence that a classifier which did NOT
# see that review during fitting finds most indicative of the review's true label.
# NOTE(review): this loop rebinds the notebook-level names `clf`, `i`,
# `sentences`, `x`, `probs` and `chosen_sentence_index`. After it finishes,
# `clf` is the model of the LAST fold, not the model fitted on all of X_train.
for tr_i, te_i in kf.split(X_train):
    # solver='liblinear' is explicit because the default solver of
    # scikit-learn >= 0.22 ('lbfgs') does not support the L1 penalty.
    clf = LogisticRegression(penalty='l1', C=1, solver='liblinear')
    clf.fit(X_train[tr_i], y_train[tr_i])
    # Held-out accuracy of this fold's classifier on whole reviews.
    print(clf.score(X_train[te_i], y_train[te_i]))
    for i in te_i:
        sentences = TextBlob(X_train_corp[i]).raw_sentences
        x=vectorizer.transform(sentences)
        probs=clf.predict_proba(x)
        # Row index of the sentence with the highest probability for the
        # review's true class (column y_train[i]; assumes 0/1 labels).
        chosen_sentence_index = np.argmax(probs, axis=0)[y_train[i]]
        chosen_sentences[i] = sentences[chosen_sentence_index]
    

0.8752099832013439
0.8658346333853354
0.8693147725909036


In [189]:
# Every training review should have exactly one chosen sentence.
len(chosen_sentences)

25000

In [190]:
# Show the first three training reviews, each followed by its chosen sentence.
for i in range(3):
    print(X_train_corp[i])
    print()
    print(chosen_sentences[i])
    print("\n\n")

silent night, deadly night 5 is the very last of the series, and like part 4, it is unrelated to the first three except by title and the fact that it is a christmas-themed horror flick.except to the oblivious, there is some obvious thing going on here...mickey rooney plays a toymaker named joe petto and his creepy son's name is pino. ring a bell, anyone? now, a little boy named derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him. even though it said "do not open till christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself. inside is a little red ball that sprouts santa arm and a head, and proceed to kill dad. oop, maybe he should have left well-enough alone. of course derek is then traumatized by the incident since he watched it from the stair, but he does not grow up to be some killer santa, he just stops talking.there is a mysterious stranger lurking around, who

In [191]:
# Build the corpus of chosen sentences ordered by review index.
# `chosen_sentences` was filled fold by fold, so its insertion order (what
# `.values()` returns) follows the shuffled folds rather than 0..N-1. Sorting by
# key keeps row k of the resulting corpus aligned with X_train_corp[k] / y_train[k].
chosen_corpus = [chosen_sentences[doc_index] for doc_index in sorted(chosen_sentences)]

In [192]:
# TF-IDF vectors of the chosen sentences, using the already-fitted vocabulary.
chosen_X = vectorizer.transform(chosen_corpus)

In [193]:
# Display the matrix summary (shape, dtype, number of stored elements).
chosen_X

<25000x25962 sparse matrix of type '<class 'numpy.float64'>'
	with 253818 stored elements in Compressed Sparse Row format>

In [194]:
from sklearn.svm import OneClassSVM

In [233]:
from sklearn.metrics.pairwise import cosine_similarity

In [251]:
# One-class SVM that will be fitted on the chosen sentences (no labels involved).
# TODO: cosine similarity is the kernel we ultimately want; a linear kernel is
# used for now so that the learned per-term weights can be inspected.
svm = OneClassSVM(kernel='linear')
#svm = OneClassSVM(kernel=cosine_similarity)

In [252]:
# Fit on the chosen-sentence vectors only.
svm.fit(chosen_X)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto',
      kernel='linear', max_iter=-1, nu=0.5, random_state=None,
      shrinking=True, tol=0.001, verbose=False)

In [253]:
# coef_ is converted from its sparse form to a dense matrix and flattened (.A1)
# into a 1-D array with one weight per vocabulary term.
svm_weights = svm.coef_.todense().A1

In [254]:
# Term indices sorted by ascending absolute weight; the largest come last.
top_weights = np.argsort(np.abs(svm_weights))

In [255]:
# Print the 20 terms with the largest absolute SVM weight, smallest of them first.
strongest = top_weights[-20:]
for term, weight in zip(terms[strongest], svm_weights[strongest]):
    print("%s\t%0.2f" %(term, weight))

avoid	55.16
did	55.16
like	55.92
perfect	58.16
good	59.15
wonderful	59.37
love	59.60
awful	59.95
character	60.27
story	60.75
performance	60.90
time	71.79
excellent	72.04
just	72.64
worst	74.12
bad	76.30
best	79.40
great	92.41
movie	127.81
film	132.93


In [256]:
# Score every sentence of one test review (index 9) with the one-class SVM;
# a larger decision value means the sentence looks more like the chosen
# training sentences.
i=9
sentences = TextBlob(X_test_corp[i]).raw_sentences
x=vectorizer.transform(sentences)
svm.decision_function(x)

array([[-32.58501325],
       [ -7.47050665],
       [ 41.08405962],
       [  9.05894138],
       [ -4.25182756],
       [-29.51801648],
       [-46.83369572],
       [ 31.04644405],
       [-13.42723929]])

In [257]:
# Index of the sentence with the highest SVM decision value (argmax works on
# the flattened array, so a column-vector result is fine).
j=np.argmax(svm.decision_function(x))
j

2

In [258]:
# The sentence the SVM picked for this review.
sentences[j]

'this is my all-time, number one favorite movie.'

In [259]:
# Let's see what the accuracy on the test data is

In [260]:
# NOTE: `clf` was last assigned inside the K-fold loop, so this scores the
# classifier of the final fold (fitted on a subset of the training reviews),
# not the model that was fitted on all of X_train earlier in the notebook.
clf.score(X_test, y_test)

0.8732

In [261]:
# Reduce every test review to the single sentence that the one-class SVM
# scores highest; no test labels are used for this selection.
chosen_test_sentences = []
for i in range(len(X_test_corp)):
    sentences = TextBlob(X_test_corp[i]).raw_sentences
    df = svm.decision_function(vectorizer.transform(sentences))
    chosen_test_sentences.append(sentences[np.argmax(df)])

In [262]:
# Spot-check the sentence chosen for the first test review.
chosen_test_sentences[0]

'this was a great show.'

In [263]:
# Vectorise the chosen test sentences: one row per test review.
# NOTE: this rebinds `x`, which previously held the sentences of a single review.
x=vectorizer.transform(chosen_test_sentences)

In [264]:
# (number of test reviews, vocabulary size)
x.shape

(25000, 25962)

In [265]:
# Accuracy when each test review is represented only by its SVM-chosen sentence.
clf.score(x, y_test)

0.76352

This is pretty low. Let's see what kind of mistakes this model is making.

In [266]:
# Predicted labels for the chosen test sentences.
y_pred = clf.predict(x)

In [267]:
# Positions of the test reviews whose chosen sentence was misclassified.
incorrect = np.flatnonzero(y_pred != y_test)

In [268]:
# Number of misclassified test reviews.
len(incorrect)

5912

In [269]:
# Inspect one misclassified test review sentence by sentence: print each
# sentence, its one-class-SVM decision value, and the classifier's class
# probabilities for it.
i = incorrect[4]
sentences = TextBlob(X_test_corp[i]).raw_sentences
x = vectorizer.transform(sentences)
df = svm.decision_function(x)
probs = clf.predict_proba(x)
for sentence, svm_score, class_probs in zip(sentences, df, probs):
    print(sentence)
    print(svm_score)
    print(class_probs)
    print()

i could not even...i mean...look....okay...wow.not even a bunch of my drunk friends trying to make fun of the movie could enjoy themselves in the least bit.i can only think...how.
[22.47570404]
[0.44601375 0.55398625]

how do independent film maker everywhere go year without getting noticed (or even their life) and con-artist like the guy who made this get a dvd on a shelf?
[4.86495874]
[0.28944889 0.71055111]

it seriously looks as if some guy with a home movie camera went out with some guy he met at subway and made the worst thing he could think of.
[17.82602473]
[0.99646738 0.00353262]

"hey guy, give me some idea.
[-27.95370138]
[0.79834739 0.20165261]

stumrt with a corn-field and work backward."
[-43.34152951]
[0.4746959 0.5253041]

"well, you have gottum have actor straight out of high school, and some oken corn stumlks with shred of clothing attumched.
[-37.16101617]
[0.54276178 0.45723822]

and boobs."
[-54.09431169]
[0.4746959 0.5253041]

thank, guy, i am sure that you and wi

Let's see if things improve when we use human-provided phrases as the vocabulary.

In [281]:
# Collect the human-provided phrases: the first tab-separated field of every
# line of every .txt file in the phrases directory.
human_vocab = []
# sorted(): glob returns files in filesystem-dependent order, and the order of
# the phrases determines the feature-column order once they are used as a
# vectorizer vocabulary -- sort for reproducibility.
filelist = sorted(glob.glob(os.path.join("../data/human-provided-phrases", "*.txt")))
for file in filelist:
    with open(file) as f:
        for line in f:
            # Drop the line terminator first, so a line without a tab does not
            # yield a phrase that ends in "\n".
            phrase = line.rstrip('\r\n').split('\t')[0]
            # Skip blank lines / empty first fields instead of adding "".
            if phrase:
                human_vocab.append(phrase)

In [282]:
# Display the collected phrases (the full list is printed).
human_vocab

['2/10',
 '3/10',
 '7/10',
 '4/10',
 '8/10',
 '1/10',
 'unwatchable',
 'incoherent',
 'stinker',
 'mst3k',
 'unfunny',
 'waste',
 '9/10',
 'flawless',
 'atrocious',
 'pointless',
 'horrid',
 'superbly',
 'redeeming',
 '10/10',
 'laughable',
 'drivel',
 'worst',
 'perfection',
 'lousy',
 'awful',
 'wasting',
 'remotely',
 'poorly',
 'sucks',
 'captures',
 'wonderfully',
 'existent',
 'lame',
 'boredom',
 'uninspired',
 'miserably',
 'refreshing',
 'amateurish',
 'unintentional',
 'pathetic',
 'eathtaking',
 'appalling',
 'uninteresting',
 'unconvincing',
 'suck',
 'delightful',
 'idiotic',
 'wasted',
 'beautifully',
 'underrated',
 'crap',
 'stupidity',
 'dreadful',
 'tedious',
 'sadness',
 'horrible',
 'insulting',
 'dire',
 'mess',
 'superb',
 'gripping',
 'garbage',
 'timeless',
 'embarrassing',
 'badly',
 'insult',
 'terrible',
 'wooden',
 'touching',
 'worse',
 'cardboard',
 'unforgettable',
 'extraordinary',
 'inept',
 'stupid',
 'pile',
 'worthless',
 'ashamed',
 'junk',
 'illian

In [307]:
# Re-create the vectorizer with a fixed vocabulary made of the human-provided
# phrases; n-grams of up to 5 tokens are extracted, presumably so that
# multi-word phrases can match.
# NOTE(review): this rebinds `vectorizer`, so the earlier full-vocabulary
# vectorizer is no longer available to cells run after this one.
# NOTE(review): per the scikit-learn documentation, min_df is ignored when an
# explicit vocabulary is supplied -- confirm and consider dropping it.
vectorizer = TfidfVectorizer(lowercase=True, min_df=5, ngram_range=(1,5), vocabulary=human_vocab, token_pattern=tp, norm='l2')

In [308]:
# Re-vectorise both corpora with the phrase vocabulary.
# NOTE: this overwrites the full-vocabulary X_train / X_test created earlier.
X_train = vectorizer.fit_transform(X_train_corp)
X_test = vectorizer.transform(X_test_corp)

In [309]:
# (number of training reviews, number of human-provided phrases)
X_train.shape

(25000, 441)

In [318]:
# L1-regularised logistic regression on the phrase-only features; the last
# expression reports the accuracy on the training set.
# solver='liblinear' is explicit because the default solver of
# scikit-learn >= 0.22 ('lbfgs') does not support penalty='l1'; 'liblinear' was
# the default on older versions, so results there are unchanged.
clf = LogisticRegression(penalty='l1', C=1, solver='liblinear')
clf.fit(X_train, y_train)
# Per-phrase coefficients of the single decision function (first row of coef_).
weights = clf.coef_[0]
clf.score(X_train, y_train)

0.84532

In [319]:
# Test-set accuracy of the phrase-only model.
clf.score(X_test, y_test)

0.83624