In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from textblob import TextBlob

In [2]:
def open_pickle(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    The file is opened in binary mode and is closed again before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# Whole-corpus dataset: train/test inputs (X) and labels (y), loaded in that order.
X_train_original, X_test_original, y_train_original, y_test_original = (
    open_pickle(pickle_path)
    for pickle_path in (
        '../data/imdb/imdb_original_preprocessed_xtrain.pickle',
        '../data/imdb/imdb_original_preprocessed_xtest.pickle',
        '../data/imdb/imdb_original_preprocessed_ytrain.pickle',
        '../data/imdb/imdb_original_preprocessed_ytest.pickle',
    )
)

# Sentence-level [+/-] dataset, same train/test and X/y layout.
X_train_np_sentence, X_test_np_sentence, y_train_np_sentence, y_test_np_sentence = (
    open_pickle(pickle_path)
    for pickle_path in (
        '../data/imdb-sentence/imdb_sentence_np_xtrain.pickle',
        '../data/imdb-sentence/imdb_sentence_np_xtest.pickle',
        '../data/imdb-sentence/imdb_sentence_np_ytrain.pickle',
        '../data/imdb-sentence/imdb_sentence_np_ytest.pickle',
    )
)

In [26]:
# Binary bag-of-words vectorizer. Its vocabulary is fitted on the whole IMDB
# training corpus below and then reused unchanged for every other text.
# Open question kept from the original notes: an earlier version fitted the
# vocabulary on the rel/unrel dataset (because that step was trained first?).
# Is there any advantage to having more vocabulary?

# Tokens are runs of word characters, apostrophes and slashes.
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# whole imdb corpus (the only place the vocabulary is fitted)
X_train_original_bow = tf_vectorizer.fit_transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# positive/negative sentence (transform only, same vocabulary)
X_train_np_bow = tf_vectorizer.transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and new versions.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    words = tf_vectorizer.get_feature_names_out()
else:
    words = tf_vectorizer.get_feature_names()
# dtype=str keeps the array a unicode array in both cases
# (get_feature_names_out() returns an object array).
words = np.array(words, dtype=str)

In [27]:
# Initial variable
random_state = 42
C = 1

# Document-level classifier fitted on the whole-corpus bag-of-words features.
# The solver is pinned to liblinear, which is what the recorded output of this
# cell reports. Newer scikit-learn releases default to lbfgs, which would give
# slightly different probabilities and may not converge within max_iter=100.
clf_i = LogisticRegression(random_state=random_state, C=C, solver='liblinear')
clf_i.fit(X_train_original_bow, y_train_original)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Approach

In [102]:
idx = 4  # position in X_train_original of the document examined in this walkthrough

# The same text is stacked twice, which yields a 2-row array.
# NOTE(review): presumably only to obtain an array-like input for the vectorizer;
# a single-element list would serve the same purpose -- confirm before simplifying.
document = np.vstack([X_train_original[idx], X_train_original[idx]])
# Split the document into its raw sentence strings with TextBlob.
sentences = TextBlob(X_train_original[idx]).raw_sentences
# " ".join(sentences)

In [47]:
# Collapse the stacked array to one dimension; it is later passed to
# tf_vectorizer.transform as a sequence of texts.
document = document.flatten()
len(document)

2

In [28]:
len(sentences)

27

In [48]:
# Vectorize with the already-fitted vocabulary (transform only, no refit).
# x_doc: one row per entry of `document`; x: one row per sentence.
x_doc = tf_vectorizer.transform(document)
x = tf_vectorizer.transform(sentences)

In [49]:
x.shape

(27, 26266)

In [50]:
x_doc.shape

(2, 26266)

In [96]:
# Convert the sparse sentence matrix to a dense one.
# NOTE(review): rebinding `x` makes this cell non-idempotent -- a second run
# calls .todense() on the already-converted result instead of the sparse matrix.
x = x.todense()

In [97]:
x.shape

(26, 26266)

In [98]:
# Class probabilities of the full document. x_doc was built from the same text
# stacked twice, so its first row is sufficient.
prob_ = clf_i.predict_proba(x_doc)
prob = prob_[0]

# Leave-one-out variants: entry i is the document with sentence i removed.
# The range is taken from `sentences` itself rather than from x.shape[0]:
# `x` is a separately maintained matrix, and when the two drift apart (cells
# re-run out of order) the old bound either skipped trailing sentences or
# indexed past the end of the list.
sent = [" ".join(np.delete(sentences, i, 0)) for i in range(len(sentences))]

In [117]:
# Column index of the class predicted for the full document: 0 only when its
# probability is strictly larger, otherwise 1 (so a tie resolves to 1).
base = 0 if prob[0] > prob[1] else 1

In [99]:
len(sent)

26

In [104]:
# Placeholder only; prob_drop is reassigned from the probability arrays in a later cell.
prob_drop = []

# Vectorize every leave-one-out text in `sent` with the already-fitted vocabulary.
x_drop = tf_vectorizer.transform(sent)
x_drop.shape

(26, 26266)

In [105]:
# Class probabilities for each leave-one-out text: one row per row of x_drop.
probs = clf_i.predict_proba(x_drop)

In [111]:
# Element-wise absolute difference between each row of `probs` and the
# full-document probabilities `prob` (broadcast across the rows).
prob_drop = np.absolute(probs - prob)

In [118]:
# Absolute change in the probability of the predicted class (column `base`)
# for each leave-one-out text. Because the magnitude is taken, an increase in
# confidence counts the same as a decrease.
drops = np.absolute(probs[:,base] - prob[base])

In [154]:
prob

array([0.02318392, 0.97681608])

In [156]:
prob[base]

0.9768160808833599

In [157]:
probs

array([[0.05058436, 0.94941564],
       [0.02067961, 0.97932039],
       [0.07779213, 0.92220787],
       [0.01647496, 0.98352504],
       [0.00967595, 0.99032405],
       [0.12177675, 0.87822325],
       [0.0498918 , 0.9501082 ]])

In [158]:
probs[:,base]

array([0.94941564, 0.97932039, 0.92220787, 0.98352504, 0.99032405,
       0.87822325, 0.9501082 ])

In [120]:
np.argmax(drops)

12

In [121]:
sentences[12]

'he only occasionally gives outburst after quietly putting up with more than most could stand under such circumstance.'

Gotcha!

In [184]:
def _extreme_influence_sentences(doc, vectorizer, classifier):
    """Return the sentences whose removal changes the prediction most and least.

    The document is split into sentences with TextBlob. The full document and
    every leave-one-out variant (the document with one sentence removed) are
    scored in a single batch, and each sentence is rated by the absolute change
    in the probability of the class predicted for the full document.

    Parameters
    ----------
    doc : str
        Text of one document.
    vectorizer : object
        Fitted vectorizer exposing ``transform``.
    classifier : object
        Fitted two-class classifier exposing ``predict_proba``.

    Returns
    -------
    tuple of (str, str) or None
        ``(largest_change_sentence, smallest_change_sentence)``, or ``None``
        when TextBlob finds no sentence in ``doc``.
    """
    sentences = list(TextBlob(doc).raw_sentences)
    if not sentences:
        return None

    # Row 0 is the full document; row k + 1 is the document without sentence k.
    variants = [doc]
    variants.extend(
        " ".join(sentences[:k] + sentences[k + 1:]) for k in range(len(sentences))
    )
    all_probs = classifier.predict_proba(vectorizer.transform(variants))
    full_prob, ablated_probs = all_probs[0], all_probs[1:]

    # Class predicted for the full document (a tie resolves to class 1).
    predicted = 0 if full_prob[0] > full_prob[1] else 1

    # NOTE(review): the magnitude of the change is used, so a sentence whose
    # removal *raises* confidence can also be selected as the largest change.
    # Confirm that this is intended before switching to a signed drop.
    changes = np.absolute(ablated_probs[:, predicted] - full_prob[predicted])
    return sentences[np.argmax(changes)], sentences[np.argmin(changes)]


chosen_sentence = []
disagree = []

# The helper keeps its intermediates local, so this loop no longer overwrites
# the notebook-level scratch variables (x, probs, prob, base, drops, sentences)
# used by the exploratory cells, and no loop index is shadowed.
for doc in X_train_original:
    picked = _extreme_influence_sentences(doc, tf_vectorizer, clf_i)
    if picked is None:
        # Nothing to rank. The previous version raised on np.argmax of an
        # empty array for such a document.
        continue
    # NOTE(review): a single-sentence document contributes that same sentence
    # to both lists, i.e. identical text with both labels downstream.
    largest_change, smallest_change = picked
    chosen_sentence.append(largest_change)
    disagree.append(smallest_change)

In [189]:
chosen_sentence[:10]

["we even see him buying a bunch when derek's mom takes him to the store to find a gift for him to ing him out of his trauma.",
 'the producer has produced a short film of excellent quality that cannot be compared to any other short film that i have seen.',
 'for me, this movie just seemed to fall on its face.',
 "we have high-tech international terrorists/criminal who bicker like pre-school kid, stallone's man-of-steel-type resilience toward ice-cold weather, dialog so dumb that it is sometimes almost hilarious, and so on.",
 'the screenplay manages to do what the best of this type of movie does: give factual event and place them meaningfully inside a dramatic framework that makes you feel like you know the person *behind* the facts.9/10 star',
 'poor!',
 '1st watched 12/7/2002 - 3/10(dir-steve purcell): typical mary kate and ashley fare with a few more kiss.',
 'azaria (the baby) was never seen again, and the result of her horrendous disappearance caused a true life frenzy all around

In [190]:
disagree[:10]

['4 out of 5.',
 'i have rated this film at the highest possible rating.',
 'he is a good guy, and that is always obvious in his performance.',
 'nothing.',
 'rea makes us feel for this man.',
 'this is one of the film we watched.',
 'we will see what happens then.',
 'meryl streep does immaculate justice to the role of lindy, as she always does.',
 'skippy from "family ty" play eddie, a wussy \'metal\' nerd who gets picked on.',
 'mr perlman gives a standout performance (as usual).']

In [171]:
len(chosen_sentence)

25000

In [170]:
len(disagree)

25000

In [173]:
# One array holding every selected sentence: chosen_sentence first, then disagree.
all_ = np.asarray(chosen_sentence + disagree)

In [174]:
# Labels: 1.0 for each chosen_sentence entry followed by 0.0 for each disagree entry.
y = np.concatenate([np.ones(len(chosen_sentence)), np.zeros(len(disagree))])

In [175]:
y.shape

(50000,)

In [179]:
# Sentence-level classifier over the selected sentences, vectorized with the
# vocabulary fitted on the whole corpus.
# The solver is pinned to liblinear, which is what the recorded output of this
# cell reports. Newer scikit-learn releases default to lbfgs and would not
# reproduce the same fit.
clf = LogisticRegression(random_state=42, C=1, solver='liblinear')
x = tf_vectorizer.transform(all_)
clf.fit(x,y)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [180]:
# NOTE(review): x and y are the same data clf was fitted on, so this is the
# training accuracy and not an estimate of performance on unseen sentences.
clf.score(x,y)

0.87552