In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from textblob import TextBlob

In [2]:
def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load Data

In [3]:
#Load dataset of [whole corpus]

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

#Load dataset of sentence [+/-]

X_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtrain.pickle')
X_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtest.pickle')
y_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytrain.pickle')
y_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytest.pickle')

### Preprocessing

In [4]:
# Count Vectorizer on rel,unrel dataset
# Question : Why rel/unrel? Because it trained as the first step? 
# Any advantages on more vocabulary?

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# whole imdb corpus
X_train_original_bow = tf_vectorizer.fit_transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# positive/negative sentence
X_train_np_bow = tf_vectorizer.transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence) 

words = tf_vectorizer.get_feature_names()
words = np.array(words)

In [5]:
# Initial variable
random_state = 42
C = 1

# using whole corpus
clf_i = LogisticRegression(random_state=random_state, C=C)
clf_i.fit(X_train_original_bow, y_train_original)

# using the [+/-] sentence

clf_j = LogisticRegression(random_state=random_state, C=C)
clf_j.fit(X_train_np_bow, y_train_np_sentence)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Approach

Extraction

In [6]:
i=4
sentences = TextBlob(X_test_original[i]).raw_sentences


In [7]:
# for j in x[chosen_sentence_index].indices:
#     print("%s\t%0.3f" %(terms[j], weights[j]))

In [8]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)

In [9]:
chosen_sentences = {}

In [10]:
for train_index, test_index in kf.split(X_train_original_bow):
    clf = LogisticRegression(penalty='l1', C=1)
    clf.fit(X_train_original_bow[train_index], y_train_original[train_index])
    print(clf.score(X_train_original_bow[train_index], y_train_original[train_index]))
    print(clf.score(X_train_original_bow[test_index], y_train_original[test_index]))
    for i in test_index:
        sentences = TextBlob(X_train_original[i]).raw_sentences
        x = tf_vectorizer.transform(sentences)
        probs = clf.predict_proba(x)
        chosen_sentence_index = np.argmax(probs, axis=0)[y_train_original[i]]
        chosen_sentences[i] = sentences[chosen_sentence_index]
    #    sentences = X_train[i]
    #    for sent in sentences:
    #        v = vect.transform()

0.9826593063722548
0.8707703383729302
0.9852402951940962
0.8676347053882155
0.9842803143937121
0.8712348493939758


In [11]:
chosen_sentences

{0: 'whee.a step up from part 4, but not much of one.',
 3: "we have high-tech international terrorists/criminal who bicker like pre-school kid, stallone's man-of-steel-type resilience toward ice-cold weather, dialog so dumb that it is sometimes almost hilarious, and so on.",
 6: '1st watched 12/7/2002 - 3/10(dir-steve purcell): typical mary kate and ashley fare with a few more kiss.',
 17: 'he is played by david oyelowo, who is illiant at the part, coming across as a totally sympathetic person, although his only activity for five hour are loving and grieving, which he does superbly, so that one wants to comfort him, as he is so obviously a nice guy.',
 19: 'wb had to mess this up with some tripe batman of the future.',
 29: "the family continues to stay in the haunted house as thing get worse and worse and no mention of the monk is made until nearly the very end when he turns up again to do what he should have done an hour earlier--try to drive the spirit out of the house, although by

In [12]:
len(chosen_sentences)

25000

Relevant Sentence Model

In [13]:
from sklearn.svm import OneClassSVM
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
chosen_corpus=list(chosen_sentences.values())
# chosen_corpus

In [15]:
# count vectorize the chosen_corpus

svm_clf = OneClassSVM(kernel='rbf', random_state=42)

In [16]:
X = tf_vectorizer.transform(chosen_corpus)

In [17]:
X

<25000x26266 sparse matrix of type '<class 'numpy.int64'>'
	with 676323 stored elements in Compressed Sparse Row format>

In [18]:
svm_clf.fit(X)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.5, random_state=42, shrinking=True, tol=0.001,
      verbose=False)

In [19]:
# need to extract sentence for each doc
# and use decision_function (predict)

svm_clf.decision_function(X[:10])

array([[  3.89802165],
       [ -4.69473815],
       [  0.90259644],
       [ -6.62202232],
       [  3.55835032],
       [-28.07471014],
       [  1.37904705],
       [  2.87247895],
       [ -9.36394717],
       [ -0.42607689]])

In [20]:
j=np.max(svm_clf.decision_function(X[:10]))
j

3.898021654051263

In [22]:
chosen_sentence=[]

for ind, corpus in enumerate(X_test_original):
    '''
    Breakdown the corpus into sentence and transform into bag-of-words
    '''
    sentence_set = tf_vectorizer.transform(TextBlob(corpus).raw_sentences)

    '''
    related sentence classifier
    '''
    svm_decision = svm_clf.decision_function(sentence_set)
    mr = np.argmax(svm_decision)

    # no threshold?
    chosen_sentence.append(TextBlob(corpus).raw_sentences[mr])
    

In [23]:
len(chosen_sentence)

25000

In [24]:
X_chosen = tf_vectorizer.transform(chosen_sentence)

In [25]:
clf_i.score(X_chosen,y_test_original)

0.64752

In [26]:
clf_j.score(X_chosen,y_test_original)

0.62656