In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob

In [2]:
# open_pickle

def open_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

### Load data

In [3]:
# Sentence-level dataset labelled relevant / not relevant ([relevant, -relevant]).
# Each split is a separate pickle, read with open_pickle().

X_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtrain.pickle')
X_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtest.pickle')
y_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytrain.pickle')
y_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytest.pickle')

# Whole-corpus dataset (file names say "original_preprocessed"): one entry per full text
# rather than per sentence -- presumably complete IMDB reviews; verify against the data.

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

# Sentence-level dataset labelled positive / negative ([+/-]); "np" in the file names
# presumably stands for negative/positive.

X_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtrain.pickle')
X_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtest.pickle')
y_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytrain.pickle')
y_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytest.pickle')

### Preprocessing

In [4]:
# Binary bag-of-words vectoriser whose vocabulary is fitted on the rel/unrel sentence
# training set only; every other dataset is merely transformed with that vocabulary.
# Open questions (from the original notes): why fit on rel/unrel -- because that
# classifier is trained first? Would a larger vocabulary bring any advantage?

# Tokens are runs of word characters, apostrophes and slashes.
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True,
                                token_pattern=token, ngram_range=(1, 1))

# rel/unrel sentence (the only call that fits the vocabulary)
X_train_sentence_bow = tf_vectorizer.fit_transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# whole imdb corpus
X_train_original_bow = tf_vectorizer.transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# neg/pos sentence
X_train_np_bow = tf_vectorizer.transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when present, but keep the old call so the cell still
# runs on the older release this notebook was first executed with.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    words = tf_vectorizer.get_feature_names_out().tolist()
else:
    words = tf_vectorizer.get_feature_names()
print(len(words))

546


### Train classifier A [rel, unrel]

In [5]:
# Okay... Using the function makes me more overwhelmed. Let's do it manually.


# Hyper-parameters; later cells reuse both names when building their own models.
C = 0.1
random_state = 42

# Classifier A: logistic regression on the rel/unrel sentence bag-of-words.
clf_A = LogisticRegression(C=C, random_state=random_state)
clf_A.fit(X_train_sentence_bow, y_train_sentence)

# Score both splits first, then keep the test predictions for the per-class report.
train_accuracy = clf_A.score(X_train_sentence_bow, y_train_sentence)
test_accuracy = clf_A.score(X_test_sentence_bow, y_test_sentence)
y_predict = clf_A.predict(X_test_sentence_bow)

print('--------------')
print('C=%.2f' % C)
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' % train_accuracy)
print('Test:\t%.5f ' % test_accuracy)

print(classification_report(y_test_sentence, y_predict))

--------------
C=0.10
--------------
Accuracy
Train:	0.82671 
Test:	0.75712 
             precision    recall  f1-score   support

        0.0       0.80      0.73      0.77       363
        1.0       0.71      0.79      0.75       304

avg / total       0.76      0.76      0.76       667



### Train classifier 1 [+, -]

In [6]:
# Two sentiment models with identical hyper-parameters but different training data.

# clf_1_i: trained on the whole-corpus bag-of-words.
clf_1_i = LogisticRegression(C=C, random_state=random_state).fit(
    X_train_original_bow, y_train_original)

# clf_1_j: trained on the [+/-] sentence bag-of-words.
clf_1_j = LogisticRegression(C=C, random_state=random_state).fit(
    X_train_np_bow, y_train_np_sentence)

# Bare expression so the cell still displays the fitted estimator, as before.
clf_1_j

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Baseline: clf_1_i (trained on the whole corpus) evaluated on the whole-corpus splits.

train_accuracy = clf_1_i.score(X_train_original_bow, y_train_original)
test_accuracy = clf_1_i.score(X_test_original_bow, y_test_original)
y_predict = clf_1_i.predict(X_test_original_bow)

print('--------------')
print('C=%.2f' % C)
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' % train_accuracy)
print('Test:\t%.5f ' % test_accuracy)

print(classification_report(y_test_original, y_predict))

--------------
C=0.10
--------------
Accuracy
Train:	0.84748 
Test:	0.84212 
             precision    recall  f1-score   support

          0       0.85      0.83      0.84     12500
          1       0.83      0.85      0.84     12500

avg / total       0.84      0.84      0.84     25000



In [8]:
# Baseline: clf_1_j (trained on [+/-] sentences) evaluated on the [+/-] sentence splits.

train_accuracy = clf_1_j.score(X_train_np_bow, y_train_np_sentence)
test_accuracy = clf_1_j.score(X_test_np_bow, y_test_np_sentence)
y_predict = clf_1_j.predict(X_test_np_bow)

print('--------------')
print('C=%.2f' % C)
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' % train_accuracy)
print('Test:\t%.5f ' % test_accuracy)

print(classification_report(y_test_np_sentence, y_predict))

--------------
C=0.10
--------------
Accuracy
Train:	0.84084 
Test:	0.72156 
             precision    recall  f1-score   support

        0.0       0.73      0.68      0.70       163
        1.0       0.71      0.76      0.74       171

avg / total       0.72      0.72      0.72       334



### Now the real deal

In [9]:
# Nested (two-stage) classifier: pick one sentence per document with clf_A, then classify it; -1 marks rejected documents

def nested_classifier(X_test_original, threshold):
    """Two-stage prediction with a reject option, one result per document.

    For each document:

    1. It is split into sentences with ``TextBlob(...).raw_sentences`` and the
       sentences are vectorised with the notebook-level ``tf_vectorizer``.
    2. ``clf_A.predict_proba`` scores every sentence; the sentence with the
       highest value in column 1 is selected (column 1 is presumably the
       "relevant" class -- verify against ``clf_A.classes_``).
    3. If that value is strictly greater than ``threshold``, the selected
       sentence is classified by both ``clf_1_i`` and ``clf_1_j``; otherwise
       the document is rejected and ``-1`` is recorded for both.

    Documents that yield no sentences at all are rejected as well (previously
    they raised, because there is nothing to take an argmax over).

    Parameters
    ----------
    X_test_original : iterable of str
        Raw documents to classify.
    threshold : float
        Minimum column-1 probability from ``clf_A`` required to accept a
        document.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Predictions derived from ``clf_1_i`` and ``clf_1_j`` respectively,
        aligned with the input order; rejected documents are ``-1``.
    """
    y_pred_i = []
    y_pred_j = []

    for corpus in X_test_original:
        # Break the document into sentences.
        sentences = TextBlob(corpus).raw_sentences
        if not sentences:
            # Nothing to score: treat as "no relevant sentence found".
            y_pred_i.append(-1)
            y_pred_j.append(-1)
            continue

        # Bag-of-words for every sentence of this document.
        sentence_set = tf_vectorizer.transform(sentences)

        # Stage 1: index of the sentence with the highest column-1 probability.
        y_A_proba = clf_A.predict_proba(sentence_set)
        mr = np.argmax(y_A_proba[:, 1])

        # Stage 2: classify that single sentence, or reject the document.
        if y_A_proba[mr, 1] > threshold:
            # argmax yields a column index; it equals the class label only when
            # the classifiers' classes_ are [0, 1] -- presumably the case here.
            y_pred_i.append(np.argmax(clf_1_i.predict_proba(sentence_set[mr])))
            y_pred_j.append(np.argmax(clf_1_j.predict_proba(sentence_set[mr])))
        else:
            y_pred_i.append(-1)
            y_pred_j.append(-1)

    return np.array(y_pred_i), np.array(y_pred_j)

In [10]:
def rejection_rate(y):
    """Return the fraction of entries in ``y`` that equal -1 (the reject marker).

    Parameters
    ----------
    y : array-like
        Predictions in which ``-1`` means "rejected". Plain lists are accepted;
        they are converted to an array first, because ``list == -1`` is a
        single ``False`` rather than an element-wise comparison.

    Returns
    -------
    float
        Number of ``-1`` entries divided by the number of entries, or ``nan``
        when ``y`` is empty (the rate is undefined).
    """
    y = np.asarray(y)
    if y.size == 0:
        return float('nan')
    return np.count_nonzero(y == -1) / y.size

def accuracy(y, y_pred):
    """Accuracy over the accepted predictions only.

    A prediction counts as accepted when it is 0 or 1; anything else (in this
    notebook: the reject marker -1) is left out of both the numerator and the
    denominator.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same shape as ``y``. Plain lists are accepted; both
        inputs are converted to arrays so the comparison is element-wise.

    Returns
    -------
    float
        Correct accepted predictions divided by the number of accepted
        predictions, or ``nan`` when nothing was accepted.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not have the same shape.
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    if y.shape != y_pred.shape:
        raise ValueError('y and y_pred must have the same shape, got %s and %s'
                         % (y.shape, y_pred.shape))

    accepted = (y_pred == 0) | (y_pred == 1)
    n_accepted = np.count_nonzero(accepted)
    if n_accepted == 0:
        # Everything was rejected: the ratio is undefined (avoids a 0/0 warning).
        return float('nan')
    return np.count_nonzero(accepted & (y_pred == y)) / n_accepted

In [11]:
# Thresholds to sweep: a coarse grid from np.arange(0.5, 1, 0.05) followed by a finer tail.
threshold = np.concatenate((np.arange(0.5, 1, 0.05), [0.96, 0.97, 0.98, 0.99]))

# Columns: threshold, number of rejected documents, rejection rate,
# then accuracy() of the clf_1_i-based and clf_1_j-based predictions.
print('t \t ~rel \t rr \t acc_i \t acc_j')
for t in threshold:
    y_pred_i, y_pred_j = nested_classifier(X_test_original, t)
    n_rejected = np.sum(y_pred_i == -1)
    rr = rejection_rate(y_pred_i)
    acc_i = accuracy(y_test_original, y_pred_i)
    acc_j = accuracy(y_test_original, y_pred_j)
    print('%.2f \t %d \t %.5f \t %.5f \t %.5f' % (t, n_rejected, rr, acc_i, acc_j))

t 	 ~rel 	 rr 	 acc_i 	 acc_j
0.50 	 525 	 0.02100 	 0.71632 	 0.64547
0.55 	 897 	 0.03588 	 0.71701 	 0.64556
0.60 	 1532 	 0.06128 	 0.71834 	 0.64594
0.65 	 2501 	 0.10004 	 0.71994 	 0.64732
0.70 	 4056 	 0.16224 	 0.72522 	 0.65011
0.75 	 6460 	 0.25840 	 0.73069 	 0.65372
0.80 	 9944 	 0.39776 	 0.73698 	 0.66020
0.85 	 14382 	 0.57528 	 0.75353 	 0.67065
0.90 	 19515 	 0.78060 	 0.77302 	 0.68642
0.95 	 23685 	 0.94740 	 0.81445 	 0.71255
0.96 	 24172 	 0.96688 	 0.82488 	 0.72947
0.97 	 24582 	 0.98328 	 0.82536 	 0.71770
0.98 	 24835 	 0.99340 	 0.82424 	 0.73333
0.99 	 24964 	 0.99856 	 0.83333 	 0.83333


In [12]:
# NOTE(review): disabled leftover export -- `cos_sim` is not defined in any cell shown
# above, so this line would fail if re-enabled as-is.
# np.savetxt('cos_sim.csv', np.around(cos_sim,2), delimiter=',')
# Echo the C value that was used for the LogisticRegression models above.
print(C)

0.1


Now, let's take a look at the words