In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob

In [2]:
# open_pickle

def open_pickle(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    # pickle can execute arbitrary code while loading; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

### Load data

In [3]:
# Each group below loads four pickles in the order
# X train, X test, y train, y test.

# Sentence dataset labelled [relevant, -relevant]
X_train_sentence, X_test_sentence, y_train_sentence, y_test_sentence = map(open_pickle, (
    '../data/imdb-sentence/imdb_sentence_xtrain.pickle',
    '../data/imdb-sentence/imdb_sentence_xtest.pickle',
    '../data/imdb-sentence/imdb_sentence_ytrain.pickle',
    '../data/imdb-sentence/imdb_sentence_ytest.pickle',
))

# Whole (preprocessed) IMDB corpus
X_train_original, X_test_original, y_train_original, y_test_original = map(open_pickle, (
    '../data/imdb/imdb_original_preprocessed_xtrain.pickle',
    '../data/imdb/imdb_original_preprocessed_xtest.pickle',
    '../data/imdb/imdb_original_preprocessed_ytrain.pickle',
    '../data/imdb/imdb_original_preprocessed_ytest.pickle',
))

# Sentence dataset labelled [+/-]
X_train_np_sentence, X_test_np_sentence, y_train_np_sentence, y_test_np_sentence = map(open_pickle, (
    '../data/imdb-sentence/imdb_sentence_np_xtrain.pickle',
    '../data/imdb-sentence/imdb_sentence_np_xtest.pickle',
    '../data/imdb-sentence/imdb_sentence_np_ytrain.pickle',
    '../data/imdb-sentence/imdb_sentence_np_ytest.pickle',
))

In [4]:
# Baseline: binary bag-of-words features whose vocabulary is learned from the
# training split of the whole review corpus.

# A token is a run of word characters, apostrophes and slashes.
token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(lowercase=True, min_df=5, binary=True, token_pattern=token)

X_tr_baseline = cv.fit_transform(X_train_original)
X_te_baseline = cv.transform(X_test_original)

# Vocabulary size. `vocabulary_` (term -> column index) is used instead of
# get_feature_names(), which was removed in scikit-learn 1.2; the count is the
# same and the attribute exists on every release.
print(len(cv.vocabulary_))

26266


In [5]:
# Shape of the baseline training matrix returned by cv.fit_transform.
print(X_tr_baseline.shape)

(25000, 26266)


In [6]:
# Baseline classifier on the full-vocabulary features.
# The solver is pinned to 'liblinear': the estimator repr recorded further down
# in this notebook shows that liblinear produced the stored results, while
# newer scikit-learn releases default to a different solver.
clf = LogisticRegression(random_state=42, C=0.01, solver='liblinear')

clf.fit(X_tr_baseline, y_train_original)


# Mean accuracy on the training split, then on the test split.
print(clf.score(X_tr_baseline, y_train_original))
print(clf.score(X_te_baseline, y_test_original))

0.90968
0.8794


### Approach

In [7]:
# Count Vectorizer fitted on the rel/unrel sentence dataset. The fitted
# vocabulary is then applied unchanged to every other dataset, so all of the
# matrices below share the same columns.
# TODO (open questions from the original notes): why fit on rel/unrel - because
# that classifier is trained as the first step? Is there any advantage to a
# larger vocabulary?

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True,
                                token_pattern=token, ngram_range=(1,1))

# rel/unrel sentence (the only call that fits the vocabulary)
X_train_sentence_bow = tf_vectorizer.fit_transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# whole imdb corpus
X_train_original_bow = tf_vectorizer.transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# neg/pos sentence
X_train_np_bow = tf_vectorizer.transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever exists and keep `words` a plain list.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    words = list(tf_vectorizer.get_feature_names_out())
else:
    words = tf_vectorizer.get_feature_names()
print(len(words))

546


In [8]:
# Baseline again, this time on the whole corpus encoded with the vocabulary
# that tf_vectorizer learned from the rel/unrel sentences.
# The solver is pinned to 'liblinear': the estimator repr recorded further down
# in this notebook shows that liblinear produced the stored results, while
# newer scikit-learn releases default to a different solver.

clf = LogisticRegression(random_state=42, C=0.01, solver='liblinear')

clf.fit(X_train_original_bow, y_train_original)

# Mean accuracy on the training split, then on the test split.
print(clf.score(X_train_original_bow, y_train_original))
print(clf.score(X_test_original_bow, y_test_original))

0.8416
0.83664


In [9]:
# Label-vector shapes for each dataset: heading, train split, test split.
print('corpus', y_train_original.shape, y_test_original.shape, sep='\n')

print('rel/unrel', y_train_sentence.shape, y_test_sentence.shape, sep='\n')

print('np sentence', y_train_np_sentence.shape, y_test_np_sentence.shape, sep='\n')

corpus
(25000,)
(25000,)
rel/unrel
(1333,)
(667,)
np sentence
(666,)
(334,)


### Train classifier A: [rel, unrel] sentences

In [10]:
# Train classifier A on the rel/unrel sentences, written out step by step
# rather than through a helper function.
# `random_state` and `C` are reused by the classifiers trained in later cells.

random_state = 42
C = 1

# The solver is pinned to 'liblinear': the estimator repr recorded further down
# in this notebook shows that liblinear produced the stored results, while
# newer scikit-learn releases default to a different solver.
clf_A = LogisticRegression(random_state=random_state, C=C, solver='liblinear')
clf_A.fit(X_train_sentence_bow, y_train_sentence)

y_predict = clf_A.predict(X_test_sentence_bow)

print('--------------')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_A.score(X_train_sentence_bow, y_train_sentence)))
print('Test:\t%.5f ' %(clf_A.score(X_test_sentence_bow, y_test_sentence)))

# Per-class precision / recall / f1 on the rel/unrel test sentences.
print(classification_report(y_test_sentence,y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.90623 
Test:	0.74813 
             precision    recall  f1-score   support

        0.0       0.79      0.73      0.76       363
        1.0       0.71      0.77      0.74       304

avg / total       0.75      0.75      0.75       667



### Train classifier 1: [+,-] (pos/neg)

In [11]:
# Two variants of the [+,-] classifier that share the same hyper-parameters.
# The solver is pinned to 'liblinear' (the solver shown in the estimator repr
# recorded for this cell) so that the stored results do not depend on the
# default solver of the installed scikit-learn release.

# variant i: trained on the whole corpus
clf_1_i = LogisticRegression(random_state=random_state, C=C, solver='liblinear')
clf_1_i.fit(X_train_original_bow, y_train_original)

# variant j: trained on the [+/-] sentences only
clf_1_j = LogisticRegression(random_state=random_state, C=C, solver='liblinear')
# Kept as the final expression so the notebook displays the fitted estimator.
clf_1_j.fit(X_train_np_bow, y_train_np_sentence)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Evaluate clf_1_i (the [+,-] classifier trained on the whole corpus) on the
# corpus train and test splits.

y_predict = clf_1_i.predict(X_test_original_bow)

print('--------------', 'C=%.2f' % C, '--------------', 'Accuracy', sep='\n')
print('Train:\t%.5f ' % clf_1_i.score(X_train_original_bow, y_train_original))
print('Test:\t%.5f ' % clf_1_i.score(X_test_original_bow, y_test_original))

# Per-class precision / recall / f1 on the corpus test split.
print(classification_report(y_test_original, y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.84776 
Test:	0.84204 
             precision    recall  f1-score   support

          0       0.85      0.83      0.84     12500
          1       0.83      0.86      0.84     12500

avg / total       0.84      0.84      0.84     25000



In [13]:
# Evaluate clf_1_j (the [+,-] classifier trained on the [+/-] sentences) on the
# [+/-] sentence train and test splits.

y_predict = clf_1_j.predict(X_test_np_bow)

print('--------------', 'C=%.2f' % C, '--------------', 'Accuracy', sep='\n')
print('Train:\t%.5f ' % clf_1_j.score(X_train_np_bow, y_train_np_sentence))
print('Test:\t%.5f ' % clf_1_j.score(X_test_np_bow, y_test_np_sentence))

# Per-class precision / recall / f1 on the [+/-] test sentences.
print(classification_report(y_test_np_sentence, y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.94144 
Test:	0.74551 
             precision    recall  f1-score   support

        0.0       0.76      0.69      0.73       163
        1.0       0.73      0.80      0.76       171

avg / total       0.75      0.75      0.74       334



In [14]:
# Document-level prediction with rejection.
#
# Every test review is split into sentences. clf_A scores each sentence, and
# only the sentences it accepts are passed to the two [+,-] classifiers
# (clf_1_i: trained on the whole corpus, clf_1_j: trained on the [+/-]
# sentences). A review with no accepted sentence receives the label -1, which
# the accuracy computation later excludes.

y_pred_i = []
y_pred_j = []

# Cut-off applied to column 1 of clf_A.predict_proba (treated here as the
# "relevant" class):
#   None  -> use only the single highest-scoring sentence, and only when its
#            probability exceeds 0.5
#   float -> use every sentence whose probability exceeds the cut-off and
#            average the [+,-] probabilities over those sentences
threshold=0.99

for corpus in X_test_original:
    # Break the review down into sentences.
    raw_sentences = TextBlob(corpus).raw_sentences
    if len(raw_sentences) == 0:
        # Nothing to classify: reject the review instead of handing an empty
        # matrix to predict_proba.
        y_pred_i.append(-1)
        y_pred_j.append(-1)
        continue

    # One bag-of-words row per sentence.
    sentence_set = tf_vectorizer.transform(raw_sentences)

    # Probability of class index 1 for every sentence (computed once and shared
    # by both threshold modes).
    relevance = clf_A.predict_proba(sentence_set)[:, 1]

    # `mr` holds the row indices of the sentences that are kept.
    if threshold is None:
        best = int(np.argmax(relevance))
        mr = np.array([best]) if relevance[best] > 0.5 else np.array([], dtype=int)
    else:
        mr = np.flatnonzero(relevance > threshold)

    if len(mr) == 0:
        y_pred_i.append(-1)
        y_pred_j.append(-1)
        continue

    # Average the class probabilities over the kept sentences and take the most
    # likely class; with a single kept sentence the average is that row itself.
    relevant_rows = sentence_set[mr]
    y_pred_i.append(np.argmax(np.mean(clf_1_i.predict_proba(relevant_rows), axis=0)))
    y_pred_j.append(np.argmax(np.mean(clf_1_j.predict_proba(relevant_rows), axis=0)))


y_pred_i = np.array(y_pred_i)
y_pred_j = np.array(y_pred_j)

print(y_pred_i)
print(y_pred_j)

[-1 -1 -1 ... -1 -1 -1]
[-1 -1 -1 ... -1 -1 -1]


In [17]:
# Summary row: threshold, number of rejected reviews, rejection rate, and the
# accuracy of clf_1_i and clf_1_j on the reviews that were not rejected.
# NOTE(review): rejection_rate() and accuracy() are defined in a cell located
# below this one, so on a fresh kernel that cell has to be run first.

print('t \t ~rel \t rr \t acc_i \t acc_j')
summary_row = (
    threshold,
    np.count_nonzero(y_pred_i == -1),
    rejection_rate(y_pred_i),
    accuracy(y_test_original, y_pred_i),
    accuracy(y_test_original, y_pred_j),
)
print('%.2f \t %d \t %.5f \t %.5f \t %.5f' % summary_row)

t 	 ~rel 	 rr 	 acc_i 	 acc_j
0.99 	 18894 	 0.75576 	 0.78431 	 0.71258


In [16]:
def rejection_rate(y):
    """Return the fraction of entries in ``y`` that are -1 (rejected).

    Parameters
    ----------
    y : array-like
        One-dimensional sequence of predicted labels, where -1 marks a
        rejected sample. Lists and tuples are accepted as well as arrays.

    Returns
    -------
    float
        Share of rejected samples in [0, 1], or NaN when ``y`` is empty.
    """
    # Without the conversion a plain list compares as a whole (`[...] == -1`
    # is simply False) and the rate would silently come out as 0.
    labels = np.asarray(y)
    if labels.size == 0:
        return np.nan
    return np.mean(labels == -1)

def accuracy(y, y_pred):
    """Accuracy computed only over the samples that were not rejected.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels with the same shape as ``y``; -1 marks a rejected
        sample, which is left out of both numerator and denominator.

    Returns
    -------
    float
        Fraction of non-rejected predictions that equal the true label, or
        NaN when every sample was rejected.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y)
    y_hat = np.asarray(y_pred)
    if y_true.shape != y_hat.shape:
        raise ValueError('y and y_pred must have the same shape, got %s and %s'
                         % (y_true.shape, y_hat.shape))

    # Anything that is not the rejection marker counts as a real prediction,
    # so the denominator does not depend on the labels being exactly 0 and 1.
    accepted = y_hat != -1
    if not accepted.any():
        return np.nan
    return np.mean(y_hat[accepted] == y_true[accepted])

In [None]:
# Write `cos_sim`, rounded to two decimals, to a comma-separated file.
# NOTE(review): `cos_sim` is not assigned anywhere in the visible part of this
# notebook and this cell has no execution count - define it before running
# this cell, or drop the cell if it is a leftover.
np.savetxt('cos_sim.csv', np.round(cos_sim, decimals=2), delimiter=',')