In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob

In [2]:
# open_pickle

def open_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    NOTE: unpickling can execute arbitrary code, so only point this at
    files you trust.
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

### Load data

In [3]:
# Sentence-level dataset, labelled [relevant, -relevant].
# Files are listed in the order: X train, X test, y train, y test.
sentence_files = (
    '../data/imdb-sentence/imdb_sentence_xtrain.pickle',
    '../data/imdb-sentence/imdb_sentence_xtest.pickle',
    '../data/imdb-sentence/imdb_sentence_ytrain.pickle',
    '../data/imdb-sentence/imdb_sentence_ytest.pickle',
)
X_train_sentence, X_test_sentence, y_train_sentence, y_test_sentence = map(
    open_pickle, sentence_files
)

# Document-level (whole review) dataset, labelled [+/-]; same file order.
original_files = (
    '../data/imdb/imdb_original_preprocessed_xtrain.pickle',
    '../data/imdb/imdb_original_preprocessed_xtest.pickle',
    '../data/imdb/imdb_original_preprocessed_ytrain.pickle',
    '../data/imdb/imdb_original_preprocessed_ytest.pickle',
)
X_train_original, X_test_original, y_train_original, y_test_original = map(
    open_pickle, original_files
)

In [4]:
# Baseline: binary bag-of-words features whose vocabulary is learned from the
# full [+/-] review corpus.

# A token is a run of word characters, apostrophes and slashes delimited by
# word boundaries.
token = r"(?u)\b[\w\'/]+\b"
# binary=True records presence/absence instead of counts; min_df=5 drops
# terms that occur in fewer than 5 training documents.
cv = CountVectorizer(lowercase=True, min_df=5, binary=True, token_pattern=token)

X_tr_baseline = cv.fit_transform(X_train_original)
X_te_baseline = cv.transform(X_test_original)

# Vocabulary size. `get_feature_names()` was removed in scikit-learn 1.2;
# the fitted `vocabulary_` mapping holds exactly one entry per feature and is
# available in every release, so its length is the same number.
print(len(cv.vocabulary_))

26266


In [5]:
# Shape of the baseline training matrix: one row per training review, one
# column per vocabulary term kept by `cv`.
print(X_tr_baseline.shape)

(25000, 26266)


In [6]:
# Baseline sentiment model: regularised logistic regression (C=0.01) on the
# full-vocabulary bag-of-words features.
clf = LogisticRegression(C=0.01, random_state=42).fit(X_tr_baseline, y_train_original)

# Mean accuracy, reported for the train split first and the test split second.
for split_features, split_labels in (
    (X_tr_baseline, y_train_original),
    (X_te_baseline, y_test_original),
):
    print(clf.score(split_features, split_labels))

0.90968
0.8794


### Approach

In [None]:
# Bag-of-words vectorizer whose vocabulary is learned from the
# [rel, unrel] sentence dataset and then reused for the full reviews, so both
# classifiers below operate in the same feature space.
# Open questions (kept from the original notes):
#   - Why fit on rel/unrel? Because it is trained as the first step?
#   - Any advantage to a larger vocabulary?

# A token is a run of word characters, apostrophes and slashes delimited by
# word boundaries.
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=5,
    binary=True,
    token_pattern=token,
    ngram_range=(1, 1),  # unigrams only (previously applied via set_params)
)

X_train_sentence_bow = tf_vectorizer.fit_transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# Project the full reviews onto the sentence-derived vocabulary.
X_train_original_bow = tf_vectorizer.transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# Feature names ordered by column index. `get_feature_names()` was removed in
# scikit-learn 1.2; sorting the fitted `vocabulary_` (term -> column index) by
# its values yields the same list on every release.
words = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)
print(len(words))

546


In [None]:
# Baseline again, this time on the features restricted to the vocabulary
# learned from the sentence dataset.
clf = LogisticRegression(C=0.01, random_state=42).fit(
    X_train_original_bow, y_train_original
)

# Mean accuracy: train split first, test split second.
for bow_features, bow_labels in (
    (X_train_original_bow, y_train_original),
    (X_test_original_bow, y_test_original),
):
    print(clf.score(bow_features, bow_labels))

In [None]:
# Shapes of the four label arrays: sentence train/test, then review train/test.
for label_array in (y_train_sentence, y_test_sentence, y_train_original, y_test_original):
    print(label_array.shape)

### Train the [rel,unrel] classifier

In [None]:
# Stage A: the [rel, unrel] sentence classifier, trained step by step rather
# than through a helper function.

# Hyper-parameters; also reused by the [+,-] classifier cell further down.
random_state = 42
C = 0.01

clf_A = LogisticRegression(C=C, random_state=random_state).fit(
    X_train_sentence_bow, y_train_sentence
)
y_predict = clf_A.predict(X_test_sentence_bow)

train_accuracy = clf_A.score(X_train_sentence_bow, y_train_sentence)
test_accuracy = clf_A.score(X_test_sentence_bow, y_test_sentence)

separator = '--------------'
print(separator)
print('C=%.2f' % C)
print(separator)
print('Accuracy')
print('Train:\t%.5f ' % train_accuracy)
print('Test:\t%.5f ' % test_accuracy)

# Per-class precision / recall / F1 on the sentence test split.
print(classification_report(y_test_sentence, y_predict))

### Train [+,-] classifier

In [None]:
# Stage B: the [+,-] classifier, trained on whole reviews encoded with the
# sentence-derived vocabulary. Reuses `random_state` and `C` from the
# [rel,unrel] training cell.
clf_1 = LogisticRegression(C=C, random_state=random_state).fit(
    X_train_original_bow, y_train_original
)

# Alternative kept for reference (disabled): train on the sentence dataset.
# clf_2 = LogisticRegression(random_state=random_state, C=C)
# clf_2.fit(X_train_sentence_bow, y_train_sentence)

### Test on a single review

In [None]:
# Show how TextBlob splits training review #1 into sentences.
review_1_text = X_train_original[1]
TextBlob(review_1_text).raw_sentences

In [None]:
# Display training review #5, the example used in the cells that follow.
X_train_original[5]

In [None]:
# Show how TextBlob splits training review #5 into sentences.
review_5_text = X_train_original[5]
TextBlob(review_5_text).raw_sentences

In [None]:
# Encode each sentence of training review #5 as one bag-of-words row.
review_5_sentences = TextBlob(X_train_original[5]).raw_sentences
test_set = tf_vectorizer.transform(review_5_sentences)

In [None]:
# Print every sentence of review #5 whose probability for the second class
# of clf_A exceeds 0.5, as: index, sentence, probability.
# (presumably the second class is "related" -- confirm via clf_A.classes_)

# Tokenise the review once; the original re-ran TextBlob on the whole review
# for every printed sentence.
candidate_sentences = TextBlob(X_train_original[5]).raw_sentences
related_proba = clf_A.predict_proba(test_set)[:, 1]

for i, j in enumerate(related_proba):
    if j > 0.5:
        print(i, candidate_sentences[i], "\t{:.3f}".format(j))

In [None]:
# len(np.array(x).flatten())

In [None]:
# Keep the sentences of review #5 that clf_A scores above 0.5 on its second
# class, then print clf_1's [+/-] class probabilities for each of them.

# `x` is the tuple returned by np.where; x[0] holds the selected row indices.
x = np.where(clf_A.predict_proba(test_set)[:, 1] > 0.5)
test = test_set[x]

# Hoisted out of the loop: the original re-tokenised the review and rebuilt
# the flattened index array on every iteration.
source_sentences = TextBlob(X_train_original[5]).raw_sentences
selected_rows = x[0]

if test.shape[0] == 0:
    # predict_proba rejects a matrix with zero rows, so skip it explicitly.
    print('No sentence scored above 0.5')
else:
    for i, j in enumerate(clf_1.predict_proba(test)):
        print(i, source_sentences[selected_rows[i]], j)

In [None]:
# Two-stage document prediction:
#   1. clf_A picks the sentences of a review it considers related.
#   2. clf_1 assigns a [+/-] label from those sentences.
# A review with no related sentence gets the label -1.
# TODO: build an accuracy function that excludes those -1 entries.
# Assumes clf_A's labels are 0/1 with 1 meaning "related", and clf_1's are
# 0/1 with column 1 of predict_proba meaning "positive" -- confirm against
# clf_A.classes_ / clf_1.classes_.

y_pred = []
# None  -> use clf_A.predict() to decide which sentences are related.
# float -> a sentence is related when its class-1 probability exceeds it.
threshold = None
# True  -> label the review from its single most confidently related sentence.
# False -> compare the strongest positive and strongest negative evidence
#          over all related sentences.
highest_confidence_related = True
# Max negative / positive probability of the last review handled by the
# aggregation branch; initialised so later cells can read them even when that
# branch never runs.
mn = mp = None

for corpus in X_train_original[0:1]:
    # Break the review into sentences and encode them as bag-of-words rows.
    sentences = TextBlob(corpus).raw_sentences
    if len(sentences) == 0:
        # Nothing to classify (the classifiers reject zero-row input).
        y_pred.append(-1)
        continue
    sentence_set = tf_vectorizer.transform(sentences)

    # Stage 1: related / unrelated decision for every sentence.
    y_ind_proba = clf_A.predict_proba(sentence_set)
    if threshold is None:
        y_ind = clf_A.predict(sentence_set)
    else:
        y_ind = y_ind_proba[:, 1] > threshold

    if np.sum(y_ind) == 0:
        # No related sentence in this review.
        y_pred.append(-1)
        continue

    # Stage 2: [+/-] decision from the related sentence(s).
    if highest_confidence_related:
        indices = np.array([np.argmax(y_ind_proba[:, 1])])
        y = clf_1.predict(sentence_set[indices, :])
        # Store the scalar label, not the length-1 array, so y_pred stays a
        # flat list that can also hold -1 entries.
        y_pred.append(y[0])
    else:
        # y_ind is 1-D (one flag per sentence); the original indexed it as
        # 2-D, which raised IndexError.
        indices = np.flatnonzero(y_ind == 1)
        y_np_proba = clf_1.predict_proba(sentence_set[indices, :])
        # Column-wise maxima: strongest negative and strongest positive
        # probability over the related sentences. (axis=1 gave one value per
        # sentence and only unpacked when exactly two were selected.)
        mn, mp = np.max(y_np_proba, axis=0)
        y_neg_proba, y_pos_proba = mn, mp

        if y_pos_proba > y_neg_proba:
            y_pred.append(1)
        else:
            y_pred.append(0)

y_pred = np.array(y_pred)

In [None]:
# Debug peek at `mn`, which the prediction loop above sets to a probability
# only when highest_confidence_related is False.
mn

In [None]:
# Predicted labels collected by the loop above (-1 = no related sentence found).
y_pred

In [None]:
# True [+/-] labels for comparison with y_pred.
# NOTE(review): the prediction loop iterates over X_train_original[0:1], so
# this [1:5] slice does not line up with y_pred -- confirm the intended range.
y_train_original[1:5]