In [27]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# open_pickle

def open_pickle(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    Args:
        path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only point this at
        files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

### Load data

In [4]:
# Sentence-level dataset, labelled [relevant, -relevant].
# Files are read in the order: train features, test features,
# train labels, test labels.
(X_train_sentence,
 X_test_sentence,
 y_train_sentence,
 y_test_sentence) = [
    open_pickle(pickle_path)
    for pickle_path in (
        '../data/imdb-sentence/imdb_sentence_xtrain.pickle',
        '../data/imdb-sentence/imdb_sentence_xtest.pickle',
        '../data/imdb-sentence/imdb_sentence_ytrain.pickle',
        '../data/imdb-sentence/imdb_sentence_ytest.pickle',
    )
]

# Original (preprocessed) dataset, labelled [+/-]; same file order as above.
(X_train_original,
 X_test_original,
 y_train_original,
 y_test_original) = [
    open_pickle(pickle_path)
    for pickle_path in (
        '../data/imdb/imdb_original_preprocessed_xtrain.pickle',
        '../data/imdb/imdb_original_preprocessed_xtest.pickle',
        '../data/imdb/imdb_original_preprocessed_ytrain.pickle',
        '../data/imdb/imdb_original_preprocessed_ytest.pickle',
    )
]

In [28]:
# Baseline

# Baseline features: binary bag-of-words whose vocabulary is learned from
# the [+/-] training reviews.
# A token is a run of word characters that may also contain apostrophes and
# slashes, delimited by word boundaries.
token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(lowercase=True, min_df=5, binary=True, token_pattern=token)

# Fit on the training split only; the test split is projected onto the
# same vocabulary.
X_tr_baseline = cv.fit_transform(X_train_original)
X_te_baseline = cv.transform(X_test_original)

# Vocabulary size. `vocabulary_` maps every kept term to its column index,
# so its length equals the number of features. It is used instead of
# `get_feature_names()`, which newer scikit-learn releases no longer provide.
print(len(cv.vocabulary_))

26266


In [33]:
# Sanity check: print the shape of the baseline feature matrix, then the
# shape of the training labels (one per line).
print(X_tr_baseline.shape, y_train_original.shape, sep='\n')

(25000, 26266)
(25000,)


In [38]:
# Baseline [+/-] model: logistic regression on the baseline bag-of-words
# features. `LogisticRegression` must be imported in the notebook's import
# cell so that this cell runs on a fresh kernel.
clf = LogisticRegression(C=0.01, random_state=42)
clf.fit(X_tr_baseline, y_train_original)

# Mean accuracy on the training split, then on the test split.
print(
    clf.score(X_tr_baseline, y_train_original),
    clf.score(X_te_baseline, y_test_original),
    sep='\n',
)

0.90968
0.8794


### Approach

In [35]:
# Count Vectorizer on rel,unrel dataset
# Question: why fit the vocabulary on the rel/unrel dataset? Because that classifier is trained as the first step?
# Is there any advantage to having a larger vocabulary?



# Binary unigram bag-of-words; same token pattern as the baseline cell.
token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=1,
    binary=True,
    token_pattern=token,
    ngram_range=(1, 1),
)

# The vocabulary is learned from the sentence-level training set only.
X_train_sentence_bow = tf_vectorizer.fit_transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# The original reviews are projected onto that same vocabulary, so terms
# never seen in the sentence training set are dropped.
X_train_original_bow = tf_vectorizer.transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# Feature names as a list in column order. Sorting the vocabulary by column
# index avoids `get_feature_names()`, which newer scikit-learn releases no
# longer provide, while still producing a plain list of terms.
words = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)
print(len(words))

4532


In [37]:
# Again baseline

# Same logistic-regression baseline, but on the original reviews encoded
# with the sentence-derived vocabulary.
clf = LogisticRegression(C=0.01, random_state=42)
clf.fit(X_train_original_bow, y_train_original)

# Mean accuracy on the training split, then on the test split.
print(
    clf.score(X_train_original_bow, y_train_original),
    clf.score(X_test_original_bow, y_test_original),
    sep='\n',
)

0.88508
0.86832


In [6]:
# Label-vector shapes, one per line: sentence train/test, then original
# train/test.
print(
    y_train_sentence.shape,
    y_test_sentence.shape,
    y_train_original.shape,
    y_test_original.shape,
    sep='\n',
)

(1333,)
(667,)
(25000,)
(25000,)


### Train the [rel,unrel] classifier

In [26]:
# Okay... Using the function makes me more overwhelmed. Let's do it manually.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hyper-parameters; later cells reuse these names.
random_state = 42
C = 0.01

# Relevance classifier: fitted on the sentence-level bag-of-words features.
clf = LogisticRegression(C=C, random_state=random_state)
clf.fit(X_train_sentence_bow, y_train_sentence)

# Test-set predictions feed the per-class report printed last.
y_predict = clf.predict(X_test_sentence_bow)

# Header with the C value, then train and test accuracy, one item per line.
print(
    '--------------',
    'C=%.2f' % C,
    '--------------',
    'Accuracy',
    'Train:\t%.5f ' % clf.score(X_train_sentence_bow, y_train_sentence),
    'Test:\t%.5f ' % clf.score(X_test_sentence_bow, y_test_sentence),
    sep='\n',
)

print(classification_report(y_test_sentence, y_predict))

--------------
C=0.01
--------------
Accuracy
Train:	0.76819 
Test:	0.69415 
             precision    recall  f1-score   support

        0.0       0.77      0.63      0.69       363
        1.0       0.64      0.77      0.70       304

avg / total       0.71      0.69      0.69       667



### Train [+,-] classifier

In [41]:
# clf_a: [+,-] classifier trained on the whole review corpus, encoded with
# the sentence-derived vocabulary. It was previously fitted on the
# sentence-level data, which made it an exact duplicate of clf_b.
clf_a = LogisticRegression(random_state=random_state, C=C)
clf_a.fit(X_train_original_bow, y_train_original)

# clf_b: meant to be trained on the [+/-] sentences.
# NOTE(review): this still fits on the sentence dataset, which the loading
# cell describes as [relevant, -relevant], so it is the same model as the
# rel/unrel classifier. The intended [+/-] sentence features and labels are
# not defined in the visible cells -- TODO confirm and swap in the correct
# training data.
clf_b = LogisticRegression(random_state=random_state, C=C)
clf_b.fit(X_train_sentence_bow, y_train_sentence)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Test

In [12]:
# Build an accuracy function excluding those -1

