In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob

In [2]:
# open_pickle

def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load data

In [21]:
#Load dataset of [relevant,-relevant]

X_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtrain.pickle')
X_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtest.pickle')
y_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytrain.pickle')
y_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytest.pickle')

#Load dataset of [whole corpus]

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

#Load dataset of sentence [+/-]

X_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtrain.pickle')
X_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtest.pickle')
y_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytrain.pickle')
y_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytest.pickle')

In [23]:
print(len(X_train_np_sentence))
print(len(X_test_np_sentence))

666
334


In [4]:
# Baseline

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(lowercase=True, min_df=5, binary=True, token_pattern=token)

X_tr_baseline = cv.fit_transform(X_train_original)
X_te_baseline = cv.transform(X_test_original)

print(len(cv.get_feature_names()))

26266


In [5]:
print(X_tr_baseline.shape)

(25000, 26266)


In [6]:
clf = LogisticRegression(random_state=42, C=0.01)

clf.fit(X_tr_baseline, y_train_original)


print(clf.score(X_tr_baseline, y_train_original))
print(clf.score(X_te_baseline, y_test_original))

0.90968
0.8794


### Approach

In [25]:
# Count Vectorizer on rel,unrel dataset
# Question : Why rel/unrel? Because it trained as the first step? 
# Any advantages on more vocabulary?

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# rel/unrel sentence
X_train_sentence_bow = tf_vectorizer.fit_transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# whole imdb corpus
X_train_original_bow = tf_vectorizer.transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# neg/pos sentence
X_train_np_bow = tf_vectorizer.fit_transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence) 

words = tf_vectorizer.get_feature_names()
print(len(words))

291


In [28]:
# Again baseline

clf = LogisticRegression(random_state=42, C=0.01)

clf.fit(X_train_original_bow, y_train_original)

print(clf.score(X_train_original_bow, y_train_original))
print(clf.score(X_test_original_bow, y_test_original))

0.8416
0.83664


In [27]:
print('corpus')
print(y_train_original.shape)
print(y_test_original.shape)

print('rel/unrel')
print(y_train_sentence.shape)
print(y_test_sentence.shape)

print('np sentence')
print(y_train_np_sentence.shape)
print(y_test_np_sentence.shape)

corpus
(25000,)
(25000,)
rel/unrel
(1333,)
(667,)
np sentence
(666,)
(334,)


### Train A [rel,unrel] classifier

In [29]:
# Okay... Using the function makes me more overwhelmed. Let's do it manually.


random_state = 42
C = 0.01

clf_A = LogisticRegression(random_state=random_state, C=C)
clf_A.fit(X_train_sentence_bow, y_train_sentence)

y_predict = clf_A.predict(X_test_sentence_bow)

print('--------------')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_A.score(X_train_sentence_bow, y_train_sentence)))
print('Test:\t%.5f ' %(clf_A.score(X_test_sentence_bow, y_test_sentence)))
    
print(classification_report(y_test_sentence,y_predict))

--------------
C=0.01
--------------
Accuracy
Train:	0.73743 
Test:	0.69265 
             precision    recall  f1-score   support

        0.0       0.76      0.63      0.69       363
        1.0       0.63      0.77      0.69       304

avg / total       0.70      0.69      0.69       667



### Train 1 [+,-] classifier

In [11]:
# using whole corpus

clf_1 = LogisticRegression(random_state=random_state, C=C)
clf_1.fit(X_train_original_bow, y_train_original)

# using the [+/-] sentence
# clf_2 = LogisticRegression(random_state=random_state, C=C)
# clf_2.fit(X_train_sentence_bow, y_train_sentence)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Test on a

In [12]:
TextBlob(X_train_original[1]).raw_sentences

['the idea ia a very short film with a lot of information.',
 'interesting, entertaining and leaves the viewer wanting more.',
 'the producer has produced a short film of excellent quality that cannot be compared to any other short film that i have seen.',
 'i have rated this film at the highest possible rating.',
 'i also recommend that it is shown to office manager and business person in any establishment.',
 'what comes out of it is the fact that person with idea are never listened to, their voice is never heard.',
 'it is a lesson to be learned by any office that wants to go forward.',
 "i hope that the produced will produce a second part to this 'idea'.",
 'i look forward to viewing the sequence.',
 'once again congrat to halaqah medium in producing a film of excellence and quality with a lesson in mind.']

In [13]:
X_train_original[5]

"when i was a kid we always used to be babysat, and we always used to rent a film or see a film at the cinema. this is one of the film we watched. this is one of the stupidest film i have ever seen, i think it might even be a walt disney picture film! a martian is dropped on earth, turns into a human, befriends a human, and is trying everything he can to get back home. but he is distracted by the wonder of the earth. the only good comment i can give is the choice of actor, back to the future's christopher lloyd as the martian, uncle martin, dumb and dumber's jeff daniel as tim o'hara, elizabeth hurley as brace channing and daryl hannah as lizzie. but apart from that it is complete crap. poor!"

In [14]:
TextBlob(X_train_original[5]).raw_sentences

['when i was a kid we always used to be babysat, and we always used to rent a film or see a film at the cinema.',
 'this is one of the film we watched.',
 'this is one of the stupidest film i have ever seen, i think it might even be a walt disney picture film!',
 'a martian is dropped on earth, turns into a human, befriends a human, and is trying everything he can to get back home.',
 'but he is distracted by the wonder of the earth.',
 "the only good comment i can give is the choice of actor, back to the future's christopher lloyd as the martian, uncle martin, dumb and dumber's jeff daniel as tim o'hara, elizabeth hurley as brace channing and daryl hannah as lizzie.",
 'but apart from that it is complete crap.',
 'poor!']

In [15]:
test_set = tf_vectorizer.transform(TextBlob(X_train_original[5]).raw_sentences)

In [16]:
for i,j in enumerate(clf_A.predict_proba(test_set)[:,1]):
    if j>0.5:
        print(i,TextBlob(X_train_original[5]).raw_sentences[i],"\t{:.3f}".format(j))

1 this is one of the film we watched. 	0.650
2 this is one of the stupidest film i have ever seen, i think it might even be a walt disney picture film! 	0.732
5 the only good comment i can give is the choice of actor, back to the future's christopher lloyd as the martian, uncle martin, dumb and dumber's jeff daniel as tim o'hara, elizabeth hurley as brace channing and daryl hannah as lizzie. 	0.512
6 but apart from that it is complete crap. 	0.608
7 poor! 	0.502


In [17]:
# len(np.array(x).flatten())

In [18]:
x = np.where(clf_A.predict_proba(test_set)[:,1]>0.5)

test = test_set[x]

for i,j in enumerate(clf_1.predict_proba(test)):
    print(i,TextBlob(X_train_original[5]).raw_sentences[np.array(x).flatten()[i]],j)

0 this is one of the film we watched. [0.43243437 0.56756563]
1 this is one of the stupidest film i have ever seen, i think it might even be a walt disney picture film! [0.42348131 0.57651869]
2 the only good comment i can give is the choice of actor, back to the future's christopher lloyd as the martian, uncle martin, dumb and dumber's jeff daniel as tim o'hara, elizabeth hurley as brace channing and daryl hannah as lizzie. [0.41162858 0.58837142]
3 but apart from that it is complete crap. [0.51941371 0.48058629]
4 poor! [0.63365011 0.36634989]


In [19]:
# Build an accuracy function excluding those -1


y_pred = []
threshold=None
highest_confidence_related=True

for corpus in X_train_original[0:1]:
    '''
    Breakdown the corpus into sentence and transform into bag-of-words
    '''
    sentence_set = tf_vectorizer.transform(TextBlob(corpus).raw_sentences)
    
    '''
    Related classifier given threshold. 
    threshold, if None, it only predict the label. If float number given, assign the threshold to the sentence with
    probability over threshold
    '''
    if threshold==None:
        y_ind_proba = clf_A.predict_proba(sentence_set)
        y_ind = clf_A.predict(sentence_set)
    else:
        y_ind_proba = clf_A.predict_proba(sentence_set)
        y_ind = y_ind_proba[:,1]>threshold
        
    '''
    +/- classifier given the sentences from the previous classifier
    if 0, assign the label as -1 (which means it does not have any related sentence)
    else, classify the sentence into +/- label according to the given sentence. 
    '''
    if np.sum(y_ind) == 0:
        y_pred.append(-1)
        continue
    else:
        if highest_confidence_related:
            indices = np.array(np.argmax(y_ind_proba[:,1])).flatten()
            y = clf_1.predict(sentence_set[indices, :])
            y_pred.append(y)
        else:
            indices = np.array(np.where(y_ind[:,1] == 1)).flatten()
            y_np_proba = clf_1.predict_proba(sentence_set[indices, :])
            y_neg_proba = np.max(y_np_proba[:,0])
            y_pos_proba = np.max(y_np_proba[:,1])
            mn, mp = np.max(y_np_proba, axis=1)

            if y_pos_proba > y_neg_proba:
                y_pred.append(1)
            else:
                y_pred.append(0)

     
    
y_pred = np.array(y_pred)

In [20]:
mn

NameError: name 'mn' is not defined

In [None]:
y_pred

In [None]:
y_train_original[1:5]