In [1]:
import numpy as np

from dataset_load import *

X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [2]:
####################################################################

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

######################################################################

In [3]:
len(cv.get_feature_names())

3686

### LR1

In [4]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression(penalty='l1', random_state=42)
lr1.fit(X_train, y_train_original)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## LR2

In [5]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [6]:
y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [7]:
htcv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True, vocabulary=word_list)

In [8]:
ht_train = htcv.transform(X_train_original)
ht_test = htcv.transform(X_test_original)

In [9]:
y_train_agreement_pos = np.copy(y_train_agreement)
y_train_agreement_neg = np.copy(y_train_agreement)
y_test_agreement_pos = np.copy(y_test_agreement)
y_test_agreement_neg = np.copy(y_test_agreement)

y_train_agreement_pos[y_train_agreement_pos == -1] = 0
y_train_agreement_neg[y_train_agreement_neg == 1] = 0
y_test_agreement_pos[y_test_agreement_pos == -1] = 0
y_test_agreement_neg[y_test_agreement_neg == 1] = 0

#### From actual label

In [10]:
lr2_train = list()
for i,y in enumerate(y_train_original):
    if y == 1:
        lr2_train.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train.append(y_train_agreement_neg[i] * y)
        
lr2_train = np.asarray(lr2_train)

In [11]:
lr2 = LogisticRegression(penalty='l1', random_state=42)
lr2.fit(lr2_train, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Test with predict {0,1}

lr1_predict = lr1.predict(X_test)

lr2_test_bin = list()
for i,y in enumerate(lr1_predict):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)

print(lr2.score(lr2_test_bin, y_test_original))

# Test with predict [0,1]

lr1_predict = lr1.predict_proba(X_test)[:,1]

lr2_test_proba = list()
for i,y in enumerate(lr1_predict):
    if y>=0.5:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

print(lr2.score(lr2_test_proba, y_test_original))

0.84176
0.84176


In [13]:
weight = lr2.coef_.flatten()
weight

array([0.        , 0.        , 0.        , 0.        , 3.60807977,
       2.00065885, 5.57719273, 5.42575667, 4.4799257 , 5.48751428,
       6.49268325, 0.        , 0.        , 0.        , 0.        ,
       0.        , 7.14574196, 0.        , 8.05024534, 0.        ,
       0.        , 4.29807921, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 5.78208606, 6.57940866,
       7.16514804, 0.        , 5.52189856, 5.79841191, 6.56802781,
       0.        , 8.11054374, 0.        , 0.        , 5.73070348,
       8.53974843, 0.        , 5.48924186, 0.        , 0.        ,
       0.        , 0.        , 0.        , 6.49440925, 0.        ,
       0.        , 0.        , 5.5493446 , 0.        , 0.        ,
       6.75690602, 0.        , 0.        , 0.        , 0.        ,
       0.        , 5.7077356 , 5.21556887, 0.        , 4.56289999,
       0.        , 0.        , 5.29323725, 0.        , 5.63209228,
       5.92664306, 5.25290756, 0.        , 0.        , 0.     

In [14]:
def print_weight(clf1, clf2=None):
    w = clf1.coef_.flatten()
    indices = np.argsort(w)[::-1]
    con = list(connotation.values())

    if clf2 is not None:
        w2 = clf2.coef_.flatten()
        for i in indices:
            print('%s \t %.3f \t %.3f \t %d' %(word_list[i], w[i], w2[i], con[i]))
    else:  
        for i in indices:
            print('%s \t %.3f \t %d' %(word_list[i], w[i],con[i]))

#### Train with predicted label

In [15]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

In [16]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [17]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train_original)
lr2_y_hat_proba.fit(lr2_train_proba, y_train_original)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_test_pred_bin = lr2_y_hat_bin.predict(lr2_test_bin)
y_test_pred_proba = lr2_y_hat_proba.predict(lr2_test_proba)

In [19]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test_original))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test_original))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train_original))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train_original))

0.84172
0.84176

0.89284
0.89316


In [20]:
print(lr2_y_hat_bin.intercept_)
print(lr2_y_hat_proba.intercept_)

[-1.51896895]
[-1.60487157]


In [21]:
# print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

In [22]:
# print_weight(lr2_y_hat_proba)

### What if... we put scale and ReLU manually

If we do not scale. Then, the negative weights would be closer to zero. The magnitude would be lesser.

In [23]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

# Scale

predicted_y = predicted_y * 2 - 1
proba_y = proba_y * 2 - 1

In [24]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [25]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train_original)
lr2_y_hat_proba.fit(lr2_train_proba, y_train_original)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Test with predict {0,1}

predicted_y_test = lr1.predict(X_test)
predicted_y_test = predicted_y_test * 2 - 1

lr2_test_bin = list()
for i,y in enumerate(predicted_y_test):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)


# Test with predict [0,1]

predicted_y_test_proba = lr1.predict_proba(X_test)[:,1]
predicted_y_test_proba = predicted_y_test_proba * 2 - 1

lr2_test_proba = list()
for i,y in enumerate(predicted_y_test_proba):
    if y>=0:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

In [27]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test_original))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test_original))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train_original))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train_original))

0.856
0.85648

0.90536
0.90496


In [28]:
accept_indices = np.where(np.sum(lr2_test_proba,axis=1)!=0)[0]

In [29]:
print(lr2_y_hat_bin.score(lr2_test_bin[accept_indices], y_test_original[accept_indices]))
print(lr2_y_hat_proba.score(lr2_test_proba[accept_indices], y_test_original[accept_indices]))

0.8965626989178866
0.8971083022642539


In [30]:
from sklearn.metrics import log_loss

p = lr2_y_hat_proba.predict_proba(lr2_test_proba[accept_indices])[:,1]

log_loss(y_test_original[accept_indices], p)

0.3502183449411724

In [31]:
np.where(np.sum(lr2_test_bin,axis=1)==0)[0]

array([    8,    12,    36, ..., 24992, 24997, 24998])

In [32]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)/lr2_test_proba.shape[0]

0.12024

In [33]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)

3006

In [34]:
lr2_test_proba[8]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
lr2_test_bin[8]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
predicted_y_test_proba[8]

0.31825049869445077

In [37]:
predicted_y_test[8]

1

In [38]:
print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

8/10 	 2.783 	 2.919 	 1
7/10 	 2.606 	 2.803 	 1
noir 	 2.014 	 3.068 	 1
superb 	 2.003 	 3.072 	 1
brilliant 	 1.895 	 2.570 	 1
10/10 	 1.754 	 1.916 	 1
9/10 	 1.714 	 1.455 	 1
refreshing 	 1.638 	 1.461 	 1
incredible 	 1.631 	 2.308 	 1
surprisingly 	 1.421 	 1.940 	 1
amazing 	 1.416 	 2.183 	 1
excellent 	 1.409 	 1.957 	 1
great 	 1.380 	 2.360 	 1
fascinating 	 1.369 	 2.367 	 1
beautiful 	 1.365 	 2.408 	 1
gem 	 1.360 	 2.425 	 1
enjoyed 	 1.346 	 1.694 	 1
wonderful 	 1.326 	 2.334 	 1
perfect 	 1.324 	 2.369 	 1
fantastic 	 1.297 	 1.652 	 1
best 	 1.293 	 2.355 	 1
favorite 	 1.290 	 1.963 	 1
rare 	 1.251 	 2.202 	 1
subtle 	 1.188 	 2.220 	 1
solid 	 1.139 	 1.981 	 1
fun 	 1.094 	 2.267 	 1
recommended 	 1.069 	 2.035 	 1
loved 	 1.023 	 2.030 	 1
enjoyable 	 0.957 	 1.987 	 1
beautifully 	 0.606 	 -0.163 	 1
wasted 	 0.508 	 0.650 	 0
funniest 	 0.279 	 0.070 	 1
5/10 	 0.101 	 0.432 	 1
wonderfully 	 0.028 	 0.000 	 1
badly 	 0.000 	 0.410 	 0
6/10 	 0.000 	 0.000

In [39]:
# becasue LR sklearn use a regularization

In [40]:
from IPython import display

from coloredweighteddoc import ColoredWeightedDoc

In [41]:
# review = list(['I was in Chicago last week. And I wanted to see a movie so bad. I am surprised that the movie itself is just amazing.  The plot was kinda weak, but it was great.  Christopher Nolan is just brilliant. Never fails to amaze. Even though the weather was terrible, but I enjoyed it. No regret! Overall, 9/10 . Recommended.'])

review = list(['i went to the movie. the movie is terrible. acting was great, but the plot was awful. the weather is excellent. 1/10 . avoid it.'])

In [42]:
lr1 = LogisticRegression(C=0.01, random_state=42, penalty='l1')
lr1.fit(X_train, y_train_original)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
review_cv = cv.transform(review)
words = cv.get_feature_names()
weights = lr1.coef_.flatten()

In [44]:
display.display(ColoredWeightedDoc(review[0], words, weights, binary = False))

In [45]:
lr1.predict(review_cv)

array([0], dtype=int32)

In [46]:
word_list

['1/10',
 '2/10',
 '3/10',
 '4/10',
 '5/10',
 '6/10',
 '7/10',
 '8/10',
 '9/10',
 '10/10',
 'amazing',
 'annoying',
 'avoid',
 'awful',
 'bad',
 'badly',
 'beautiful',
 'beautifully',
 'best',
 'bland',
 'boring',
 'brilliant',
 'cheap',
 'disappointed',
 'disappointing',
 'disappointment',
 'dreadful',
 'dull',
 'enjoyable',
 'enjoyed',
 'excellent',
 'fails',
 'fantastic',
 'fascinating',
 'favorite',
 'forgettable',
 'fun',
 'funny',
 'funniest',
 'gem',
 'great',
 'horrible',
 'incredible',
 'insult',
 'lacks',
 'lame',
 'laughable',
 'lousy',
 'loved',
 'mediocre',
 'mess',
 'mst3k',
 'noir',
 'obnoxious',
 'pathetic',
 'perfect',
 'perfectly',
 'pointless',
 'poor',
 'poorly',
 'predictable',
 'rare',
 'recommended',
 'redeeming',
 'refreshing',
 'ridiculous',
 'sadly',
 'solid',
 'stupid',
 'subtle',
 'superb',
 'surprisingly',
 'tedious',
 'terrible',
 'unfortunately',
 'unfunny',
 'waste',
 'wasted',
 'weak',
 'wonderful',
 'wonderfully',
 'worse',
 'worst']

In [47]:
lr1.predict_proba(review_cv)[:,1]

array([0.4115806])

In [48]:
indices = np.argsort(weights)

for i in indices[:30]:
    print(words[i], weights[i])

worst -1.2842515039051958
waste -1.0065469401437892
awful -0.8980442415704627
bad -0.7767717048859519
boring -0.6373151896568955
poor -0.5645964485179006
terrible -0.4936919105797549
nothing -0.4303432480178849
stupid -0.42005432604835014
worse -0.3849472284771958
supposed -0.3735232682664809
no -0.33481738233288877
dull -0.3115335092612119
script -0.30772850884910136
poorly -0.29333117177627327
minute -0.29130517908433423
unfortunately -0.2692659567405329
horrible -0.26285445771150406
plot -0.2602081716911528
annoying -0.23256980612930267
instead -0.22594661884291586
even -0.1893470524210007
ridiculous -0.18020222218454074
could -0.16059086072052153
any -0.15933860387651116
money -0.15827458565855743
oh -0.1543228125124743
why -0.14837421463176279
least -0.1476850969370449
just -0.14082331136840898


In [56]:
ht_lr = LogisticRegression(penalty='l1', random_state=42)

y_train_agreement_LR = np.copy(y_train_agreement)
y_train_agreement_LR[y_train_agreement_LR != 0] = 1
y_test_agreement_LR = np.copy(y_test_agreement)
y_test_agreement_LR[y_test_agreement_LR != 0] = 1

# cv_ht = CountVectorizer(binary=True, vocabulary=word_list)
# X = cv_ht.fit_transform(list([X_test_original[2062]]))

In [57]:
ht_lr.fit(y_train_agreement_LR, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
# ht_ex = cv_ht.transform(review)

lr1 = LogisticRegression(penalty='l1', random_state=42)
lr1.fit(X_train, y_train_original)
w_original = lr1.coef_[0]

In [66]:
w = ht_lr.coef_.flatten()

our_reject_lr_correct_idx = 5324
our_correct_lr_correct_idx = 305 # OK
our_reject_lr_reject = 24611


display.display(ColoredWeightedDoc(X_test_original[our_reject_lr_correct_idx], word_list, w, binary = False))
display.display(ColoredWeightedDoc(X_test_original[our_correct_lr_correct_idx], word_list, w, binary = False))
display.display(ColoredWeightedDoc(X_test_original[our_reject_lr_reject], word_list, w, binary = False))

In [73]:
print(y_test_original[our_reject_lr_correct_idx], y_test_original[our_correct_lr_correct_idx], y_test_original[our_reject_lr_reject])

0 0 1


In [70]:
display.display(ColoredWeightedDoc(X_test_original[our_reject_lr_correct_idx], cv.get_feature_names(), w_original, binary = False))
display.display(ColoredWeightedDoc(X_test_original[our_correct_lr_correct_idx], cv.get_feature_names(), w_original, binary = False))
display.display(ColoredWeightedDoc(X_test_original[our_reject_lr_reject], cv.get_feature_names(), w_original, binary = False))

In [102]:
import glob
path = r'../../data/aclImdb/'


test_neg_files = glob.glob(path+"/test/neg/*.txt")
test_pos_files = glob.glob(path+"/test/pos/*.txt")

    
for tnf in test_neg_files:
    f = open(tnf, 'r', encoding="utf8")
    if 'the odd mixture of comedy and horror sometimes works and sometimes' in f.read().lower():
        print('our_reject_lr_correct_idx', tnf) 
    elif 'a bit too much mediterrenean' in f.read().lower():
        print('our_correct_lr_correct_idx', tnf)
    elif 'this movie is about' in f.read().lower():
        print('our_reject_lr_reject', tnf)
    f.close()
    
for tpf in test_pos_files:
    f = open(tpf, 'r', encoding="utf8")
    if 'a bit too much mediterrenean' in f.read().lower():
        print('our_correct_lr_correct_idx', tpf)
    elif 'this movie is about' in f.read():
        print('our_reject_lr_reject', tpf)
    f.close()
    
print("Test Data loaded.")

our_reject_lr_correct_idx ../../data/aclImdb//test/neg/441_4.txt
Test Data loaded.


In [92]:
for i,c in enumerate(X_test_original):
    if 'the odd mixture of comedy and horror sometimes works and sometimes' in c:
        print(i, c)

5324 the odd mixture of comedy and horror sometimes works and sometimes does not. had the main male character been a little more interesting, the film would have been as well. a trio of young american visit paris, run into a beautiful werewolf, and the problem confound from there.numerous logic hole make the possibly intriguing story difficult to take.


In [93]:
for i,c in enumerate(X_test_original):
    if 'it was aesthetically nice for a while but it could not sustain' in c:
        print(i, c)

305 a bit too much mediterrenean machismo for me. the cast was beautiful, lovely to watch in all of the romantic scene. the locale was beautiful with azure sky and azure water. it just was not convincing to me that such an egomaniac crud bent on nothing but his building, could attract so many beautiful, vulnerable, woman. only in the mediterranean i guess. certainly in no world i am familiar with. the macho man were really obnoxious, and i found it difficult to believe that the female character could have anything to to with them for so long. the screenplay, cinematography, directing, etc. was set up to deliver a class b film, the central effort being on showing scene of beautiful exposed female east. it was aesthetically nice for a while but it could not sustain a very mediocre film.


In [94]:
for i,c in enumerate(X_test_original):
    if 'among the passenger is convicted serial killer' in c:
        print(i, c)

24611 this movie is about the crew of a spaceship who crash land onto a strange ight planet with three sun. among the passenger is convicted serial killer, richard riddick (vin diesel), and his nemesis, johns (cole hauser). while the survivor of the wreck are getting their bearing, an eclipse of all three sun happens. shortly afterward, scary creature begin to appear and start to pick off the crew member one by one. during the eclipse, riddick comes into his own, as his eyeball have been surgically shined, giving him night vision. the crew have to rely on him to try to get them to safety.this is a very stylish film, with the colour of the sky changing scene by scene, giving it a very strange look. vin diesel is illiant as riddick, truly menacing but forced to help person who otherwise would only see him as a threat. there is also an underlying humour in his performance as well, which adds another dimension to the character. this is the sort of role that vin excel in, the anti-hero, who

In [80]:
12582 - 12500

82