In [1]:
import numpy as np

from dataset_load import *

X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [2]:
####################################################################

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

######################################################################

In [3]:
len(cv.get_feature_names())

3686

### LR1

In [4]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression(penalty='l1', random_state=42)
lr1.fit(X_train, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## LR2

In [5]:
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [6]:
y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [7]:
htcv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True, vocabulary=word_list)

In [8]:
ht_train = htcv.transform(X_train_original)
ht_test = htcv.transform(X_test_original)

In [9]:
y_train_agreement_pos = np.copy(y_train_agreement)
y_train_agreement_neg = np.copy(y_train_agreement)
y_test_agreement_pos = np.copy(y_test_agreement)
y_test_agreement_neg = np.copy(y_test_agreement)

y_train_agreement_pos[y_train_agreement_pos == -1] = 0
y_train_agreement_neg[y_train_agreement_neg == 1] = 0
y_test_agreement_pos[y_test_agreement_pos == -1] = 0
y_test_agreement_neg[y_test_agreement_neg == 1] = 0

#### From actual label

In [10]:
lr2_train = list()
for i,y in enumerate(y_train_original):
    if y == 1:
        lr2_train.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train.append(y_train_agreement_neg[i] * y)
        
lr2_train = np.asarray(lr2_train)

In [11]:
lr2 = LogisticRegression(penalty='l1', random_state=42)
lr2.fit(lr2_train, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Test with predict {0,1}

lr1_predict = lr1.predict(X_test)

lr2_test_bin = list()
for i,y in enumerate(lr1_predict):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)

print(lr2.score(lr2_test_bin, y_test_original))

# Test with predict [0,1]

lr1_predict = lr1.predict_proba(X_test)[:,1]

lr2_test_proba = list()
for i,y in enumerate(lr1_predict):
    if y>=0.5:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

print(lr2.score(lr2_test_proba, y_test_original))

0.8418
0.8418


In [13]:
weight = lr2.coef_.flatten()
weight

array([0.        , 0.        , 0.        , 0.        , 3.60950486,
       2.00050248, 5.57702426, 5.42721375, 4.47975806, 5.48895803,
       6.49374532, 0.        , 0.        , 0.        , 0.        ,
       0.        , 7.12266294, 0.        , 7.99754779, 0.        ,
       0.        , 4.29804201, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 5.78204372, 6.57889363,
       7.16281132, 0.        , 5.52208984, 5.79987759, 6.5694834 ,
       0.        , 8.08820867, 0.        , 0.        , 5.73227543,
       8.40122709, 0.        , 5.4907169 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 6.49429474, 0.        ,
       0.        , 0.        , 5.54924019, 0.        , 0.        ,
       6.74646851, 0.        , 0.        , 0.        , 0.        ,
       0.        , 5.70928471, 5.21552235, 0.        , 4.56295182,
       0.        , 0.        , 5.29493477, 0.        , 5.63213107,
       5.92644426, 5.25289555, 0.        , 0.        , 0.     

In [14]:
def print_weight(clf1, clf2=None):
    w = clf1.coef_.flatten()
    indices = np.argsort(w)[::-1]
    con = list(connotation.values())

    if clf2 is not None:
        w2 = clf2.coef_.flatten()
        for i in indices:
            print('%s \t %.3f \t %.3f \t %d' %(word_list[i], w[i], w2[i], con[i]))
    else:  
        for i in indices:
            print('%s \t %.3f \t %d' %(word_list[i], w[i],con[i]))

#### Train with predicted label

In [15]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

In [16]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [17]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train_original)
lr2_y_hat_proba.fit(lr2_train_proba, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
y_test_pred_bin = lr2_y_hat_bin.predict(lr2_test_bin)
y_test_pred_proba = lr2_y_hat_proba.predict(lr2_test_proba)

In [19]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test_original))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test_original))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train_original))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train_original))

0.84176
0.84176

0.89284
0.89316


In [20]:
print(lr2_y_hat_bin.intercept_)
print(lr2_y_hat_proba.intercept_)

[-1.51901016]
[-1.6046603]


In [21]:
# print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

In [22]:
# print_weight(lr2_y_hat_proba)

### What if... we put scale and ReLU manually

If we do not scale. Then, the negative weights would be closer to zero. The magnitude would be lesser.

In [23]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

# Scale

predicted_y = predicted_y * 2 - 1
proba_y = proba_y * 2 - 1

In [24]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [25]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train_original)
lr2_y_hat_proba.fit(lr2_train_proba, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Test with predict {0,1}

predicted_y_test = lr1.predict(X_test)
predicted_y_test = predicted_y_test * 2 - 1

lr2_test_bin = list()
for i,y in enumerate(predicted_y_test):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)


# Test with predict [0,1]

predicted_y_test_proba = lr1.predict_proba(X_test)[:,1]
predicted_y_test_proba = predicted_y_test_proba * 2 - 1

lr2_test_proba = list()
for i,y in enumerate(predicted_y_test_proba):
    if y>=0:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

In [27]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test_original))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test_original))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train_original))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train_original))

0.85604
0.85648

0.9054
0.905


In [28]:
accept_indices = np.where(np.sum(lr2_test_proba,axis=1)!=0)[0]

In [29]:
print(lr2_y_hat_bin.score(lr2_test_bin[accept_indices], y_test_original[accept_indices]))
print(lr2_y_hat_proba.score(lr2_test_proba[accept_indices], y_test_original[accept_indices]))

0.8966081658634173
0.8971083022642539


In [30]:
from sklearn.metrics import log_loss

p = lr2_y_hat_proba.predict_proba(lr2_test_proba[accept_indices])[:,1]

log_loss(y_test_original[accept_indices], p)

0.3502273851498478

In [31]:
np.where(np.sum(lr2_test_bin,axis=1)==0)[0]

array([    8,    12,    36, ..., 24992, 24997, 24998], dtype=int64)

In [32]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)/lr2_test_proba.shape[0]

0.12024

In [33]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)

3006

In [34]:
lr2_test_proba[8]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
lr2_test_bin[8]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
predicted_y_test_proba[8]

0.31798309868414254

In [37]:
predicted_y_test[8]

1

In [38]:
print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

8/10 	 2.783 	 2.919 	 1
7/10 	 2.608 	 2.802 	 1
noir 	 2.013 	 3.067 	 1
superb 	 2.003 	 3.073 	 1
brilliant 	 1.895 	 2.570 	 1
10/10 	 1.754 	 1.916 	 1
9/10 	 1.716 	 1.455 	 1
refreshing 	 1.640 	 1.462 	 1
incredible 	 1.631 	 2.308 	 1
amazing 	 1.419 	 2.182 	 1
excellent 	 1.410 	 1.958 	 1
great 	 1.381 	 2.359 	 1
fascinating 	 1.369 	 2.367 	 1
beautiful 	 1.366 	 2.417 	 1
enjoyed 	 1.346 	 1.694 	 1
wonderful 	 1.326 	 2.334 	 1
perfect 	 1.325 	 2.366 	 1
surprisingly 	 1.309 	 1.940 	 1
fantastic 	 1.296 	 1.652 	 1
gem 	 1.294 	 2.425 	 1
best 	 1.294 	 2.351 	 1
favorite 	 1.290 	 1.963 	 1
rare 	 1.252 	 2.202 	 1
subtle 	 1.189 	 2.220 	 1
solid 	 1.141 	 1.982 	 1
fun 	 1.095 	 2.272 	 1
recommended 	 1.070 	 2.034 	 1
loved 	 1.023 	 2.030 	 1
enjoyable 	 0.958 	 1.987 	 1
beautifully 	 0.606 	 -0.172 	 1
wasted 	 0.509 	 0.766 	 0
funniest 	 0.278 	 0.068 	 1
5/10 	 0.102 	 0.432 	 1
wonderfully 	 0.029 	 0.000 	 1
badly 	 0.000 	 0.409 	 0
6/10 	 0.000 	 0.000

In [39]:
# becasue LR sklearn use a regularization

In [40]:
from IPython import display

from coloredweighteddoc import ColoredWeightedDoc

In [41]:
# review = list(['I was in Chicago last week. And I wanted to see a movie so bad. I am surprised that the movie itself is just amazing.  The plot was kinda weak, but it was great.  Christopher Nolan is just brilliant. Never fails to amaze. Even though the weather was terrible, but I enjoyed it. No regret! Overall, 9/10 . Recommended.'])

review = list(['I went to the movie. The movie is terrible. Acting was great, but the plot was awful. The weather is excellent. 1/10 . Avoid it.'])

In [42]:
lr1 = LogisticRegression(C=1, random_state=42, penalty='l1')
lr1.fit(X_train, y_train_original)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
review_cv = cv.transform(review)
words = cv.get_feature_names()
weights = lr1.coef_.flatten()

In [44]:
display.display(ColoredWeightedDoc(review[0], words, weights, binary = False))

In [45]:
lr1.predict(review_cv)

array([0])

In [46]:
word_list

['1/10',
 '2/10',
 '3/10',
 '4/10',
 '5/10',
 '6/10',
 '7/10',
 '8/10',
 '9/10',
 '10/10',
 'amazing',
 'annoying',
 'avoid',
 'awful',
 'bad',
 'badly',
 'beautiful',
 'beautifully',
 'best',
 'bland',
 'boring',
 'brilliant',
 'cheap',
 'disappointed',
 'disappointing',
 'disappointment',
 'dreadful',
 'dull',
 'enjoyable',
 'enjoyed',
 'excellent',
 'fails',
 'fantastic',
 'fascinating',
 'favorite',
 'forgettable',
 'fun',
 'funny',
 'funniest',
 'gem',
 'great',
 'horrible',
 'incredible',
 'insult',
 'lacks',
 'lame',
 'laughable',
 'lousy',
 'loved',
 'mediocre',
 'mess',
 'mst3k',
 'noir',
 'obnoxious',
 'pathetic',
 'perfect',
 'perfectly',
 'pointless',
 'poor',
 'poorly',
 'predictable',
 'rare',
 'recommended',
 'redeeming',
 'refreshing',
 'ridiculous',
 'sadly',
 'solid',
 'stupid',
 'subtle',
 'superb',
 'surprisingly',
 'tedious',
 'terrible',
 'unfortunately',
 'unfunny',
 'waste',
 'wasted',
 'weak',
 'wonderful',
 'wonderfully',
 'worse',
 'worst']

In [47]:
lr1.predict_proba(review_cv)[:,1]

array([0.00044398])

In [48]:
indices = np.argsort(weights)

for i in indices[:30]:
    print(words[i], weights[i])

3/10 -4.980492132096975
1/10 -4.937107261864974
4/10 -4.824637583352251
2/10 -4.6176845612231165
unwatchable -3.271299911592406
incoherent -2.250700821801265
unfunny -2.2279709510711276
stinker -2.1861393041620403
waste -2.129423651160658
disappointment -2.0909189158850356
poorly -2.0801273102448348
behave -2.0368013490575243
worst -1.9320372991334245
miscast -1.850236734715928
uninspired -1.8247514115870782
appalling -1.7576297833333383
pointless -1.7575713622081954
lousy -1.735310024667366
cardboard -1.7300221393521815
boredom -1.7241510728677027
forgettable -1.7140029410285202
mildly -1.6789299445531114
obnoxious -1.678676401827334
awful -1.6780891624228707
mst3k -1.6649308372763307
fails -1.6167210166756092
fest -1.615303586857381
wooden -1.603304904784953
furthermore -1.5881262252686443
laughable -1.5728643342134705


In [49]:
ht_lr = LogisticRegression(penalty='l1', random_state=42)

cv_ht = CountVectorizer(binary=True, vocabulary=word_list)
X = cv_ht.fit_transform(X_train_original)

In [50]:
ht_lr.fit(X, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [51]:
ht_ex = cv_ht.transform(review)

In [52]:
ht_lr.predict(ht_ex)

array([0])

In [53]:
w = ht_lr.coef_.flatten()

In [54]:
display.display(ColoredWeightedDoc(review[0], word_list, w, binary = False))