In [1]:
import numpy as np

from dataset_load import *

# Load the four preprocessed IMDB splits (train/test documents and labels), in the
# order listed. `open_pickle` is not defined in this notebook — presumably it comes
# from the `dataset_load` star import and unpickles the file, so only point it at
# trusted files.
(X_train_original,
 X_test_original,
 y_train_original,
 y_test_original) = [
    open_pickle(pickle_path)
    for pickle_path in (
        '../../data/imdb/imdb_original_preprocessed_xtrain.pickle',
        '../../data/imdb/imdb_original_preprocessed_xtest.pickle',
        '../../data/imdb/imdb_original_preprocessed_ytrain.pickle',
        '../../data/imdb/imdb_original_preprocessed_ytest.pickle',
    )
]

In [2]:
####################################################################

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

# Token pattern: runs of word characters, apostrophes and slashes delimited by word
# boundaries, so a rating such as "8/10" is kept as a single token.
token = r"(?u)\b[\w\'/]+\b"
# Binary (presence/absence) bag of words over lowercased text; min_df=100 drops
# terms that appear in fewer than 100 training documents.
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
# The vocabulary is learned from the training documents only and then reused,
# unchanged, to encode the test documents.
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

######################################################################

In [3]:
# Number of terms kept by the fitted vectorizer. `vocabulary_` is read directly
# because `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the fitted vocabulary mapping has one entry per feature on every version.
len(cv.vocabulary_)

3686

### LR1

In [4]:
from sklearn.linear_model import LogisticRegression

# LR1: L1-penalised logistic regression on the binary bag-of-words features.
# The solver is pinned to liblinear: scikit-learn >= 0.22 defaults to lbfgs, which
# rejects penalty='l1', while older releases already used liblinear by default,
# so the fitted model is unchanged there.
lr1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lr1.fit(X_train, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### LR2

In [5]:
# `load_unigrams` is not defined in this notebook (presumably provided by the
# `dataset_load` star import). Downstream, `word_list` is passed as the fixed
# vocabulary of a CountVectorizer and indexed by position, and `connotation` is
# read through `.values()`, i.e. it is treated as a mapping.
# NOTE(review): looks like `connotation` holds a 0/1 polarity per word — confirm.
word_list, connotation = load_unigrams('./imdb-unigrams.txt', X_train_original, y_train_original)

In [6]:
# `generate_appearance` is not defined in this notebook (presumably provided by the
# `dataset_load` star import). The following cells copy its two results with np.copy,
# zero out the entries equal to -1 or 1, and index them once per document.
# NOTE(review): presumably one row per document and one column per entry of
# `word_list` — confirm against the helper.
y_train_agreement, y_test_agreement = generate_appearance(X_train_original, X_test_original, 
                                                          word_list, connotation)

In [7]:
# Vectorizer restricted to the unigram list held in `word_list`. `min_df` is
# deliberately not passed: scikit-learn ignores it whenever a fixed `vocabulary` is
# supplied, so it only suggested a frequency cut that never took place.
htcv = CountVectorizer(token_pattern=token, lowercase=True, binary=True, vocabulary=word_list)

In [8]:
# Encode both splits with `htcv` (train first, then test). Only `transform` is
# called; `htcv` is never fitted in this notebook.
ht_train, ht_test = [
    htcv.transform(documents)
    for documents in (X_train_original, X_test_original)
]

In [9]:
def _zero_out(labels, value):
    """Return a copy of `labels` in which every entry equal to `value` is set to 0."""
    masked = np.copy(labels)
    masked[masked == value] = 0
    return masked


# The "pos" variants discard the -1 entries, the "neg" variants discard the +1 entries;
# the source arrays themselves are left untouched.
y_train_agreement_pos = _zero_out(y_train_agreement, -1)
y_train_agreement_neg = _zero_out(y_train_agreement, 1)
y_test_agreement_pos = _zero_out(y_test_agreement, -1)
y_test_agreement_neg = _zero_out(y_test_agreement, 1)

#### From actual label

In [10]:
# LR2 training features derived from the ground-truth labels: a document labelled 1
# takes its entry from the positive agreement array, any other document from the
# negative one, and the selected entry is multiplied by the label itself.
lr2_train = np.asarray([
    (y_train_agreement_pos[idx] if label == 1 else y_train_agreement_neg[idx]) * label
    for idx, label in enumerate(y_train_original)
])

In [11]:
# LR2: L1-penalised logistic regression on the label-derived agreement features.
# The solver is pinned to liblinear: scikit-learn >= 0.22 defaults to lbfgs, which
# rejects penalty='l1', while older releases already used liblinear by default.
lr2 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lr2.fit(lr2_train, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Test with predict {0,1}
# LR1's hard predictions decide which agreement array feeds LR2 at test time; the
# selected entry is multiplied by the predicted label.

lr1_predict = lr1.predict(X_test)

lr2_test_bin = np.asarray([
    (y_test_agreement_pos[idx] if predicted == 1 else y_test_agreement_neg[idx]) * predicted
    for idx, predicted in enumerate(lr1_predict)
])

lr2.score(lr2_test_bin, y_test_original)

0.8418

In [13]:
# Test with predict [0,1]
# Same construction as the {0,1} variant, but driven by the second column of LR1's
# predict_proba: values >= 0.5 select the positive agreement array, and the selected
# entry is scaled by the probability. Note that `lr1_predict` is rebound here from
# hard labels to probabilities.

lr1_predict = lr1.predict_proba(X_test)[:, 1]

lr2_test_proba = np.asarray([
    (y_test_agreement_pos[idx] if proba >= 0.5 else y_test_agreement_neg[idx]) * proba
    for idx, proba in enumerate(lr1_predict)
])

lr2.score(lr2_test_proba, y_test_original)

0.8418

In [14]:
# 1-D copy of LR2's coefficient matrix, echoed as the cell output.
weight = np.ravel(lr2.coef_).copy()
weight

array([0.        , 0.        , 0.        , 0.        , 3.60950486,
       2.00050248, 5.57702426, 5.42721375, 4.47975806, 5.48895803,
       6.49374532, 0.        , 0.        , 0.        , 0.        ,
       0.        , 7.12266294, 0.        , 7.99754779, 0.        ,
       0.        , 4.29804201, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 5.78204372, 6.57889363,
       7.16281132, 0.        , 5.52208984, 5.79987759, 6.5694834 ,
       0.        , 8.08820867, 0.        , 0.        , 5.73227543,
       8.40122709, 0.        , 5.4907169 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 6.49429474, 0.        ,
       0.        , 0.        , 5.54924019, 0.        , 0.        ,
       6.74646851, 0.        , 0.        , 0.        , 0.        ,
       0.        , 5.70928471, 5.21552235, 0.        , 4.56295182,
       0.        , 0.        , 5.29493477, 0.        , 5.63213107,
       5.92644426, 5.25289555, 0.        , 0.        , 0.     

In [31]:
def print_weight(clf1, clf2=None):
    """Print the coefficients of `clf1`, largest first, one tab-separated line per word.

    Each line shows the word, its weight in `clf1` (three decimals), its weight in
    `clf2` when a second model is given, and the connotation value found at the
    same position.

    Reads the module-level `word_list` and `connotation`. The values of
    `connotation` are matched to `word_list` purely by position, so both are
    assumed to share the same ordering — verify against `load_unigrams`.

    Parameters
    ----------
    clf1 : fitted estimator exposing `coef_`; its weights define the row order.
    clf2 : optional fitted estimator exposing `coef_`; shown as an extra column.
    """
    primary = clf1.coef_.flatten()
    ranking = np.argsort(primary)[::-1]
    polarity = list(connotation.values())
    secondary = None if clf2 is None else clf2.coef_.flatten()

    for idx in ranking:
        if secondary is None:
            print('%s \t %.3f \t %d' %(word_list[idx], primary[idx],polarity[idx]))
            continue
        print('%s \t %.3f \t %.3f \t %d' %(word_list[idx], primary[idx], secondary[idx], polarity[idx]))

#### Train with predicted label

In [17]:
# LR1's outputs on the training set: hard class predictions and the second column
# of predict_proba.
predicted_y, proba_y = (
    lr1.predict(X_train),
    lr1.predict_proba(X_train)[:, 1],
)

In [18]:
# LR2 training features driven by LR1's hard predictions: a prediction of 1 selects
# the positive agreement array, anything else the negative one, and the selected
# entry is multiplied by the prediction.
lr2_train_bin = np.asarray([
    (y_train_agreement_pos[idx] if predicted == 1 else y_train_agreement_neg[idx]) * predicted
    for idx, predicted in enumerate(predicted_y)
])

# Same construction driven by LR1's probabilities: values >= 0.5 select the positive
# agreement array, and the selected entry is scaled by the probability.
lr2_train_proba = np.asarray([
    (y_train_agreement_pos[idx] if proba >= 0.5 else y_train_agreement_neg[idx]) * proba
    for idx, proba in enumerate(proba_y)
])

In [19]:
# Train with {0,1} label
# Train with [0,1] proba
# Both models are fit against the true training labels; only their feature matrices
# differ. The solver is pinned to liblinear: scikit-learn >= 0.22 defaults to lbfgs,
# which rejects penalty='l1', while older releases already used liblinear by default.

lr2_y_hat_bin = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train_original)
lr2_y_hat_proba.fit(lr2_train_proba, y_train_original)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Hard test-set predictions of each LR2 variant on its matching test feature matrix.
y_test_pred_bin, y_test_pred_proba = (
    lr2_y_hat_bin.predict(lr2_test_bin),
    lr2_y_hat_proba.predict(lr2_test_proba),
)

In [35]:
# Scores of both LR2 variants: test split first, then a blank line, then the train split.
for clf, features in ((lr2_y_hat_bin, lr2_test_bin), (lr2_y_hat_proba, lr2_test_proba)):
    print(clf.score(features, y_test_original))
print()
for clf, features in ((lr2_y_hat_bin, lr2_train_bin), (lr2_y_hat_proba, lr2_train_proba)):
    print(clf.score(features, y_train_original))

0.84176
0.84176

0.89284
0.89316


In [34]:
# Fitted intercepts: the {0,1}-feature model first, then the probability-feature model.
for clf in (lr2_y_hat_bin, lr2_y_hat_proba):
    print(clf.intercept_)

[-1.51901016]
[-1.6046603]


In [33]:
# Side-by-side coefficients. Columns: word, weight in lr2_y_hat_bin, weight in
# lr2_y_hat_proba, connotation. Rows are ordered by the first model's weight, descending.
print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

8/10 	 4.023 	 4.283 	 1
7/10 	 3.851 	 4.087 	 1
noir 	 3.268 	 3.808 	 1
brilliant 	 3.145 	 3.782 	 1
superb 	 3.139 	 3.636 	 1
10/10 	 2.977 	 3.299 	 1
9/10 	 2.778 	 3.091 	 1
incredible 	 2.743 	 3.259 	 1
fascinating 	 2.638 	 3.316 	 1
great 	 2.589 	 3.280 	 1
refreshing 	 2.585 	 2.743 	 1
excellent 	 2.563 	 3.050 	 1
beautiful 	 2.548 	 3.241 	 1
amazing 	 2.519 	 3.088 	 1
enjoyed 	 2.484 	 3.005 	 1
wonderful 	 2.457 	 3.034 	 1
best 	 2.452 	 3.126 	 1
gem 	 2.418 	 3.128 	 1
favorite 	 2.369 	 2.929 	 1
surprisingly 	 2.351 	 2.946 	 1
subtle 	 2.319 	 2.946 	 1
perfect 	 2.299 	 2.933 	 1
rare 	 2.280 	 2.898 	 1
fun 	 2.273 	 3.036 	 1
fantastic 	 2.210 	 2.649 	 1
solid 	 2.123 	 2.726 	 1
loved 	 2.018 	 2.643 	 1
recommended 	 1.994 	 2.562 	 1
enjoyable 	 1.919 	 2.602 	 1
5/10 	 0.924 	 1.218 	 1
beautifully 	 0.511 	 0.070 	 1
funniest 	 0.365 	 0.223 	 1
fails 	 0.000 	 0.000 	 0
avoid 	 0.000 	 0.000 	 0
2/10 	 0.000 	 0.000 	 0
3/10 	 0.000 	 0.000 	 0
4/10

In [32]:
# Coefficients of the probability-feature model alone. Columns: word, weight,
# connotation. Rows are ordered by weight, descending.
print_weight(lr2_y_hat_proba)

8/10 	 4.283 	 1
7/10 	 4.087 	 1
noir 	 3.808 	 1
brilliant 	 3.782 	 1
superb 	 3.636 	 1
fascinating 	 3.316 	 1
10/10 	 3.299 	 1
great 	 3.280 	 1
incredible 	 3.259 	 1
beautiful 	 3.241 	 1
gem 	 3.128 	 1
best 	 3.126 	 1
9/10 	 3.091 	 1
amazing 	 3.088 	 1
excellent 	 3.050 	 1
fun 	 3.036 	 1
wonderful 	 3.034 	 1
enjoyed 	 3.005 	 1
subtle 	 2.946 	 1
surprisingly 	 2.946 	 1
perfect 	 2.933 	 1
favorite 	 2.929 	 1
rare 	 2.898 	 1
refreshing 	 2.743 	 1
solid 	 2.726 	 1
fantastic 	 2.649 	 1
loved 	 2.643 	 1
enjoyable 	 2.602 	 1
recommended 	 2.562 	 1
5/10 	 1.218 	 1
dreadful 	 0.576 	 0
funniest 	 0.223 	 1
beautifully 	 0.070 	 1
dull 	 0.000 	 0
disappointment 	 0.000 	 0
disappointing 	 0.000 	 0
disappointed 	 0.000 	 0
fails 	 0.000 	 0
cheap 	 0.000 	 0
badly 	 0.000 	 0
awful 	 0.000 	 0
forgettable 	 0.000 	 0
avoid 	 0.000 	 0
annoying 	 0.000 	 0
bland 	 0.000 	 0
1/10 	 0.000 	 0
4/10 	 0.000 	 0
worse 	 0.000 	 0
wonderfully 	 0.000 	 1
2/10 	 0.000 	 0


### What if we apply the scaling and ReLU manually?