In [1]:
import numpy as np

from dataset_load import *

path = r"..\..\data\reviews_Amazon_Instant_Video_5.json.gz"

X, y = extract_review_amazon(path, 'reviewText')
y_label = np.asarray(y)

neutral_indices = np.where(y_label == 3)[0]
y_label[y_label<3] = 0
y_label[y_label>3] = 1

X_discarded = np.delete(X,neutral_indices)
y_discarded = np.delete(y_label, neutral_indices)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit

# split
X_train_split, X_test_split, y_train, y_test = train_test_split(X_discarded, y_discarded, test_size=0.33, random_state=42)


# preprocessing
X_train_corpus_update = update_corpus_contraction(X_train_split)
X_test_corpus_update = update_corpus_contraction(X_test_split)

(75, 2)
corpus update start
corpus update end

(75, 2)
corpus update start
corpus update end



In [2]:
####################################################################

# Count vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_corpus_update)
X_test = cv.transform(X_test_corpus_update)

######################################################################

In [3]:
len(cv.get_feature_names())

1499

### LR1

In [4]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression(penalty='l1', random_state=42)
lr1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## LR2

In [5]:
word_list, connotation = load_unigrams('./amazon-video-unigrams-more.txt', X_train_corpus_update, y_train)

In [6]:
y_train_agreement, y_test_agreement = generate_appearance(X_train_corpus_update, X_test_corpus_update, 
                                                          word_list, connotation)

In [7]:
htcv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True, vocabulary=word_list)

In [8]:
ht_train = htcv.transform(X_train_corpus_update)
ht_test = htcv.transform(X_test_corpus_update)

In [9]:
y_train_agreement_pos = np.copy(y_train_agreement)
y_train_agreement_neg = np.copy(y_train_agreement)
y_test_agreement_pos = np.copy(y_test_agreement)
y_test_agreement_neg = np.copy(y_test_agreement)

y_train_agreement_pos[y_train_agreement_pos == -1] = 0
y_train_agreement_neg[y_train_agreement_neg == 1] = 0
y_test_agreement_pos[y_test_agreement_pos == -1] = 0
y_test_agreement_neg[y_test_agreement_neg == 1] = 0

#### From actual label

In [10]:
lr2_train = list()
for i,y in enumerate(y_train):
    if y == 1:
        lr2_train.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train.append(y_train_agreement_neg[i] * y)
        
lr2_train = np.asarray(lr2_train)

In [11]:
lr2 = LogisticRegression(penalty='l1', random_state=42)
lr2.fit(lr2_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Test with predict {0,1}

lr1_predict = lr1.predict(X_test)

lr2_test_bin = list()
for i,y in enumerate(lr1_predict):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)

print(lr2.score(lr2_test_bin, y_test))

# Test with predict [0,1]

lr1_predict = lr1.predict_proba(X_test)[:,1]

lr2_test_proba = list()
for i,y in enumerate(lr1_predict):
    if y>=0.5:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

print(lr2.score(lr2_test_proba, y_test))

0.892916283348666
0.892916283348666


In [13]:
weight = lr2.coef_.flatten()
weight

array([4.28918339, 2.67608546, 2.06030386, 1.23953709, 0.        ,
       1.44715692, 3.59021862, 5.44840051, 2.9410274 , 2.34183544,
       2.23180046, 0.        , 3.10105059, 2.60747664, 4.16913173,
       1.34150674, 2.67811426, 0.        , 4.97959419, 4.0351474 ,
       6.85408214, 5.85027575, 4.7721894 , 4.82172967, 3.34839224,
       2.23796446, 4.80542253, 3.82654216, 2.13342247, 2.68336358,
       6.6060343 , 3.82547233, 2.99316974, 4.06270428, 2.51605173,
       0.        , 0.        , 0.27412764, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       2.01044686, 1.94684598, 1.42204315, 0.        , 1.53956788,
       1.7602934 , 0.        , 2.81949389, 2.14899092, 3.86829695,
       0.        , 1.19486136, 1.86958905, 4.62976364, 1.88177154,
       1.6589575 , 2.07634325, 2.61391363, 1.57157637, 2.22516645,
       1.43331211, 2.39872507, 1.8250656 , 1.12657921, 2.21986553,
       2.00817979, 4.30834626, 2.609186  , 0.        , 2.58729

In [14]:
def print_weight(clf1, clf2=None):
    w = clf1.coef_.flatten()
    indices = np.argsort(w)[::-1]
    con = list(connotation.values())

    if clf2 is not None:
        w2 = clf2.coef_.flatten()
        for i in indices:
            print('%s \t %.3f \t %.3f \t %d' %(word_list[i], w[i], w2[i], con[i]))
    else:  
        for i in indices:
            print('%s \t %.3f \t %d' %(word_list[i], w[i],con[i]))

#### Train with predicted label

In [15]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

In [16]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [17]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train)
lr2_y_hat_proba.fit(lr2_train_proba, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
y_test_pred_bin = lr2_y_hat_bin.predict(lr2_test_bin)
y_test_pred_proba = lr2_y_hat_proba.predict(lr2_test_proba)

In [19]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train))

0.8932842686292548
0.8939282428702852

0.8893017354660383
0.8946939145407585


In [20]:
print(lr2_y_hat_bin.intercept_)
print(lr2_y_hat_proba.intercept_)

[1.01963155]
[1.05741623]


In [21]:
# print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

In [22]:
# print_weight(lr2_y_hat_proba)

### What if... we put scale and ReLU manually

If we do not scale. Then, the negative weights would be closer to zero. The magnitude would be lesser.

In [23]:
predicted_y = lr1.predict(X_train)
proba_y = lr1.predict_proba(X_train)[:,1]

# Scale

predicted_y = predicted_y * 2 - 1
proba_y = proba_y * 2 - 1

In [24]:
lr2_train_bin = list()
for i,y in enumerate(predicted_y):
    if y == 1:
        lr2_train_bin.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_bin.append(y_train_agreement_neg[i] * y)
        
lr2_train_bin = np.asarray(lr2_train_bin)

lr2_train_proba = list()
for i,y in enumerate(proba_y):
    if y >= 0.5:
        lr2_train_proba.append(y_train_agreement_pos[i] * y)
    else:
        lr2_train_proba.append(y_train_agreement_neg[i] * y)
        
lr2_train_proba = np.asarray(lr2_train_proba)

In [25]:
# Train with {0,1} label
# Train with [0,1] proba

lr2_y_hat_bin = LogisticRegression(penalty='l1', random_state=42)
lr2_y_hat_proba = LogisticRegression(penalty='l1', random_state=42)

lr2_y_hat_bin.fit(lr2_train_bin, y_train)
lr2_y_hat_proba.fit(lr2_train_proba, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Test with predict {0,1}

predicted_y_test = lr1.predict(X_test)
predicted_y_test = predicted_y_test * 2 - 1

lr2_test_bin = list()
for i,y in enumerate(predicted_y_test):
    if y==1:
        lr2_test_bin.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_bin.append(y_test_agreement_neg[i] * y)
    
lr2_test_bin = np.asarray(lr2_test_bin)


# Test with predict [0,1]

predicted_y_test_proba = lr1.predict_proba(X_test)[:,1]
predicted_y_test_proba = predicted_y_test_proba * 2 - 1

lr2_test_proba = list()
for i,y in enumerate(predicted_y_test_proba):
    if y>=0:
        lr2_test_proba.append(y_test_agreement_pos[i] * y)
    else:
        lr2_test_proba.append(y_test_agreement_neg[i] * y)
    
lr2_test_proba = np.asarray(lr2_test_proba)

In [27]:
print(lr2_y_hat_bin.score(lr2_test_bin,y_test))
print(lr2_y_hat_proba.score(lr2_test_proba,y_test))
print()
print(lr2_y_hat_bin.score(lr2_train_bin,y_train))
print(lr2_y_hat_proba.score(lr2_train_proba,y_train))

0.9164673413063478
0.9161913523459062

0.9205219991843763
0.9190720014499978


In [28]:
accept_indices = np.where(np.sum(lr2_test_proba,axis=1)!=0)[0]

In [29]:
print(lr2_y_hat_bin.score(lr2_test_bin[accept_indices], y_test[accept_indices]))
print(lr2_y_hat_proba.score(lr2_test_proba[accept_indices], y_test[accept_indices]))

0.9490515564202334
0.9486867704280155


In [39]:
from sklearn.metrics import log_loss

p = lr2_y_hat_proba.predict_proba(lr2_test_proba[accept_indices])[:,1]

log_loss(y_test[accept_indices], p)

0.19371377731153747

In [30]:
np.where(np.sum(lr2_test_bin,axis=1)==0)[0]

array([    3,    12,    13, ..., 10854, 10859, 10866], dtype=int64)

In [31]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)/lr2_test_proba.shape[0]

0.24342226310947562

In [32]:
np.sum(np.sum(lr2_test_proba,axis=1)==0)

2646

In [33]:
lr2_test_proba[8]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.99558703,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.99558703, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [34]:
lr2_test_bin[8]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
predicted_y_test_proba[8]

0.9955870320224969

In [36]:
predicted_y_test[8]

1.0

In [37]:
print_weight(lr2_y_hat_bin, lr2_y_hat_proba)

intense 	 2.702 	 2.497 	 1
hooked 	 2.561 	 2.826 	 1
enjoyed 	 1.778 	 2.446 	 1
love 	 1.711 	 2.500 	 1
pleased 	 1.643 	 1.573 	 1
favorite 	 1.570 	 2.051 	 1
unexpected 	 1.561 	 2.088 	 1
awesome 	 1.529 	 2.393 	 1
great 	 1.519 	 2.348 	 1
addicted 	 1.508 	 1.778 	 1
addictive 	 1.506 	 2.211 	 1
fantastic 	 1.471 	 1.759 	 1
excellent 	 1.441 	 2.192 	 1
fascinating 	 1.414 	 2.352 	 1
enjoying 	 1.404 	 2.124 	 1
superb 	 1.389 	 1.453 	 1
amazing 	 1.330 	 2.018 	 1
wonderfully 	 1.300 	 0.653 	 1
entertaining 	 1.293 	 2.064 	 1
outstanding 	 1.242 	 1.982 	 1
informative 	 1.227 	 2.310 	 1
hilarious 	 1.112 	 1.556 	 1
enjoyable 	 1.061 	 1.747 	 1
perfect 	 0.947 	 1.671 	 1
happy 	 0.864 	 1.202 	 1
fun 	 0.813 	 1.927 	 1
liked 	 0.740 	 1.497 	 1
nicely 	 0.668 	 1.807 	 1
surprise 	 0.639 	 1.238 	 1
wow 	 0.628 	 1.312 	 1
satisfying 	 0.598 	 0.958 	 1
suspenseful 	 0.594 	 1.058 	 1
avoid 	 0.586 	 0.191 	 1
rare 	 0.543 	 1.275 	 1
laughing 	 0.483 	 0.626 	 1

In [38]:
# becasue LR sklearn use a regularization