In [43]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from prettytable import PrettyTable
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [23]:
def binarize(groups_train, groups_test):
    y_train = np.array(groups_train.target)
    y_test = np.array(groups_test.target)
    
    tf_vectorizer = CountVectorizer(max_df=1.0, min_df=5, binary=True)
    
    print('Data Vectorizer Transform start')
    print()
    X_train = tf_vectorizer.fit_transform(groups_train.data)

    print('Train Data Transformed')
    print('Train Data size ', X_train.shape)
    print()
    X_test = tf_vectorizer.transform(groups_test.data)
    print('Test Data Transformed')
    print('Test Data size ', X_test.shape)
    
    f_names = tf_vectorizer.get_feature_names()
    
    return X_train, y_train, X_test, y_test, f_names

In [133]:
def print_features(clf_L1, clf_L2, f_names, msg, iter_range=10):
    idx_L2 = np.argsort(np.absolute(clf_L2.coef_)[0,:])[::-1]
    idx_L1 = np.argsort(np.absolute(clf_L1.coef_)[0,:])[::-1]
    
    ### Print on Pretty Table
    table_features = PrettyTable(['Rank', 'L2 Features', 'L2 Weight', 'L1 Features', 'L1 Weight'])
    f_list_1 = np.zeros(iter_range, dtype='int16')
    f_list_2 = np.ones(iter_range, dtype='int16')
    
    for idx in range(0,iter_range):
        table_features.add_row([idx+1, 
                                f_names[idx_L2[idx]], 
                                np.around(clf_L2.coef_[0,idx_L2[idx]], decimals=4), 
                                f_names[idx_L1[idx]], 
                                np.around(clf_L1.coef_[0,idx_L1[idx]], decimals=4)])
        f_list_2[idx] = idx_L2[idx]
        f_list_1[idx] = idx_L1[idx]
        
    print('L2 and L1-regularized Logistic Regression Classifier', msg)
    print('Top 10 features and weights (with absolute value)')
    print()
    print(table_features)
    print(' ')
    print('List of features in both L1 and L2 penalty :')
    num=1
    for i in range(0,iter_range):
        for j in range(0,iter_range):
            if f_list_1[i] == f_list_2[j]:
                print('\t', num, f_names[f_list_1[i]])
                num += 1

### Load the Dataset

In [7]:
news_categories = ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware']
newsgroups_train = fetch_20newsgroups(subset='train', categories=news_categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=news_categories)


### Binarize the Dataset

In [24]:
X_train, y_train, X_test, y_test, f_names = binarize(newsgroups_train, newsgroups_test)

Data Vectorizer Transform start

Train Data Transformed
Train Data size  (1181, 4933)

Test Data Transformed
Test Data size  (786, 4933)


### Fit the Model to the Data without Z-score scaling

In [27]:
clf_noz_l2 = LogisticRegression()
clf_noz_l2.fit(X_train, y_train)
print(clf_noz_l2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [29]:
clf_noz_l1 = LogisticRegression(penalty='l1')
clf_noz_l1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [136]:
print_features(clf_noz_l1, clf_noz_l2, f_names, 'without Z-score scaling', iter_range=20)

L2 and L1-regularized Logistic Regression Classifier without Z-score scaling
Top 10 features and weights (with absolute value)

+------+-------------+-----------+-------------+-----------+
| Rank | L2 Features | L2 Weight | L1 Features | L1 Weight |
+------+-------------+-----------+-------------+-----------+
|  1   |   windows   |  -2.8115  |   windows   |  -3.4064  |
|  2   |     file    |  -1.0976  |    latest   |   -2.567  |
|  3   |    files    |  -0.9501  |     win3    |  -1.7427  |
|  4   |     ibm     |   0.8911  |     wish    |    1.69   |
|  5   |    window   |  -0.8903  |     nasa    |   1.6795  |
|  6   |     win     |  -0.8678  |    window   |  -1.6084  |
|  7   |   monitor   |   0.8255  |    floppy   |   1.5967  |
|  8   |     win3    |   -0.819  |     ide     |   1.5538  |
|  9   |  microsoft  |  -0.7222  |  currently  |   1.5456  |
|  10  |   gateway   |   0.7025  |     ibm     |   1.5095  |
|  11  |     ide     |   0.6961  |   download  |  -1.4465  |
|  12  | motherboa

In [40]:
y_noz_l1 = clf_noz_l1.predict(X_test)
y_noz_l2 = clf_noz_l2.predict(X_test)

print('L2 :', accuracy_score(y_noz_l2, y_test))
print('L1 :', accuracy_score(y_noz_l1, y_test))

L2 : 0.870229007634
L1 : 0.839694656489


### Fit the Model to the Data with Z-score scaling

In [42]:
### Transform the X_train to dense
X_train_dense = X_train.todense()
X_test_dense = X_test.todense()

In [44]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_dense)
X_train_scaled.shape



(1181, 4933)

In [46]:
X_test_scaled = scaler.transform(X_test_dense)
X_test_scaled.shape



(786, 4933)

In [47]:
clf_l2 = LogisticRegression()
clf_l2.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
clf_l1 = LogisticRegression(penalty='l1')
clf_l1.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [137]:
print_features(clf_l1, clf_l2, f_names, 'with Z-score scaling', iter_range=20)

L2 and L1-regularized Logistic Regression Classifier with Z-score scaling
Top 10 features and weights (with absolute value)

+------+---------------+-----------+-------------+-----------+
| Rank |  L2 Features  | L2 Weight | L1 Features | L1 Weight |
+------+---------------+-----------+-------------+-----------+
|  1   |    windows    |  -0.7514  |   windows   |  -2.6773  |
|  2   |      file     |   -0.284  |    window   |  -0.7411  |
|  3   |     window    |  -0.2658  |    files    |  -0.5714  |
|  4   |      win3     |  -0.2561  |     file    |  -0.5552  |
|  5   |      win      |  -0.2424  |    bj200    |  -0.5214  |
|  6   |       nt      |  -0.2287  |     cica    |  -0.4608  |
|  7   |   protected   |   0.2129  |    board    |   0.4477  |
|  8   | manufacturers |   0.2066  |     bus     |   0.4415  |
|  9   |     files     |  -0.2014  |     win3    |  -0.4331  |
|  10  |    monitor    |   0.1958  |   rutgers   |  -0.4327  |
|  11  |    download   |  -0.1956  |     000     |  -0.4

In [79]:
y_pred = clf_l2.predict(X_test)
y_l1 = clf_l1.predict(X_test)


print('L2 :', accuracy_score(y_pred, y_test))
print('L1 :', accuracy_score(y_l1, y_test))

L2 : 0.860050890585
L1 : 0.81679389313


In [80]:
y_pred_proba = clf_l2.predict_proba(X_test_scaled)
y_pred_proba.shape

(786, 2)

In [81]:
y_pred_log_proba = clf_l2.predict_log_proba(X_test_scaled)
y_pred_log_proba.shape

(786, 2)

In [69]:
def print_object_evidence(msg, obj_idx, X_test, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names, iter_range=10):
    print(msg)
    print(' ')
    print('Index of the object : ', obj_idx)
    print(X_test[obj_idx, :])
    print('Class : ', y_test[obj_idx])
    print('Predict Class : ', y_pred[obj_idx])

    if y_test[obj_idx] != y_pred[obj_idx]:
        print('----------False Positive-------------')
        print(' ')

    print('a) Total  positive evidence : ', pos_ev[obj_idx])
    print('b) Total negative evidence : ', neg_ev[obj_idx])
    print('c) Probability distribution', y_pred_proba[obj_idx])

    feature_pos = X_ev[obj_idx,:]
    pos_list = np.argsort(feature_pos)[::-1]
    feature_neg = X_ev[obj_idx,:]
    neg_list = np.argsort(feature_neg)

    print('d) Top', iter_range, 'features values that contribute most to the positive evidence')
    for i in range(0,iter_range):
    #    print('\t',pos_list[i], '\t Evidence Value : ', np.sort(feature_pos)[::-1][i])
        print('\t',f_names[pos_list[i]], '\t Value: ', X_test[obj_idx, pos_list[i]], '\t Evidence: ', np.sort(feature_pos)[::-1][i], '\tCoef_: ', coef_[0, pos_list[i]])   

    print('e) Top', iter_range, 'features values that contribute most to the negative evidence')
    for j in range(0,iter_range):
    #    print('\t',neg_list[j], '\t Evidence Value : ', np.sort(feature_neg)[j])
        print('\t',f_names[neg_list[j]], '\t Value: ', X_test[uncertain_idx, neg_list[j]], '\t Evidence: ', np.sort(feature_neg)[j], '\tCoef_: ', coef_[0, pos_list[i]])

In [55]:
'''Copy the data X and coefficient w_i'''
X = np.copy(X_test_scaled)
coef_ = np.copy(clf_l2.coef_)
n_samples, n_features = X.shape

'''Initialize X_ev : x_i * w_i'''
X_ev = np.zeros((n_samples, n_features))

In [56]:
'''Get evidence'''
'''Calculate the w_ia_i'''
for idx in range(n_samples):
    X_ev[idx, :] = X[idx,:] * coef_

'''Generate the sets of P and N'''
X_pos_ev = X_ev * (X_ev > 0)
X_neg_ev = X_ev * (X_ev < 0)

'''Sum each the set P and N'''
pos_ev = np.sum(X_pos_ev, axis=1)
neg_ev = np.sum(X_neg_ev, axis=1)


### The most positive object with respect to the probabilities

In [57]:
most_pos_obj_idx = np.argmax(y_pred_proba[:,1])

In [70]:
print_object_evidence('Most Positive Object w.r.t Probabilities', 
                      most_pos_obj_idx, X_test_scaled, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names, iter_range=10)

Most Positive Object w.r.t Probabilities
 
Index of the object :  537
[-0.18479884 -0.09241055 -0.12440745 ..., -0.06520507 -0.06520507
 -0.06520507]
Class :  1
Predict Class :  1
a) Total  positive evidence :  39.3744025942
b) Total negative evidence :  -19.5166927197
c) Probability distribution [  2.29971975e-09   9.99999998e-01]
d) Top 10 features values that contribute most to the positive evidence
	 randy 	 Value:  12.9504550389 	 Evidence:  1.02727499381 	Coef_:  0.0793234670693
	 toshiba 	 Value:  9.47872110815 	 Evidence:  0.829930456686 	Coef_:  0.0875572186602
	 msc 	 Value:  12.9504550389 	 Evidence:  0.809522461873 	Coef_:  0.0625091905606
	 range 	 Value:  10.3132747643 	 Evidence:  0.74599367076 	Coef_:  0.0723333458878
	 austin 	 Value:  5.9921824071 	 Evidence:  0.724198341315 	Coef_:  0.120857192274
	 comments 	 Value:  5.41128924398 	 Evidence:  0.703637109718 	Coef_:  0.130031324883
	 1542b 	 Value:  12.9504550389 	 Evidence:  0.659264972351 	Coef_:  0.0509067033067


### The most negative object with respect to the probabilities

In [59]:
most_neg_obj_idx = np.argmin(y_pred_proba[:,1])

In [71]:
print_object_evidence('Most Negative Object w.r.t Probabilities', 
                      most_neg_obj_idx, X_test_scaled, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names)

Most Negative Object w.r.t Probabilities
 
Index of the object :  19
[-0.18479884 -0.09241055 -0.12440745 ..., -0.06520507 -0.06520507
 -0.06520507]
Class :  0
Predict Class :  0
a) Total  positive evidence :  66.1733954293
b) Total negative evidence :  -85.3131487893
c) Probability distribution [  9.99999995e-01   5.03433350e-09]
d) Top 10 features values that contribute most to the positive evidence
	 protected 	 Value:  9.86998817966 	 Evidence:  2.10130061853 	Coef_:  0.212897987341
	 interrupt 	 Value:  8.53302408294 	 Evidence:  1.40005785697 	Coef_:  0.164075226246
	 instructions 	 Value:  9.47872110815 	 Evidence:  1.2207758356 	Coef_:  0.128791196794
	 provided 	 Value:  11.4114951791 	 Evidence:  1.15212231226 	Coef_:  0.100961556236
	 ethernet 	 Value:  9.86998817966 	 Evidence:  1.10164379596 	Coef_:  0.1116155132
	 wish 	 Value:  8.81665091366 	 Evidence:  0.942156430895 	Coef_:  0.106861033756
	 utexas 	 Value:  7.82035131799 	 Evidence:  0.922859760274 	Coef_:  0.1180074

### The object that has the largest positive evidence.

In [63]:
most_pos_ev_idx = np.argmax(pos_ev)

In [72]:
print_object_evidence('Object with Largest Positive Evidence', 
                      most_neg_obj_idx, X_test_scaled, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names, iter_range=10)

Object with Largest Positive Evidence
 
Index of the object :  19
[-0.18479884 -0.09241055 -0.12440745 ..., -0.06520507 -0.06520507
 -0.06520507]
Class :  0
Predict Class :  0
a) Total  positive evidence :  66.1733954293
b) Total negative evidence :  -85.3131487893
c) Probability distribution [  9.99999995e-01   5.03433350e-09]
d) Top 10 features values that contribute most to the positive evidence
	 protected 	 Value:  9.86998817966 	 Evidence:  2.10130061853 	Coef_:  0.212897987341
	 interrupt 	 Value:  8.53302408294 	 Evidence:  1.40005785697 	Coef_:  0.164075226246
	 instructions 	 Value:  9.47872110815 	 Evidence:  1.2207758356 	Coef_:  0.128791196794
	 provided 	 Value:  11.4114951791 	 Evidence:  1.15212231226 	Coef_:  0.100961556236
	 ethernet 	 Value:  9.86998817966 	 Evidence:  1.10164379596 	Coef_:  0.1116155132
	 wish 	 Value:  8.81665091366 	 Evidence:  0.942156430895 	Coef_:  0.106861033756
	 utexas 	 Value:  7.82035131799 	 Evidence:  0.922859760274 	Coef_:  0.1180074555

### The object that has the largest negative evidence.

In [65]:
most_neg_ev_idx = np.argmin(neg_ev)

In [73]:
print_object_evidence('Object with Largest Negative Evidence', 
                      most_neg_ev_idx, X_test_scaled, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names, iter_range=10)

Object with Largest Negative Evidence
 
Index of the object :  353
[  5.41128924  -0.09241055   8.0381037  ...,  15.33623161  15.33623161
  15.33623161]
Class :  0
Predict Class :  0
a) Total  positive evidence :  95.3842545282
b) Total negative evidence :  -107.430355622
c) Probability distribution [  9.99993937e-01   6.06280202e-06]
d) Top 10 features values that contribute most to the positive evidence
	 ra 	 Value:  11.4114951791 	 Evidence:  1.80035574664 	Coef_:  0.157766858627
	 el 	 Value:  10.8212753407 	 Evidence:  1.14690051354 	Coef_:  0.105985706622
	 ts 	 Value:  15.3362316101 	 Evidence:  1.06539737628 	Coef_:  0.0694693066302
	 ___ 	 Value:  8.03810370119 	 Evidence:  1.04404602503 	Coef_:  0.129887105695
	 wang 	 Value:  11.4114951791 	 Evidence:  0.922542533996 	Coef_:  0.0808432654545
	 amd 	 Value:  10.8212753407 	 Evidence:  0.896234375247 	Coef_:  0.0828215110536
	 yo 	 Value:  11.4114951791 	 Evidence:  0.895066654503 	Coef_:  0.0784355284261
	 sb 	 Value:  8.038

### The most uncertain object with respect to the probabilities.

In [67]:
uncertain_idx = np.argmin(np.square(y_pred_proba[:,1]-0.5))

In [74]:
print_object_evidence('The most uncertain object', 
                      uncertain_idx, X_test_scaled, y_pred, y_pred_proba, X_ev, pos_ev, neg_ev, f_names, iter_range=10)

The most uncertain object
 
Index of the object :  379
[-0.18479884 -0.09241055 -0.12440745 ..., -0.06520507 -0.06520507
 -0.06520507]
Class :  1
Predict Class :  1
a) Total  positive evidence :  19.3728532003
b) Total negative evidence :  -19.4071614659
c) Probability distribution [ 0.5003849  0.4996151]
d) Top 10 features values that contribute most to the positive evidence
	 irqs 	 Value:  12.9504550389 	 Evidence:  0.763230439448 	Coef_:  0.0589346426173
	 direction 	 Value:  11.4114951791 	 Evidence:  0.755287249742 	Coef_:  0.0661865283987
	 manuals 	 Value:  9.86998817966 	 Evidence:  0.699173966101 	Coef_:  0.070838379274
	 motherboard 	 Value:  3.84013888638 	 Evidence:  0.670654543096 	Coef_:  0.174643304042
	 windows 	 Value:  -0.850933364748 	 Evidence:  0.63939851851 	Coef_:  -0.751408447475
	 hook 	 Value:  9.47872110815 	 Evidence:  0.597926043425 	Coef_:  0.0630808773253
	 ports 	 Value:  5.80820920058 	 Evidence:  0.487780193577 	Coef_:  0.0839811681591
	 hello 	 Value