# Using regularized logistic regression to classify email

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model
#import sklearn.cross_validation
from sklearn import model_selection
#from sklearn.cross_validation import KFold
import scipy.io

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# No modifications in this cell
# Complete the functions in utils.py first; then run this cell.
# It loads the spam data and builds three alternative feature representations
# (standardized, log-transformed, binarized) of both the training and test sets.

# Train/test split as returned by utils.load_spam_data().
Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Preprocess the data
# NOTE(review): the semantics of std_features / log_features / bin_features live in
# utils.py and are not visible here; the names suggest standardization, a log
# transform and binarization respectively -- confirm against utils.py.

# std_features also returns mu and sigma, which are reused below for the test set.
Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# The test set is scaled with the mu/sigma obtained from the TRAINING set (not
# recomputed on the test set), so both sets go through the same transformation.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty,lambda_low=0.1,lambda_high=5.1,lambda_step=0.5):
    """Select lambda by cross-validation, fit a regularized logistic regression, report test accuracy.

    Steps:
      1. Ask utils.select_lambda_crossval for the best regularization strength
         on (X, ytrain) for the given penalty.
      2. Fit sklearn's LogisticRegression on the full training set with
         C = 1 / best_lambda.
      3. Print the chosen lambda, the fitted parameters, and the fraction of
         correct predictions on (Xt, ytest).

    Parameters
    ----------
    X, ytrain : training features and labels.
    Xt, ytest : held-out test features and labels; Xt must have been
        preprocessed the same way as X.
    typea : str
        Label for the feature representation (e.g. "std", "logt", "bin");
        only used in the printed accuracy line.
    penalty : str
        Regularization type passed to LogisticRegression ("l2" or "l1").
    lambda_low, lambda_high, lambda_step : float, optional
        Forwarded positionally as the 3rd, 4th and 5th arguments of
        utils.select_lambda_crossval. Presumably the start, end and step of
        the lambda grid to search -- confirm against utils.py. The defaults
        reproduce the values that were previously hard-coded.

    Returns
    -------
    None. Results are reported via print only.
    """
    best_lambda = utils.select_lambda_crossval(X,ytrain,lambda_low,lambda_high,lambda_step,penalty)
    print("best_lambda = %.3f" %best_lambda)

    # Train a classifier with the selected lambda. 'lbfgs' is used for the l2
    # penalty; every other penalty goes to 'liblinear' (lbfgs has no l1 support).
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda,solver=solver,fit_intercept=True)
    lreg.fit(X,ytrain)

    # Despite the label, the first value printed is the intercept; the weight
    # matrix follows as a second print argument. Output format kept as-is.
    print("Coefficients = %s" %lreg.intercept_,lreg.coef_)

    # Accuracy = fraction of test examples whose predicted label equals the true label.
    predy = lreg.predict(Xt)
    print("Accuracy on set aside test set for %s = %.4f" %(typea, np.mean(predy==ytest)))

# Run every feature representation under each penalty: all L2 runs first, then all L1 runs.
experiment_sets = [
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
]
experiment_penalties = [
    ("l2", "L2 Penalty experiments -----------"),
    ("l1", "L1 Penalty experiments -----------"),
]

for exp_penalty, exp_banner in experiment_penalties:
    print(exp_banner)
    for exp_label, exp_train, exp_test in experiment_sets:
        run_dataset(exp_train, ytrain, exp_test, ytest, exp_label, exp_penalty)

L2 Penalty experiments -----------




best_lambda = 4.100
Coefficients = [-1.62837284] [[-0.01839052 -0.21661119  0.13128384  0.48674888  0.2602243   0.18532733
   0.90344911  0.31288822  0.14199547  0.06198638 -0.05335911 -0.15162932
  -0.0516569   0.02767041  0.23856918  0.76613529  0.46856035  0.08308522
   0.26257561  0.22073129  0.26177729  0.41125323  0.7503693   0.26021176
  -1.80207063 -0.62172528 -1.83095331 -0.11174736 -0.67814627 -0.16857307
  -0.29711007 -0.20770702 -0.41815432 -0.42931161 -0.34816875  0.32415601
   0.010483   -0.14344427 -0.3803836  -0.09968338 -0.63272648 -0.95488787
  -0.32285734 -0.7132242  -0.79373552 -1.16416329 -0.133999   -0.67460068
  -0.33001795 -0.15734097 -0.11687446  0.22802517  1.48301759  0.49456055
  -0.12310253  0.83739199  0.38195683]]
Accuracy on set aside test set for std = 0.9219




best_lambda = 0.600
Coefficients = [-4.60944617] [[-0.45145875 -0.28466495 -0.06327731  0.68295812  1.21053202  0.91505006
   2.83046274  1.4367798   0.24145467  0.35775817 -0.38642819 -0.4814281
  -0.69586878  0.37457001  0.64885487  1.53956274  1.38118288  0.07197719
   0.37642284  0.63501959  0.52274768  0.3856372   2.00138718  1.50817418
  -3.14060875 -0.66617071 -4.90648494 -0.03260376 -1.28886324 -0.15745816
  -0.63899741 -0.30229202 -1.0099018  -0.42568621 -1.08721685  1.28432907
  -0.90558935 -0.35285898 -1.12971423 -0.6258932  -1.40337084 -2.44123424
  -1.55653413 -1.94778103 -1.13113638 -2.79991175 -0.751223   -2.11602044
  -1.68510856 -0.66773463 -0.69125592  2.06913162  4.21977699  0.76308941
   0.70345801  0.17008549  0.43018823]]
Accuracy on set aside test set for logt = 0.9434




best_lambda = 1.600
Coefficients = [-1.82566816] [[-1.78313887e-01 -1.60085506e-01 -3.73001110e-01  2.36358803e-01
   9.46367588e-01  1.59613651e-01  2.03690641e+00  7.62617293e-01
   1.81159712e-01  3.12388353e-01 -2.60352275e-01 -4.14115142e-01
  -8.66097179e-01  2.36335390e-01  4.75358415e-01  1.43030139e+00
   8.23118667e-01 -6.18540141e-02  2.39595773e-01  4.50237962e-01
   7.24354332e-01  1.06352180e+00  8.70212070e-01  1.30340906e+00
  -2.20348245e+00 -4.57176451e-01 -3.39242058e+00  5.45347540e-01
  -5.60588209e-01 -1.85244388e-01 -8.05548612e-01 -4.84223733e-01
  -6.36751901e-01 -8.68074831e-02 -6.31860077e-01  3.04485691e-01
  -1.03756760e+00  4.18380738e-01 -7.08628404e-01 -2.18361509e-01
  -1.07385026e+00 -1.74862153e+00 -6.95533233e-01 -1.43004581e+00
  -7.40200632e-01 -2.11078935e+00 -9.46977030e-02 -1.24285032e+00
  -2.91376072e-01  1.90460650e-01 -1.65731167e-01  1.19345678e+00
   1.42337675e+00  6.04361397e-02  7.86190132e-04  7.86190132e-04
   7.86190132e-04]]
Accurac