### Using regularized logistic regression to classify email

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model

import scipy.io

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# No modifications in this cell
# complete the functions in utils.py; then run the cell

# Load the spam data: train/test feature matrices and their labels.
Xtrain, Xtest, ytrain, ytest = utils.load_spam_data()

# Build three alternative feature representations. Each transform is applied
# to the training split first and then to the test split.

# 1) Standardized features: the test split reuses the mu/sigma returned for
#    the training split instead of computing its own.
Xtrain_std, mu, sigma = utils.std_features(Xtrain)
Xtest_std = (Xtest - mu) / sigma

# 2) utils.log_features (presumably a log transform -- see utils.py).
Xtrain_logt = utils.log_features(Xtrain)
Xtest_logt = utils.log_features(Xtest)

# 3) utils.bin_features (presumably a binarization -- see utils.py).
Xtrain_bin = utils.bin_features(Xtrain)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty):
    """Pick lambda by cross-validation, fit a regularized logistic regression,
    and print the fitted parameters and the test-set accuracy.

    Parameters
    ----------
    X : training feature matrix; used for the lambda search and for fitting.
    ytrain : training labels for the rows of ``X``.
    Xt : test feature matrix the fitted classifier predicts on.
    ytest : test labels; may be 1-D or a single-column 2-D array.
    typea : short label for the feature set, used only in the accuracy message.
    penalty : regularization type passed to scikit-learn. ``"l2"`` is fitted
        with the ``lbfgs`` solver, any other value with ``liblinear``.

    Returns
    -------
    None. All results are reported via ``print``.
    """
    # The numeric arguments are presumably the lambda search range and step
    # (low, high, step) -- confirm against utils.select_lambda_crossval.
    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)
    print("best_lambda = %.3f" %best_lambda)

    # Train a classifier with the selected lambda. scikit-learn is
    # parameterized by C, the inverse of the regularization strength.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda,
                                           solver=solver,fit_intercept=True)
    lreg.fit(X,ytrain)

    # Report intercept and coefficients under their own labels. The arrays are
    # wrapped in 1-tuples so %-formatting always treats each as one value.
    print("Intercept = %s" % (lreg.intercept_,))
    print("Coefficients = %s" % (lreg.coef_,))

    predy = lreg.predict(Xt)
    # Flatten both sides before comparing: if ytest were a column vector,
    # `predy == ytest` would broadcast to an (n, n) matrix and the mean would
    # no longer be the fraction of correct predictions.
    accuracy = np.mean(np.ravel(predy) == np.ravel(ytest))
    print("Accuracy on set aside test set for %s = %.4f" %(typea, accuracy))

# Every (label, train features, test features) combination to evaluate,
# in the order in which the results are reported.
feature_sets = (
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
)

# Run all feature sets under the L2 penalty first, then under L1, printing a
# banner ahead of each group of experiments.
for banner, penalty_kind in (
    ("L2 Penalty experiments -----------", "l2"),
    ("L1 Penalty experiments -----------", "l1"),
):
    print(banner)
    for set_label, X_fit, X_eval in feature_sets:
        run_dataset(X_fit,ytrain,X_eval,ytest,set_label,penalty_kind)

L2 Penalty experiments -----------
best_lambda = 0.100
Coefficients = [-4.8631135] [[ -2.74146034e-02  -2.25297683e-01   1.21840882e-01   2.29362958e+00
    2.70425727e-01   2.32851135e-01   9.28595397e-01   2.95200204e-01
    1.62205925e-01   6.78259098e-02  -8.32603808e-02  -1.60373348e-01
   -4.72247990e-02   1.07676967e-02   1.87903762e-01   8.19771791e-01
    5.09529030e-01   3.98710870e-02   2.67729669e-01   3.47047297e-01
    2.60498935e-01   3.64605710e-01   7.25019842e-01   1.96728229e-01
   -3.15395711e+00  -4.03133852e-01  -1.25451036e+01  -6.16576481e-02
   -1.56114581e+00  -5.51430777e-02  -3.00823111e-02   4.07263812e-01
   -3.68156521e-01  -1.43611917e+00  -5.87182163e-01   4.44294629e-01
    4.23159797e-02  -1.56897100e-01  -4.55330679e-01  -1.02250215e-01
   -3.54273318e+00  -1.72944429e+00  -4.37529498e-01  -1.05999940e+00
   -9.18599255e-01  -1.75490290e+00  -1.67475811e-01  -9.56875749e-01
   -3.65653442e-01  -1.36535593e-01  -6.58692632e-02   2.06714074e-01
    1.7