### Using regularized logistic regression to classify email

In [1]:
import scipy.io
import utils
import numpy as np
from sklearn import linear_model

# This driver cell is meant to be run unmodified: the helpers it calls live in
# the imported `utils` module (the original note said "util.py" -- presumably
# utils.py; confirm the file name) and must be completed there first.

# Load the spam data set, already split into train/test features and labels.

Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Show the (rows, columns) shape of the training feature matrix.
print Xtrain.shape

# Build three alternative representations of the training features.
# std_features also returns the mu/sigma it used, so the test set below can be
# scaled with the training statistics rather than its own.
# NOTE(review): log_features / bin_features presumably apply a log transform and
# a binarization respectively -- confirm against utils.

Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# Apply the same three representations to the test set; standardization reuses
# the training mu/sigma computed above.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)


# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,type,penalty):

    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)
    print "best_lambda = ", best_lambda

    # train a classifier on best_lambda and run it
    if penalty == "l2":
        lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda, solver='lbfgs',fit_intercept=True)
    else:
        lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda, solver='liblinear',fit_intercept=True)
    lreg.fit(X,ytrain)
    print "Coefficients = ", lreg.intercept_,lreg.coef_
    predy = lreg.predict(Xt)
    print "Accuracy on set aside test set for ", type, " = ", np.mean(predy==ytest)

print "L2 Penalty experiments -----------"
run_dataset(Xtrain_std,ytrain,Xtest_std,ytest,"std","l2")
run_dataset(Xtrain_logt,ytrain,Xtest_logt,ytest,"logt","l2")
run_dataset(Xtrain_bin,ytrain,Xtest_bin,ytest,"bin","l2")

print "L1 Penalty experiments -----------"
run_dataset(Xtrain_std,ytrain,Xtest_std,ytest,"std","l1")
run_dataset(Xtrain_logt,ytrain,Xtest_logt,ytest,"logt","l1")
run_dataset(Xtrain_bin,ytrain,Xtest_bin,ytest,"bin","l1")

(3065, 57)
L2 Penalty experiments -----------
reg =  0.1
reg =  0.6
reg =  1.1
reg =  1.6
reg =  2.1
reg =  2.6
reg =  3.1
reg =  3.6
reg =  4.1
reg =  4.6
best_lambda =  4.1
Coefficients =  [-1.62837284] [[-0.01839052 -0.21661119  0.13128384  0.48674888  0.2602243   0.18532733
   0.90344911  0.31288822  0.14199547  0.06198638 -0.05335911 -0.15162932
  -0.0516569   0.02767041  0.23856918  0.76613529  0.46856035  0.08308522
   0.26257561  0.22073129  0.26177729  0.41125323  0.7503693   0.26021176
  -1.80207063 -0.62172528 -1.83095331 -0.11174736 -0.67814627 -0.16857307
  -0.29711007 -0.20770702 -0.41815432 -0.42931161 -0.34816875  0.32415601
   0.010483   -0.14344427 -0.3803836  -0.09968338 -0.63272648 -0.95488787
  -0.32285734 -0.7132242  -0.79373552 -1.16416329 -0.133999   -0.67460068
  -0.33001795 -0.15734097 -0.11687446  0.22802517  1.48301759  0.49456055
  -0.12310253  0.83739199  0.38195683]]
Accuracy on set aside test set for  std  =  0.921875
reg =  0.1
reg =  0.6
reg =  1.1
reg

In [6]:
from sklearn import model_selection
def select_lambda_crossval(X,y,lambda_low,lambda_high,lambda_step,penalty):

    best_lambda = lambda_low

    # Your code here
    # Implement the algorithm above.
    num_folds = 5
    best_accuracy = 0.0
    kf = model_selection.KFold(n_splits = num_folds)
    for reg in np.arange(lambda_low, lambda_high, lambda_step):
        accuracy = 0;
        for train_index, test_index in kf.split(X):
            X_train = X[train_index]
            y_train = y[train_index]
            X_test = X[test_index]
            y_test = y[test_index]
            if (penalty == "l2"):
                sk_logreg = linear_model.LogisticRegression(C=1.0/reg, solver='lbfgs',fit_intercept=False, penalty=penalty)
            elif (penalty == "l1"):
                sk_logreg = linear_model.LogisticRegression(C=1.0/reg, solver='saga',fit_intercept=True,penalty=penalty)
            else:
                raise ValueError("Incorrect penalty type! Penalty can only be l2 or l1.")
#             sk_logreg.fit(X_train, y_train)
            sk_logreg.fit(X_train, y_train)
#             print X_train.shape
            y_pred = sk_logreg.predict(X_test)
#             y_pred = bin_features(X_test.dot(sk_logreg.coef_[0]))
#             print y_pred
#             print y_test
            cur_accuracy= float(np.sum(y_pred == y_test)) / y_test.shape[0]
#             print "lambda = ", reg, ", accuracy = ", cur_accuracy      
            accuracy += cur_accuracy
            # plot_utils.plot_decision_boundary_sklearn(X_train, y_train, sk_logreg, 'Chip Test 1', 'Chip Test 2',['y = 0','y = 1']):
        accuracy = accuracy / num_folds
        print "lambda = ", reg, ", accuracy = ", accuracy, "best_accuracy",  best_accuracy
        if (accuracy > best_accuracy):
            best_accuracy = accuracy
            best_lambda = reg
        
    # end your code

    return best_lambda
# Try the local implementation on the standardized training set with an L1
# penalty, scanning lambda over np.arange(0.3, 2, 0.1); the cell displays the
# selected lambda.
select_lambda_crossval(
    Xtrain_std,
    ytrain,
    lambda_low=0.3,
    lambda_high=2,
    lambda_step=0.1,
    penalty="l1",
)

lambda =  0.3 , accuracy =  0.86101141925 best_accuracy 0.0
lambda =  0.4 , accuracy =  0.86101141925 best_accuracy 0.86101141925
lambda =  0.5 , accuracy =  0.860032626427 best_accuracy 0.86101141925
lambda =  0.6 , accuracy =  0.860685154976 best_accuracy 0.86101141925
lambda =  0.7 , accuracy =  0.859706362153 best_accuracy 0.86101141925
lambda =  0.8 , accuracy =  0.859053833605 best_accuracy 0.86101141925
lambda =  0.9 , accuracy =  0.859380097879 best_accuracy 0.86101141925
lambda =  1.0 , accuracy =  0.858401305057 best_accuracy 0.86101141925
lambda =  1.1 , accuracy =  0.859053833605 best_accuracy 0.86101141925
lambda =  1.2 , accuracy =  0.857422512235 best_accuracy 0.86101141925
lambda =  1.3 , accuracy =  0.857096247961 best_accuracy 0.86101141925
lambda =  1.4 , accuracy =  0.866231647635 best_accuracy 0.86101141925
lambda =  1.5 , accuracy =  0.865905383361 best_accuracy 0.866231647635
lambda =  1.6 , accuracy =  0.865905383361 best_accuracy 0.866231647635
lambda =  1.7 , 

1.4000000000000004