# Logistic Regression - (with and without L1 & L2 Regularisation)

In [1]:
# Setup Libraries
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# import the training data
training = pd.read_csv('training.csv',index_col= 'idx' )

# view the data
training.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30to59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60to89DaysPastDueNotWorse,NumberOfDependents
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [3]:
training.describe(include='all')
print training.isnull().sum() #ensure full dataset before continuing

SeriousDlqin2yrs                         0
RevolvingUtilizationOfUnsecuredLines     0
age                                      0
NumberOfTime30to59DaysPastDueNotWorse    0
DebtRatio                                0
MonthlyIncome                            0
NumberOfOpenCreditLinesAndLoans          0
NumberOfTimes90DaysLate                  0
NumberRealEstateLoansOrLines             0
NumberOfTime60to89DaysPastDueNotWorse    0
NumberOfDependents                       0
dtype: int64


In [4]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix

In [5]:
# Split the target from the input variables
X = training.iloc[:, 1:]
y = training.iloc[:, 0]

In [7]:
# Fitting a standard logistic regression model
# C is a cutoff for coefficients. The higher it is, the less the amount of regularisation. 
# So we set it very high first which is equivalent to no regularisation
# def __logreg__(n):
from datetime import datetime
startTime = datetime.now()

n=5
AUC1 = []
AUC2 = []
for i , C in enumerate((1e10,1e2,1)):
    c = int(C)
    # Turn down tolerance iteratively. Lower tolerance means increased chance of zero coefficients
    # L1 and L2 setup
    logreg1 = LogisticRegressionCV(Cs=[C],cv=n,penalty='l1',solver='liblinear') 
    logreg2 = LogisticRegressionCV(Cs=[c],cv=n,penalty='l2')
    # fit the models
    logreg1.fit(X, y)
    logreg2.fit(X, y)
    # Coefficients
    coef_l1 = logreg1.coef_.ravel()
    coef_l2 = logreg2.coef_.ravel()
    # Calcualate the sparsity
    sparsity_l1_LR = np.mean(coef_l1 == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2 == 0) * 100
    
    print("C=%.2f" % C)
    
    print("Sparsity with L1 penalty: %.2f%%" % sparsity_l1_LR)
    print("Score with L1 penalty: %.4f" % logreg1.score(X, y))
    
    print("Sparsity with L2 penalty: %.2f%%" % sparsity_l2_LR)
    print("Score with L2 penalty: %.4f" % logreg2.score(X, y))
        
    # Predict
    prds1 = logreg1.predict(X)
    prds2 = logreg2.predict(X)

    print "Confusion matrix for C=%.2f" % C , "and n= %i " % n , "fold cross validation and Penalty= L1" 
    print confusion_matrix(y, prds1)
    print "The Area under the Curve is" , roc_auc_score(y, prds1)
    
    print "Confusion matrix for C=%.2f" % C , "and n= %i " % n , "fold cross validation and Penalty= L2"  
    print confusion_matrix(y, prds2)
    print "The Area under the Curve is" , roc_auc_score(y, prds2)
    
    xx1 = roc_auc_score(y, prds1)
    xx2 = roc_auc_score(y, prds2)
    AUC1.append(xx1)
    AUC2.append(xx2)
    
print datetime.now() - startTime

print "The average AUC score for for L1 Penalty is" , np.mean(AUC1)
print "The average AUC score for for L2 Penalty is" , np.mean(AUC2)

C=10000000000.00
Sparsity with L1 penalty: 0.00%
Score with L1 penalty: 0.9333
Sparsity with L2 penalty: 0.00%
Score with L2 penalty: 0.9333
Confusion matrix for C=10000000000.00 and n= 5  fold cross validation and Penalty= L1
[[139777    197]
 [  9801    225]]
The Area under the Curve is 0.510517123737
Confusion matrix for C=10000000000.00 and n= 5  fold cross validation and Penalty= L2
[[139765    209]
 [  9798    228]]
The Area under the Curve is 0.510623869645
C=100.00
Sparsity with L1 penalty: 0.00%
Score with L1 penalty: 0.9337
Sparsity with L2 penalty: 0.00%
Score with L2 penalty: 0.9332
Confusion matrix for C=100.00 and n= 5  fold cross validation and Penalty= L1
[[139705    269]
 [  9675    351]]
The Area under the Curve is 0.516543595593
Confusion matrix for C=100.00 and n= 5  fold cross validation and Penalty= L2
[[139818    156]
 [  9857    169]]
The Area under the Curve is 0.507870840628
C=1.00
Sparsity with L1 penalty: 0.00%
Score with L1 penalty: 0.9337
Sparsity with L2 

## Evaluation

As one can see from the results of the cross validate logistic regression models, they do not perform well. There is no significant impact on the model fit when comparing the L1 and L2 regularisation. Indeed given the small number of variables, any regularisation would more than likely degrade the model fit. The AUC yielded results that were only just better than 0.5 which is what we would expect from random guessing

Any models produced from no on will not include explicit regularisation. 

## AUC L1 = 0.514662996417
## AUC L2 = 0.511440660929