# Logistic Regression - (with and without L1 & L2 Regularisation)

In [1]:
# Setup Libraries
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Set random seed for replication.
# np.random.seed() returns None, so the integer is stored in `seed` and NumPy
# is seeded in a separate call; assigning the call's result would leave
# `seed` as None and every later `random_state=seed` unseeded.
seed = 12345
np.random.seed(seed)

In [3]:
# Load the training data, using the 'idx' column as the row index
training = pd.read_csv(
    '../DATA/training.csv',
    index_col='idx',
)

# Preview the first rows
training.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30to59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60to89DaysPastDueNotWorse,NumberOfDependents
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [4]:
# Ensure the dataset has no missing values before continuing
print(training.isnull().sum())

# Summary statistics. Kept as the last expression of the cell so the notebook
# actually renders the table (a bare expression earlier in a cell is discarded).
training.describe(include='all')

SeriousDlqin2yrs                         0
RevolvingUtilizationOfUnsecuredLines     0
age                                      0
NumberOfTime30to59DaysPastDueNotWorse    0
DebtRatio                                0
MonthlyIncome                            0
NumberOfOpenCreditLinesAndLoans          0
NumberOfTimes90DaysLate                  0
NumberRealEstateLoansOrLines             0
NumberOfTime60to89DaysPastDueNotWorse    0
NumberOfDependents                       0
dtype: int64


In [5]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix
from sklearn.cross_validation import train_test_split

In [6]:
# Separate the target (first column) from the input variables (all remaining columns)
y, X = training.iloc[:, 0], training.iloc[:, 1:]

In [7]:
# Hold out part of the data as a test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1
)

In [8]:
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix, mean_absolute_error
from sklearn import grid_search

In [9]:
# Search grid: penalty type, number of inner CV folds, and one candidate C per
# setting (each wrapped in its own list because `Cs` is given as a list)
parameters = {
    'penalty': ['l1', 'l2'],
    'cv': [3, 5],
    'Cs': [[1e10], [1e5], [1e2], [1], [1e-2]],
}

# Grid search over a liblinear logistic regression, scored by ROC AUC
logistic = grid_search.GridSearchCV(
    LogisticRegressionCV(solver='liblinear', random_state=1),
    parameters,
    scoring='roc_auc',
    n_jobs=4,
    refit=True
)
logistic.fit(X_train, y_train)

# Keep the refitted best estimator and report the winning score / settings
best_model = logistic.best_estimator_
print((logistic.best_score_, logistic.best_params_))

(0.6981458278635555, {'Cs': [1], 'penalty': 'l1', 'cv': 3})


In [13]:
# Compare L1- and L2-penalised logistic regression over several values of C.
# C is the inverse of the regularisation strength: the higher it is, the weaker
# the penalty. The first value is set very high, which is effectively
# equivalent to no regularisation.
from datetime import datetime


def _mean_cv_auc(fitted_model):
    """Return the mean cross-validation score of a fitted LogisticRegressionCV.

    Reads `fitted_model.scores_` (a dict), takes its first value, averages it
    over its first axis and returns element 0 of the result, i.e. the mean
    score for the first (here: only) C that was tried.
    """
    # list(...) keeps this working on both Python 2 and Python 3 dict views
    fold_scores = list(fitted_model.scores_.values())[0]
    return (sum(fold_scores) / float(len(fold_scores)))[0]


startTime = datetime.now()

n = 5       # number of cross-validation folds
AUC1 = []   # mean AUC per C, L1 penalty
AUC2 = []   # mean AUC per C, L2 penalty
for C in (1e10, 1e2, 1):
    # Pass C unchanged to both models; casting it to int would turn any
    # C < 1 into an invalid 0.
    # NOTE(review): the L2 model sets no solver, so it runs with the library
    # default while the L1 model uses liblinear - the L1/L2 gap may partly be
    # a solver effect rather than a penalty effect. Confirm before relying on it.
    logreg1 = LogisticRegressionCV(Cs=[C], cv=n, penalty='l1', solver='liblinear',
                                   scoring='roc_auc', random_state=seed)
    logreg2 = LogisticRegressionCV(Cs=[C], cv=n, penalty='l2',
                                   scoring='roc_auc', random_state=seed)

    # Fit the models
    logreg1.fit(X, y)
    logreg2.fit(X, y)

    # Mean ROC AUC across the folds
    meanval1 = _mean_cv_auc(logreg1)
    meanval2 = _mean_cv_auc(logreg2)

    print("C=%.2f" % C)
    print(" ")
    print("Mean AUC Score with L1 penalty: %.4f" % meanval1)
    print(" ")
    print("Mean AUC Score with L2 penalty: %.4f" % meanval2)

    AUC1.append(meanval1)
    AUC2.append(meanval2)


# A single formatted argument keeps the output identical under the Python 2
# print statement and the Python 3 print function.
print("The average AUC score for for L1 Penalty is %s" % np.mean(AUC1))
print("The average AUC score for for L2 Penalty is %s" % np.mean(AUC2))
print(" ")
print(AUC1)
print(" ")
print(AUC2)
print(" ")
# Wall-clock time taken by the whole comparison
print(datetime.now() - startTime)


C=10000000000.00
 
Mean AUC Score with L1 penalty: 0.6999
 
Mean AUC Score with L2 penalty: 0.6767
C=100.00
 
Mean AUC Score with L1 penalty: 0.7001
 
Mean AUC Score with L2 penalty: 0.6788
C=1.00
 
Mean AUC Score with L1 penalty: 0.6995
 
Mean AUC Score with L2 penalty: 0.6798
The average AUC score for for L1 Penalty is 0.699848385146
The average AUC score for for L2 Penalty is 0.678427651865
 
[0.69986704539762778, 0.7001337419617123, 0.69954436807745723]
 
[0.67665684808233595, 0.67878695418564561, 0.67983915332553735]
 
0:01:03.429808


## Evaluation

As the results of the cross-validated logistic regression models show, their performance is quite poor. There is also a negligible difference between the AUC scores when comparing L1 and L2 regularisation. Note that the L1 models were fitted with the liblinear solver while the L2 models used the default solver, so the comparison is not strictly like-for-like. Indeed, given the small number of variables, any regularisation would more than likely degrade the model fit. The best AUC score we were able to obtain under 5-fold cross-validation was 0.70, which for a first model is probably not half bad. It does, however, indicate the likely non-linear structure of our data.

Any models produced from now on will not include explicit regularisation. 

## AUC L1 = 0.6998
## AUC L2 = 0.6784

In [10]:
# Load the test data that the final model will score,
# using the 'idx' column as the row index
test = pd.read_csv(
    '../DATA/test.csv',
    index_col='idx',
)

In [11]:
# Model inputs for the test set: every column except the first one.
# NOTE(review): presumably the first column is the (unlabelled) target, mirroring
# the X / y split used on the training data - confirm against test.csv.
X_ = test.iloc[:, 1:]

In [12]:
# Final model: L1-penalised logistic regression with C=100 and 5-fold CV,
# fitted on the full training data
logreg_ = LogisticRegressionCV(Cs=[100], cv=5, penalty='l1', solver='liblinear',
                               scoring='roc_auc', random_state=seed)
logreg_.fit(X, y)

# Second column of predict_proba, i.e. the probability of class 1 for a 0/1 target.
# (The hard-label predict() call was dropped: its result was never used.)
logreg_preds = logreg_.predict_proba(X_)[:, 1]

# Keep the test-set index so every probability stays attached to its row id
df = pd.DataFrame(logreg_preds, index=X_.index)

# df.to_csv('../OUTPUT/preds_2.csv')

## Outcome of Private Score Submission: AUC = 0.697912