# Linear Discriminant Analysis

In [1]:
# Setup Libraries - if needed
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# import the training data
training = pd.read_csv('../DATA/training.csv',index_col= 'idx' )


# view the data
training.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30to59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60to89DaysPastDueNotWorse,NumberOfDependents
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [3]:
# Import SciKit Learn functions
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.cross_validation import train_test_split, KFold

In [4]:
# Split the target from the input variables
X = training.iloc[:, 1:]
y = training.iloc[:, 0]

In [5]:
def LDA_ITER(n):
    global AUC
    AUC = []    
    for x in range(0, n):
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=x)
        print X_train.shape 
        print X_test.shape 
        print y_train.shape 
        print y_test.shape
        # Create the LDA object7 
        LDA = LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, 
                                         store_covariance=False)

        # Fit to the data
        LDAfit = LDA.fit(X_train,y_train)

        # Output model evaluation
        LDApred = LDA.predict(X_test)
        print "The confusion matrix is " 
        print confusion_matrix(y_test, LDApred)
        print "The area under the curve is" , roc_auc_score(y_test, LDApred)
        xx = roc_auc_score(y_test, LDApred)
        AUC.append(xx)

    
LDA_ITER(10)

print "The average AUC score is" , np.mean(AUC)

(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34709   244]
 [ 2295   252]]
The area under the curve is 0.545979563268
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34748   220]
 [ 2278   254]]
The area under the curve is 0.547012244641
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34698   246]
 [ 2281   275]]
The area under the curve is 0.550275074593
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34774   253]
 [ 2207   266]]
The area under the curve is 0.550169333296
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34707   237]
 [ 2309   247]]
The area under the curve is 0.544926543771
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34747   211]
 [ 2288   254]]
The area under the curve is 0.546942753694
(112500, 10)
(37500, 10)
(112500L,)
(37500L,)
The confusion matrix is 
[[34827   233]
 [ 2173   267]]
The area under the curve is 

## Evaluation

The results of the Linear Discriminant Analysis do not perform well. The results are worst than logistic regression with an AUC score of 0.548207825381. Note that Python does not have an internal CV tool for LDA, as well as a scoring mechanism that is internally assessed by the algorithm. There are also a number of assumptions that come into play when using Linear Discriminant Analysis. The most important of them being that the conditional distribution of our features given the target are gaussian distributed. This can be an unreasonable expectation.

## AUC = 0.548

### Alternative approach to cross validaton - using GridSearch

In [6]:
def cv_LDA(n):
    kfCV = KFold(len(training), n_folds = n )
    AUC = []
    for train_index, test_index in kfCV:
        X_train , X_test = X.iloc[train_index], X.iloc[test_index]
        y_train , y_test = y.iloc[train_index], y.iloc[test_index]

        LDA = LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, 
                                   store_covariance=False)
        LDAfit = LDA.fit(X_train,y_train)    
        # Output model evaluation
        LDApred = LDA.predict(X_test) 
        print "The confusion matrix is " 
        print confusion_matrix(y_test, LDApred)
        print ""
        print "The area under the curve is" , roc_auc_score(y_test, LDApred)
        xx = roc_auc_score(y_test, LDApred)
        AUC.append(xx)
    print ""
    print  "The average AUC score is" , np.mean(AUC)

In [7]:
cv_LDA(10)

The confusion matrix is 
[[13931    98]
 [  868   103]]

The area under the curve is 0.54954534006
The confusion matrix is 
[[13935    95]
 [  869   101]]

The area under the curve is 0.548676253389
The confusion matrix is 
[[13830   101]
 [  969   100]]

The area under the curve is 0.543147675779
The confusion matrix is 
[[13911    93]
 [  903    93]]

The area under the curve is 0.543366267125
The confusion matrix is 
[[13890   119]
 [  883   108]]

The area under the curve is 0.550243144111
The confusion matrix is 
[[13943    90]
 [  869    98]]

The area under the curve is 0.547465455006
The confusion matrix is 
[[13879    79]
 [  938   104]]

The area under the curve is 0.547074112384
The confusion matrix is 
[[13941    83]
 [  873   103]]

The area under the curve is 0.549807180665
The confusion matrix is 
[[13923    94]
 [  894    89]]

The area under the curve is 0.541916511639
The confusion matrix is 
[[13845    94]
 [  950   111]]

The area under the curve is 0.548937307897



So under the alternative approach, using scikit learn to use a more rigorous method for cross validation, we were still unable to produce improvements. LDA works better we out classification has much more balanced priors. In this case, out priors are extremely unbalanced so its probably no surprise that this method yields the worst results.