Setting Default Values

In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#Changing Object to category

In [2]:
def change_obj_to_category(df):
    i=0;
    while i<df.columns.size:
        if (df.dtypes[df.columns[i]] == 'object' or df.dtypes[df.columns[i]] == 'bool'):
            col = df.dtypes.index[i]
            df[col] = df[col].astype('category')
        i = i+1 

#Changing Int objects to float

In [3]:
def change_int_to_float(df):
    i=0;
    while i<df.columns.size:
        if 'int' in str(df.dtypes[df.columns[i]]):
            col = df.dtypes.index[i]
            df[col] = df[col].astype('float64')
        i = i+1

#Generating XY

In [4]:
def generateXY(df,target_col,var_list):
   
    #make a copy of the required subset and drop rows containing Na
    mdata = df[var_list+[target_col]].copy()
    mdata.dropna(inplace=True)
   
    #Separate target and put it in Y
    Y = mdata[target_col].tolist()
    del mdata[target_col]
   
    #Now, process data and create dummy variables if required with final data in Xvars
    import pandas
    Xvars = pandas.DataFrame()
    import pandas as pd
    for cols in var_list:
        if (str(mdata[cols].dtype) == 'category'):
            dummySer= pd.get_dummies(mdata[cols],prefix=cols+'_')
            Xvars = pd.concat([Xvars,dummySer],axis=1)
        else:
            Xvars =  pd.concat([Xvars,mdata[cols]],axis=1)
   
    X = Xvars.values.tolist()
   
    return X,Y

In [224]:
def generateX(df,var_list):
   
    #make a copy of the required subset and drop rows containing Na
    mdata = df[var_list].copy()
    mdata.dropna(inplace=True)
   
    #Now, process data and create dummy variables if required with final data in Xvars
    import pandas
    Xvars = pandas.DataFrame()
    import pandas as pd
    for cols in var_list:
        if (str(mdata[cols].dtype) == 'category'):
            dummySer= pd.get_dummies(mdata[cols],prefix=cols+'_')
            Xvars = pd.concat([Xvars,dummySer],axis=1)
        else:
            Xvars =  pd.concat([Xvars,mdata[cols]],axis=1)
   
    X = Xvars.values.tolist()
   
    return X

#Scoring ROC for the model

In [5]:
def roc_score_model(model,X,Y):
    from sklearn.metrics import roc_auc_score
    return roc_auc_score(Y,pd.DataFrame((model.predict_proba(X)))[1].tolist())

#Feature Improtance

In [6]:
def feature_imp(colNames,imps):
    df = pd.DataFrame(columns=('Feature','Importance'),index=[x for x in range(0,len(colNames))])
    i = 0
    for col in colNames:
        df['Feature'][i] = col
        df['Importance'][i] = imps[i]
        i=i+1
   
    df = df.sort_values(by='Importance',ascending=False)
    return df

#predicting the test values with threshold

In [7]:
def predict_th(model,tx,ty,threshold=0.5):
    import pandas as pd
    probs = model.predict_proba(tx)[:,1].tolist()
    predictions = []
    for i in range(0,len(ty)):
        if probs[i]>threshold:
            predictions.append(1)
        else:
            predictions.append(0)
   
    return predictions 

In [8]:
pwd

'/Users/ashokvardhan/Downloads/Projects/Lending_Club'

#Reading the DataSet

In [245]:
print('Starting data read from csv file.')
match_data = pd.read_csv("train.csv", low_memory = False)

Starting data read from csv file.


In [246]:
print('Starting data read from csv file.')
test_data = pd.read_csv("test.csv", low_memory = False)

Starting data read from csv file.


In [247]:
print('Deleting columns containing huge missing values, Single valued items and id columns which are of no use.')
print('No data leakage is present as such.')

Deleting columns containing huge missing values, Single valued items and id columns which are of no use.
No data leakage is present as such.


In [248]:
match_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'], dtype='object')

In [249]:
test_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area'], dtype='object')

In [250]:
del match_data['Loan_ID']

In [251]:
sample=test_data['Loan_ID']

In [252]:
del test_data['Loan_ID']

In [253]:
match_data.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [254]:
test_data.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [255]:
match_data.shape

(614, 12)

In [256]:
test_data.shape

(367, 11)

In [257]:
# Check missing values
def num_missing(x):
    return sum(x.isnull())

#Apply per column

print ("Missing values per column:")
print (match_data.apply(num_missing, axis=0))

Missing values per column:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [258]:
# Check missing values
def num_missing(x):
    return sum(x.isnull())

#Apply per column

print ("Missing values per column:")
print (test_data.apply(num_missing, axis=0))

Missing values per column:
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


In [259]:
print('Changing the type of columns from object to category.')

Changing the type of columns from object to category.


In [260]:
change_obj_to_category(match_data)

In [261]:
change_obj_to_category(test_data)

In [262]:
from sklearn import preprocessing

categorical = match_data.select_dtypes(include=['category'])
numeric = match_data.select_dtypes(exclude=['category'])

print(categorical.columns.values)
print(numeric.columns.values)

['Gender' 'Married' 'Dependents' 'Education' 'Self_Employed'
 'Property_Area' 'Loan_Status']
['ApplicantIncome' 'CoapplicantIncome' 'LoanAmount' 'Loan_Amount_Term'
 'Credit_History']


In [263]:
categorical_columns=categorical.columns.values
numeric_columns=numeric.columns.values

In [264]:
match_data.Loan_Status = match_data.Loan_Status.astype('category')

In [265]:
print('Separating numeric data from the data set to perform numeric imputation on missing values.')

Separating numeric data from the data set to perform numeric imputation on missing values.


In [266]:
numeric_data = match_data[numeric_columns]
for col in numeric_columns:
    match_data.drop([col],axis=1,inplace=True)

In [267]:
numeric_data_test = test_data[numeric_columns]
for col in numeric_data_test:
    test_data.drop([col],axis=1,inplace=True)

In [268]:
print('Imputed categorical data with the mode of the respective column.')

Imputed categorical data with the mode of the respective column.


In [269]:
for i in match_data.columns:
    print(i+"  : "+match_data[i].value_counts().index[0])
for i in numeric_data.columns:
    print(i + "  : " + str(numeric_data[i].median()))

Gender  : Male
Married  : Yes
Dependents  : 0
Education  : Graduate
Self_Employed  : No
Property_Area  : Semiurban
Loan_Status  : Y
ApplicantIncome  : 3812.5
CoapplicantIncome  : 1188.5
LoanAmount  : 128.0
Loan_Amount_Term  : 360.0
Credit_History  : 1.0


In [270]:
Missing values per column:
Gender               11
Dependents           10
Self_Employed        23
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29

SyntaxError: invalid syntax (<ipython-input-270-fb08a7365e00>, line 1)

In [271]:
test_data['Gender']=test_data['Gender'].fillna('Male')
test_data['Dependents']=test_data['Dependents'].fillna('0')
test_data['Self_Employed']=test_data['Self_Employed'].fillna('No')
numeric_data_test['LoanAmount']=numeric_data_test['LoanAmount'].fillna(128)
numeric_data_test['Loan_Amount_Term']=numeric_data_test['Loan_Amount_Term'].fillna(360)
numeric_data_test['Credit_History']=numeric_data_test['Credit_History'].fillna(1)

In [272]:
match_data = match_data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [273]:
print('Imputed numeric columns to its median value.')

Imputed numeric columns to its median value.


In [274]:
numeric_data = numeric_data.fillna(numeric_data.median())

In [275]:
#Joining numeric data to main data set
match_data = match_data.join(numeric_data)

In [276]:
test_data=test_data.join(numeric_data_test)

In [277]:
print('Setting aside 25% of the data as hold_out.')
#Now, let's set 20% of the data as holdout data [test data]

Setting aside 25% of the data as hold_out.


In [278]:
match_data.shape

(614, 12)

In [279]:
match_data.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History'], dtype='object')

As we are concerned whether a loan is defaulted or not we are removing all those rows which indicate they are either of current users or havent defaulted yet

In [281]:
#match_data=match_data[match_data['loan_status'].isin( ['Charged Off',  'Default', 'Fully Paid']) ] 
match_data['Loan_Status'] = match_data['Loan_Status'].map({'Y': 1, 'N': 0})

In [282]:
from collections import Counter
Counter(match_data['Loan_Status'])

Counter({0: 192, 1: 422})

In [283]:
match_data.shape

(614, 12)

In [284]:
test_data.shape

(367, 11)

In [285]:
from sklearn.model_selection import train_test_split
train_data, hold_out = train_test_split(match_data, train_size = 0.80,random_state=2135)

In [287]:
print('Using standard scaler of sklearn library to Standardize and scale the numeric inputs.')
#Scale the data

Using standard scaler of sklearn library to Standardize and scale the numeric inputs.


In [288]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])
hold_out[numeric_columns] = scaler.transform(hold_out[numeric_columns])
test_data[numeric_columns]=scaler.transform(test_data[numeric_columns])
match_data[numeric_columns]=scaler.transform(match_data[numeric_columns])

In [313]:
match_data[numeric_columns]=scaler.transform(match_data[numeric_columns])

In [289]:
test_data.shape

(367, 11)

In [290]:
print('Finding Optimal parameters for 4 different models using GridSearch.')

Finding Optimal parameters for 4 different models using GridSearch.


In [293]:
target_col = 'Loan_Status'
varToUse = train_data.columns.tolist()
varToUse.remove('Loan_Status')
Train_X,Train_Y = generateXY(train_data,target_col,varToUse)
Test_X,Test_Y = generateXY(hold_out,target_col,varToUse)
Test_main_X = generateX(test_data,varToUse)

In [314]:
Train_all_X,Train_all_Y= generateXY(match_data,target_col,varToUse)

In [294]:
len(Test_main_X)

367

In [228]:
len(Test_X[0])

20

In [127]:
len(Train_X[0])

20

In [129]:
Train_Y[0]

1

In [117]:
print('Optimal Parameters for: ')

Optimal Parameters for: 


In [118]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [119]:
#Regularized Logistic Regression
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=2135)
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', sgd) ])
optimized_sgd = GridSearchCV(estimator=pipeline
                            , cv=10
                            , param_grid=dict(model__alpha = [0.0001,0.0002,0.0003,0.0004])
                            , scoring = 'roc_auc'
                            , verbose = 0
                           )
sgdgc = optimized_sgd.fit(Train_X,Train_Y)
print('1. Regularized Logistic Regression: ' + str (sgdgc.best_params_))

1. Regularized Logistic Regression: {'model__alpha': 0.0002}


In [130]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=2135)
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', dt) ])
optimized_dt = GridSearchCV(estimator=pipeline
                            , cv=10
                            , param_grid=dict(model__max_depth =  [5,6,7], model__max_features=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
                            , scoring = 'roc_auc'
                            , verbose = 0
                           )
dtgc = optimized_dt.fit(Train_X,Train_Y)
print('2. Decision Tree: ' + str (dtgc.best_params_))

2. Decision Tree: {'model__max_depth': 6, 'model__max_features': 12}


In [121]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=2135)
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', rf) ])
optimized_rf = GridSearchCV(estimator=pipeline
                            , cv=10
                            , param_grid=dict(model__max_depth =  [5,6], model__max_features=[5,10],model__n_estimators=[40,50])
                            , scoring = 'roc_auc'
                            , verbose = 0
                           )
rfgc = optimized_rf.fit(Train_X,Train_Y)
print('3. Random Forest: ' + str (rfgc.best_params_))

3. Random Forest: {'model__max_depth': 6, 'model__max_features': 10, 'model__n_estimators': 40}


In [131]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=2135)
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler())
                           , ('model', gbc) ])
optimized_gbc = GridSearchCV(estimator=pipeline
                            , cv=10
                            , param_grid=dict(model__max_depth =  [2,3,4,5], model__max_features=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],model__n_estimators=[60,70])
                            , scoring = 'roc_auc'
                            , verbose = 0
                           )
gbcgc = optimized_gbc.fit(Train_X,Train_Y)
print('4. Gradient Boosting Classifier: ' + str (gbcgc.best_params_))

4. Gradient Boosting Classifier: {'model__max_depth': 2, 'model__max_features': 4, 'model__n_estimators': 60}


In [132]:
print('\nPerforming Cross validation for 4 different models using optimal parameters obtained from GridSearch.')
print('Cross Validation Scores for: ')


Performing Cross validation for 4 different models using optimal parameters obtained from GridSearch.
Cross Validation Scores for: 


In [133]:
best_alpha = float(sgdgc.best_params_['model__alpha'])
sgd_cvscore = cross_val_score(SGDClassifier(random_state=2135,alpha=best_alpha),Train_X,Train_Y,cv=10,scoring='roc_auc')
print ('1. Regularized Logistic Regression: %f' %np.mean(sgd_cvscore))

1. Regularized Logistic Regression: 0.690855


In [134]:
best_depth = int(dtgc.best_params_['model__max_depth'])
best_features = int(dtgc.best_params_['model__max_features'])
dtgc_cvscore = cross_val_score(DecisionTreeClassifier(random_state=2135,max_depth=best_depth,max_features=best_features),Train_X,Train_Y,cv=10,scoring='roc_auc')
print ('2. Decision Tree: %f' %np.mean(dtgc_cvscore))

2. Decision Tree: 0.753879


In [137]:
best_depthrf = int(rfgc.best_params_['model__max_depth'])
best_featuresrf = int(rfgc.best_params_['model__max_features'])
best_estimatorsrf = int(rfgc.best_params_['model__n_estimators'])
rfgc_cvscore = cross_val_score(RandomForestClassifier(random_state=2135,n_estimators=best_estimatorsrf,max_depth=best_depthrf,max_features=best_featuresrf),Train_X,Train_Y,cv=10,scoring='roc_auc')
print ('3. Random Forest: %f' %np.mean(rfgc_cvscore))

3. Random Forest: 0.777588


In [138]:
best_depthgbc = int(gbcgc.best_params_['model__max_depth'])
best_featuresgbc = int(gbcgc.best_params_['model__max_features'])
best_estimatorsgbc = int(gbcgc.best_params_['model__n_estimators'])
gbcgc_cvscore = cross_val_score(GradientBoostingClassifier(random_state=2135,n_estimators=best_estimatorsgbc,max_depth=best_depthgbc,max_features=best_featuresgbc),Train_X,Train_Y,cv=10,scoring='roc_auc')
print ('4. Gradient Boosting Classifier: %f' %np.mean(gbcgc_cvscore))

4. Gradient Boosting Classifier: 0.770481


In [315]:
print ('\nBased on the Cross validation ouptuts, the best model that can be selected is: \'Gradient Boosting Classifier\'')
print('\nFitting Gradient Boosting classifier for the dataset.')
print('Testing accuracy of this model on Test data(or hold_out data).')
gbc = RandomForestClassifier(random_state=2135,n_estimators=best_estimatorsrf,max_depth=best_depthrf,max_features=best_featuresrf)
gbc = gbc.fit(Train_all_X,Train_all_Y)


Based on the Cross validation ouptuts, the best model that can be selected is: 'Gradient Boosting Classifier'

Fitting Gradient Boosting classifier for the dataset.
Testing accuracy of this model on Test data(or hold_out data).


In [326]:
print ('\nBased on the Cross validation ouptuts, the best model that can be selected is: \'Gradient Boosting Classifier\'')
print('\nFitting Gradient Boosting classifier for the dataset.')
print('Testing accuracy of this model on Test data(or hold_out data).')
gbc = GradientBoostingClassifier(random_state=2135,n_estimators=best_estimatorsgbc,max_depth=best_depthgbc,max_features=best_featuresgbc)
gbc = gbc.fit(Train_all_X,Train_all_Y)


Based on the Cross validation ouptuts, the best model that can be selected is: 'Gradient Boosting Classifier'

Fitting Gradient Boosting classifier for the dataset.
Testing accuracy of this model on Test data(or hold_out data).


In [195]:
Train_X[0]

[0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.3200463855404147,
 -0.5791575961045491,
 -0.2539460440978989,
 -4.370269340996699,
 0.41115400891590365]

In [327]:
out1=list(gbc.predict(Test_main_X))

In [295]:
out=list(gbc.predict(Test_main_X))

In [317]:
m=0
for i in range(len(out)):
    if out[i]==out1[i]:
        m+=1
    

In [319]:
len(out)

367

In [318]:
m

360

In [143]:
print('\nHoldout ROC/AUC accuracy for Gradient Boosting classifier: '+ str(roc_score_model(gbc,Test_X,Test_Y)))


Holdout ROC/AUC accuracy for Gradient Boosting classifier: 0.736568986569


In [296]:
print('Starting data read from csv file.')
output = pd.read_csv("test.csv", low_memory = False)

Starting data read from csv file.


In [305]:
print('Starting data read from csv file.')
sample = pd.read_csv("Sample_Submission.csv", low_memory = False)

Starting data read from csv file.


In [307]:
sample['Loan_ID']=output['Loan_ID']

In [328]:
output['result']=out1

In [332]:
sample.to_csv("Sample_Submission.csv")

In [331]:
sample

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [330]:
#match_data=match_data[match_data['loan_status'].isin( ['Charged Off',  'Default', 'Fully Paid']) ] 
sample['Loan_Status'] = sample['Loan_Status'].map({1: 'Y', 0: 'N'})

In [329]:
sample['Loan_Status']=output['result']

In [304]:
for i in output.columns:
    if i not in 

Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
result


In [299]:
len(Test_main_X)

367

In [300]:
print('Starting data read from csv file.')
test_data = pd.read_csv("test.csv", low_memory = False)

Starting data read from csv file.


In [62]:
# Check missing values
def num_missing(x):
    return sum(x.isnull())

#Apply per column

print ("Missing values per column:")
print (test_data.apply(num_missing, axis=0))

Missing values per column:
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


In [None]:
print('Starting data read from csv file.')
match_data = pd.read_csv("train.csv", low_memory = False)