In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

In [2]:
df = pd.read_csv('data.csv')

In [3]:
# Seeing data:

df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(614, 13)

In [5]:
# Dropping non essential Catagory Loan_ID

df = df.drop(columns=['Loan_ID'])
df.corr()

# This is when I realized I need to Change my catagorical Features to numerical ones.

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
ApplicantIncome,1.0,-0.116677,0.570893,-0.045213,-0.014548
CoapplicantIncome,-0.116677,1.0,0.1887,-0.059917,-0.002153
LoanAmount,0.570893,0.1887,1.0,0.039447,-0.008433
Loan_Amount_Term,-0.045213,-0.059917,0.039447,1.0,0.00147
Credit_History,-0.014548,-0.002153,-0.008433,0.00147,1.0


In [6]:
# shape before dropping instances:
start = len(df.index)
df.shape

(614, 12)

In [7]:
# dropping instances with ANY missing values. NEW SHAPE

df = df.dropna()
df.shape

(476, 12)

In [8]:
# Weighing whether I should drop the values or not:

end = len(df.index)
percentDropped = (float)(100*(1 - (end/start)))
print("Dropped ", percentDropped, "% of Data Frame.")

Dropped  22.475570032573287 % of Data Frame.


# Initial Feature Engineering:

In [9]:
# Getting numerical Features in place of all the categoricals

df = pd.get_dummies(df)
df.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,...,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,property_Area_Rural,property_Area_Semiurban,property_Area_Urban,Loan_Status_N,Loan_Status_Y
2,3000.0,0.0,66.0,360.0,1.0,0,1,0,1,1,...,0,1,0,0,1,0,0,1,0,1
3,2583.0,2358.0,120.0,360.0,1.0,0,1,0,1,1,...,0,0,1,1,0,0,0,1,0,1
4,6000.0,0.0,141.0,360.0,1.0,0,1,1,0,1,...,0,1,0,1,0,0,0,1,0,1


In [10]:
# NEW SHAPE

df.shape

(476, 22)

In [11]:
# Dimensionality Reduction, Both applic. and coApplic. have SAME correlation values AKA equal scalar magnitude.

df['Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']

This is where I made the realization that each dummy catagory has both positive AND NEGATIVE correlations and I need to scale them by both. So instead of t=1 f=0 I realize the change I needed to make was t=1 f=-1

In [12]:
# Printing Correlation Values for all Features.

corr = df[df.columns[0:]].corr()['Loan_Status_Y'][:]
corr

ApplicantIncome           -0.043324
CoapplicantIncome         -0.049885
LoanAmount                -0.075056
Loan_Amount_Term          -0.008123
Credit_History             0.531317
Gender_Female             -0.064307
Gender_Male                0.064307
Married_No                -0.115302
Married_Yes                0.115302
Dependents_0              -0.024744
Dependents_1              -0.031813
Dependents_2               0.067500
Dependents_3+             -0.005482
Education_Graduate         0.068230
Education_Not Graduate    -0.068230
Self_Employed_No           0.038754
Self_Employed_Yes         -0.038754
property_Area_Rural       -0.104051
property_Area_Semiurban    0.157574
property_Area_Urban       -0.064988
Loan_Status_N             -1.000000
Loan_Status_Y              1.000000
Income                    -0.062999
Name: Loan_Status_Y, dtype: float64

In [13]:
# experimenting with this, wanted to see what type this is: dataframe? array? 

corr.dtype

dtype('float64')

In [14]:
#setting a mincorr coefficient scalar for feature values correlated to 'Status'
positiveCorrelationsOnly = corr[corr > 0]
    
mincorr = positiveCorrelationsOnly.min()
print("minimum correlation: ", mincorr)

minimum correlation:  0.038754033542204684


# Most of the Feature Engineering:

In [15]:
# Rename:

df['Married'] = df['Married_Yes']
df['Rural'] = df['property_Area_Rural']
df['Semi Urb'] = df['property_Area_Semiurban']
df['Urb'] = df['property_Area_Urban']
df['do'] = df['Dependents_0']
df['d1'] = df['Dependents_1']
df['d2'] = df['Dependents_2']
df['d3'] = df['Dependents_3+']
df['Status'] = df['Loan_Status_Y']
df['Term Length'] = df['Loan_Amount_Term']
df['Credit'] = df['Credit_History']

In [16]:
# Re-Parameterize:

df['Married'] = [1 if x == 1 else -1 for x in df['Married']]
df['Credit'] = [1 if x == 1 else -1 for x in df['Credit']]
df['Education_Not Graduate'] = [-1 if x == 1 else 0 for x in df['Education_Not Graduate']]
df['Income'] = [1 if x > df['Income'].median() else -1 for x in df['Income']]
df['Self_Employed_No'] = [-1 if x == 1 else 0 for x in df['Self_Employed_No']]
df['Term Length'] = [1 if x == 360 else .5 for x in df['Term Length']]
df['Gender'] = [1 if x==1 else -1 for x in df['Gender_Male']]
df['Status'] = [1 if x == 1 else -1 for x in df['Status']]

In [17]:
# Reducing Dimension:
df['Grad'] = df['Education_Graduate'] + df['Education_Not Graduate']
df['Business Owner'] = df['Self_Employed_Yes'] + df['Self_Employed_No']

In [18]:
# Define Individual Feature Correlations:

s = "Status"
marriedCorr = df['Married_Yes'].corr(df[s])
creditCorr = df['Credit'].corr(df[s])
educationCorr = df['Education_Graduate'].corr(df[s])
ruralCorr = df['Rural'].corr(df[s])
semiUrbCorr = df['Semi Urb'].corr(df[s])
urbanCorr = df['Urb'].corr(df[s])
genderCorr = df['Gender'].corr(df[s])
docorr = df['do'].corr(df[s])
d1corr = df['d1'].corr(df[s])
d2corr = df['d2'].corr(df[s])
d3corr = df['d3'].corr(df[s])
incomeCorr = df['Income'].corr(df[s])
businessOwnerCorr = df['Business Owner'].corr(df[s])
termLengthCorr = df['Term Length'].corr(df[s])

In [19]:
# Delete OLD features whose names' are TOO LONG:

del df['Married_Yes']
del df['Married_No']
del df['Education_Not Graduate']
del df['Education_Graduate']
del df['property_Area_Rural']
del df['property_Area_Semiurban']
del df['property_Area_Urban']
del df['Dependents_0']
del df['Dependents_1']
del df['Dependents_2']
del df['Dependents_3+']
del df['Self_Employed_Yes']
del df['Self_Employed_No']
del df['LoanAmount']
del df['Loan_Status_N']
del df['Loan_Status_Y']
del df['Gender_Female']
del df['Gender_Male']
del df['Loan_Amount_Term']
del df['Credit_History']
del df['ApplicantIncome']
del df['CoapplicantIncome']

In [20]:
# Compute New Values of features scaled by mincorrelation:

df['Married'] = df['Married'] * (marriedCorr / mincorr)
df['Credit'] = df['Credit'] * (creditCorr / mincorr)
df['Grad'] = df['Grad'] * (educationCorr / mincorr)
df['Rural'] = df['Rural'] * (ruralCorr / mincorr)
df['Semi Urb'] = df['Semi Urb'] * (semiUrbCorr / mincorr)
df['Urb'] = df['Urb'] * (urbanCorr / mincorr)
df['Gender'] = df['Gender'] * (genderCorr / mincorr)
df['do'] = df['do'] * (docorr / mincorr)
df['d1'] = df['d1'] * (d1corr / mincorr)
df['d2'] = df['d2'] * (d2corr / mincorr)
df['d3'] = df['d3'] * (d3corr / mincorr)
df['Income'] = df['Income'] * (incomeCorr / mincorr)
df['Business Owner'] = df['Business Owner'] * (businessOwnerCorr / mincorr)
df['Term Length'] = df['Term Length'] * (termLengthCorr / mincorr)

In [21]:
# More Dimension Reduction: Combining all Dependents into df['W2']

df['W2'] = df['do'] + df['d1'] + df['d2'] +df['d3']

# Dimensional Reduction:

del df['do']
del df['d1']
del df['d2']
del df['d3']

In [22]:
df.head(5)

Unnamed: 0,Income,Married,Rural,Semi Urb,Urb,Status,Term Length,Credit,Gender,Grad,Business Owner,W2
2,-0.352004,2.975238,-0.0,0.0,-1.676927,1,2.56333,13.709972,1.659376,1.760601,-1.0,-0.638478
3,-0.352004,2.975238,-0.0,0.0,-1.676927,1,2.56333,13.709972,1.659376,-1.760601,1.0,-0.638478
4,0.352004,-2.975238,-0.0,0.0,-1.676927,1,2.56333,13.709972,1.659376,1.760601,1.0,-0.638478
6,-0.352004,2.975238,-0.0,0.0,-1.676927,1,2.56333,13.709972,1.659376,-1.760601,1.0,-0.638478
7,0.352004,2.975238,-0.0,4.066,-0.0,-1,2.56333,-13.709972,1.659376,1.760601,1.0,-0.141454


In [23]:
# Making Sure Target Feature is at End of dataframe:

df['Target'] = df['Status']
del df['Status']
df.head(12)

Unnamed: 0,Income,Married,Rural,Semi Urb,Urb,Term Length,Credit,Gender,Grad,Business Owner,W2,Target
2,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,-1.0,-0.638478,1
3,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,-1.760601,1.0,-0.638478,1
4,0.352004,-2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,1.0,-0.638478,1
6,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,-1.760601,1.0,-0.638478,1
7,0.352004,2.975238,-0.0,4.066,-0.0,2.56333,-13.709972,1.659376,1.760601,1.0,-0.141454,-1
9,0.352004,2.975238,-0.0,4.066,-0.0,2.56333,13.709972,1.659376,1.760601,1.0,-0.820898,-1
10,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,1.0,1.741759,1
12,0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,1.0,1.741759,1
13,-0.352004,-2.975238,-2.68492,0.0,-0.0,2.56333,13.709972,1.659376,1.760601,1.0,-0.638478,-1
14,-0.352004,2.975238,-0.0,0.0,-1.676927,1.281665,13.709972,1.659376,1.760601,1.0,1.741759,1


In [24]:
print("df.shape = ", df.shape)

df.shape =  (476, 12)


# Splitting Data

In [25]:
# Because I removed rows with missing values, I used 85% for training and 15%for testing.

normalTrainingPercent80 = .8
additionalPercentTraining = percentDropped * .4
trnsz = (int)(len(df) * (normalTrainingPercent80 + additionalPercentTraining))
tstsz = len(df) - trnsz
print("Training Size: ", trnsz, "\nTesting Size: ", tstsz, "\nlen(df) = ", len(df))

Training Size:  4660 
Testing Size:  -4184 
len(df) =  476


In [26]:
# Splitting data:

X = df.iloc[:,:11]
Y = df.iloc[:,11]

In [27]:
X.head(3)

Unnamed: 0,Income,Married,Rural,Semi Urb,Urb,Term Length,Credit,Gender,Grad,Business Owner,W2
2,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,-1.0,-0.638478
3,-0.352004,2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,-1.760601,1.0,-0.638478
4,0.352004,-2.975238,-0.0,0.0,-1.676927,2.56333,13.709972,1.659376,1.760601,1.0,-0.638478


In [28]:
Y.head(3)

2    1
3    1
4    1
Name: Target, dtype: int64

In [29]:
# Splitting training data:

xtrn = X.iloc[:trnsz]
ytrn = Y.iloc[:trnsz]

In [30]:
# Splitting testing data: 

xtst = X.iloc[trnsz:len(X.columns)]
ytst = Y.iloc[trnsz:]

# Training Classifier

In [31]:
# Initializing a Classifier model from sklearn.linear_model

from sklearn.linear_model import SGDClassifier
cl = SGDClassifier(random_state=42)


In [32]:
# Fitting the data to my model (training):

cl.fit(xtrn, ytrn)

SGDClassifier(random_state=42)

In [33]:
# Printing Mean accuracy with k-fold cross validation:

from sklearn.model_selection import cross_val_score
print("'4'-Fold Cross Validation:\n\nMean Accuracy: ", cross_val_score(cl, xtrn, ytrn, cv=4, scoring='accuracy').mean())

'4'-Fold Cross Validation:

Mean Accuracy:  0.7184873949579831


In [34]:
# Creating a prediction using my model and the training data both x & y:

from sklearn.model_selection import cross_val_predict
ypred = cross_val_predict(cl, xtrn, ytrn, cv=4)

In [35]:
# Confusion Matrix for TRAINING:
from sklearn.metrics import confusion_matrix

confusion_matrix(ytrn, ypred)

array([[ 73,  74],
       [ 60, 269]], dtype=int64)

# Testing Model

In [36]:
# final is the same as ypred, except now I am using my model on TEST DATA:

final = cl.predict(xtst)
confusion_matrix(ytst, final)

ValueError: Found array with 0 sample(s) (shape=(0, 11)) while a minimum of 1 is required.

In [None]:
# Printing Results:

from sklearn.metrics import recall_score, precision_score
prec = precision_score(ytst, final)
recall = recall_score(ytst, final)
acc = cl.score(xtst, ytst)

print("\tTesting:\n\n", "Precision = ", prec, "\nRecall = ", recall, "\nAccuracy = ", acc)