In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
import seaborn as sns
%matplotlib inline

In [14]:
df = pd.read_csv("AV_LP_train.csv")

In [18]:
def df_feature_eng(df):
    #    Handle NaNs
    
    df["Gender"].fillna(df["Gender"].mode(), inplace = True)
    df["Married"].fillna(df["Married"].mode(), inplace = True)
    df["Dependents"].fillna('0', inplace = True)
    df["Self_Employed"].fillna(df["Self_Employed"].mode(), inplace = True)
    df["Credit_History"].fillna(1, inplace = True)
    df["LoanAmount"].fillna(df["LoanAmount"].mean(), inplace = True)
    df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mode(), inplace = True)
    
    #Drop all rows with Nan which are not handled in our code
    df.dropna()
        
    #    One Hot Encoding: ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']
    
    # Send Output Colummn to First location
    cols = [df.columns[-1]] + [col for col in df if col != df.columns[-1]]
    df = df[cols]
    
    cat_col = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']

    for i in cat_col:
        df1 = pd.get_dummies(df[i],prefix = i, drop_first=True)
        df = pd.concat([df,df1], axis = 1)

    df.drop(cat_col,axis=1,inplace=True)

    
    # Send output column to last
    cols = [col for col in df if col != df.columns[0]] + [df.columns[0]] 
    df = df[cols]
        
    # Labelling Could be merged with One Hot Encoding
    df["Loan_Status"].replace('Y',1, inplace = True)
    df["Loan_Status"].replace('N',0, inplace = True)
    
    #    Scaling/Normalizing features : Z-score scaling
    cat_col = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']

    for i in cat_col:
        df[i]=(df[i]-df[i].mean())/df[i].std()
        df.head()
    return df;

In [19]:
df = df_feature_eng(df)
df.head()

Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
0,LP001002,0.072931,-0.554036,3.382028e-16,0.276411,1.0,1,0,0,0,0,0,0,0,1,1
1,LP001003,-0.134302,-0.0387,-0.2190947,0.276411,1.0,1,1,1,0,0,0,0,0,0,0
2,LP001005,-0.393427,-0.554036,-0.9568608,0.276411,1.0,1,1,0,0,0,0,1,0,1,1
3,LP001006,-0.461686,0.251774,-0.3142903,0.276411,1.0,1,1,0,0,0,1,0,0,1,1
4,LP001008,0.097649,-0.554036,-0.06440178,0.276411,1.0,1,0,0,0,0,0,0,0,1,1


In [11]:
# 1st column is unique identifier
X = df.iloc[:,1:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]
print (X.shape,  y.shape)

(614, 14) (614,)


In [12]:
#Train Test Split 80/20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (491, 14) (491,)
Test set: (123, 14) (123,)


In [24]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
C = [0.001,0.01,0.1,1,10]
for c in C:
    LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
    yhat = LR.predict(X_test)
    #yhat
    accuracy = accuracy_score(y_test,yhat)
    print (c," --> ",accuracy)

0.001 0.6747967479674797
0.01 0.6747967479674797
0.1 0.6747967479674797
1 0.6747967479674797
10 0.6747967479674797


In [25]:
#Random Forest CLassifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth = 5,n_estimators=200)
rfc.set_params(n_jobs=-1).fit(X_train,y_train)
# Use the forest's predict method on the test data
predictions = rfc.predict(X_test)

In [26]:
# Calculate the absolute errors
errors = abs(yhat - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / yhat)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.33 degrees.
Accuracy: 67.48 %.
