# Loan Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from scipy.stats import uniform

import os
import tensorflow as tf
import keras


Using TensorFlow backend.


In [2]:
# read files
df_train_ori = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
df_test_ori = pd.read_csv("test_Y3wMUE5_7gLdaTN.csv")
#print(df_train_ori.count())
#print(df_test_ori.count())
df = pd.concat([df_train_ori, df_test_ori],sort=False,keys=['train', 'test'])
#df[0:614]
#df[614:981]


In [3]:
# Replace categorical values
df['Gender'] = df.Gender.replace('Male', 1)
df['Gender'] = df.Gender.replace('Female', 0)
df['Married'] = df.Married.replace('Yes', 1)
df['Married'] = df.Married.replace('No', 0)
df['Education'] = df.Education.replace('Graduate', 1)
df['Education'] = df.Education.replace('Not Graduate', 0)
df['Self_Employed'] = df.Self_Employed.replace('Yes', 1)
df['Self_Employed'] = df.Self_Employed.replace('No', 0)
df['Loan_Status'] = df.Loan_Status.replace('N', 0)
df['Loan_Status'] = df.Loan_Status.replace('Y', 1)
columns = ['Gender','Married', 'Education', 'Self_Employed','Loan_Status']


In [4]:
# categorical missing values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Education'] = df['Education'].fillna(df['Education'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['Property_Area'] = df['Property_Area'].fillna(df['Property_Area'].mode()[0])
# continuous missing values
df['ApplicantIncome'] = df['ApplicantIncome'].fillna(df['ApplicantIncome'].mean())
df['CoapplicantIncome'] = df['CoapplicantIncome'].fillna(df['CoapplicantIncome'].mean())
#df.loc[ (pd.isnull(df['Credit_History'])) & (df['Loan_Status'] == 1), 'Credit_History'] = 1
#df.loc[ (pd.isnull(df['Credit_History'])) & (df['Loan_Status'] == 0), 'Credit_History'] = 0
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())

In [5]:
df.count()
df.loc[df['Gender'].isna()]

Unnamed: 0,Unnamed: 1,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [6]:
# dummy variables
#cat_vars=['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']
cat_vars=['Dependents','Property_Area']
for var in cat_vars:
    cat_list = pd.get_dummies(df[var], prefix=var)
    df1=df.join(cat_list)
    df=df1

In [7]:
# remove columns
data_vars=df.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]
df_final=df[to_keep]
del df_final['Loan_ID']
df_final.columns.values

array(['Gender', 'Married', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'], dtype=object)

In [8]:
# remove outliers
numeric_columns = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
for col in columns:
    elements = df_final[col]
    mean = elements.mean()
    sd = elements.std()
    df_final.loc[((df_final[col] > mean + 3*sd) | (df_final[col] < mean - 3*sd)),col] = mean


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
# defines a predefined scaler 
def scale_numeric(data, numeric_columns, scaler):
    for col in numeric_columns:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1,1))
    return data

# we can now define the scaler we want to use and apply it to our dataset
# a good exercise would be to research what StandardScaler does - it is from the scikit learn library
scaler = StandardScaler()
df_final = scale_numeric(df_final, numeric_columns,scaler)
# here we can see the result
df_final[0:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

Unnamed: 0,Unnamed: 1,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
train,0,1.0,0.0,1.0,0.0,0.117565,-0.589506,0.0,0.276368,1.0,1.0,1,0,0,0,0,0,1
train,1,1.0,1.0,1.0,0.0,-0.104844,-0.034561,-0.190168,0.276368,1.0,0.0,0,1,0,0,1,0,0
train,2,1.0,1.0,1.0,1.0,-0.382944,-0.589506,-1.002656,0.276368,1.0,1.0,1,0,0,0,0,0,1
train,3,1.0,1.0,0.0,0.0,-0.456202,0.278239,-0.295006,0.276368,1.0,1.0,1,0,0,0,0,0,1
train,4,1.0,0.0,1.0,0.0,0.144093,-0.589506,-0.019808,0.276368,1.0,1.0,1,0,0,0,0,0,1
train,5,1.0,1.0,1.0,1.0,0.041672,0.954624,1.631377,0.276368,1.0,1.0,0,0,1,0,0,0,1
train,6,1.0,1.0,0.0,0.0,-0.500122,-0.031617,-0.622622,0.276368,1.0,1.0,1,0,0,0,0,0,1
train,7,1.0,1.0,1.0,0.0,-0.37662,0.331967,0.202971,0.276368,0.0,0.0,0,0,0,1,0,1,0
train,8,1.0,1.0,1.0,0.0,-0.206211,-0.027937,0.334017,0.276368,1.0,1.0,0,0,1,0,0,0,1
train,9,1.0,1.0,1.0,0.0,1.345913,3.446725,2.705957,0.276368,1.0,0.0,0,1,0,0,0,1,0


In [10]:
# Credit_History missing values - Random Forest
#df.loc[ (pd.isnull(df['Credit_History'])) & (df['Loan_Status'] == 1), 'Credit_History'] = 1
#df.loc[ (pd.isnull(df['Credit_History'])) & (df['Loan_Status'] == 0), 'Credit_History'] = 0
X_train_ch = df_final.loc[ (pd.notnull(df_final['Credit_History']) & pd.notnull(df_final['Loan_Status'])) , df_final.columns != 'Credit_History']
y_train_ch = df_final.loc[ (pd.notnull(df_final['Credit_History']) & pd.notnull(df_final['Loan_Status'])) , 'Credit_History']
X_test_ch = df_final.loc[ (pd.isnull(df_final['Credit_History']) & pd.notnull(df_final['Loan_Status'])) , df_final.columns != 'Credit_History']
rnd_clf = RandomForestClassifier()
#X_train_ch.isnull().any()
rnd_clf.fit(X_train_ch, y_train_ch)
ch_pred = rnd_clf.predict(X_test_ch)
#print(X_test_ch['Loan_Status'])
#ch_pred
#X_test_ch 
df_final.loc[ (pd.isnull(df_final['Credit_History']) & pd.notnull(df_final['Loan_Status'])) , 'Credit_History'] = ch_pred
#
X_train_ch = df_final.loc[ (pd.notnull(df_final['Credit_History'])) , ~df_final.columns.isin(['Credit_History','Loan_Status'])]
y_train_ch = df_final.loc[ (pd.notnull(df_final['Credit_History'])) , 'Credit_History']
X_test_ch = df_final.loc[ (pd.isnull(df_final['Credit_History'])) , ~df_final.columns.isin(['Credit_History','Loan_Status'])]
rnd_clf = RandomForestClassifier()
#X_train_ch.isnull().any()
rnd_clf.fit(X_train_ch, y_train_ch)
ch_pred = rnd_clf.predict(X_test_ch)
#print(X_test_ch['Loan_Status'])
#ch_pred
#X_test_ch 
df_final.loc[ (pd.isnull(df_final['Credit_History'])) , 'Credit_History'] = ch_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
# get X and y from dataset
df_train = df_final.loc['train']
X = df_train.loc[:, df_train.columns != 'Loan_Status']
#y = df_final.loc[:, df_final.columns == 'Loan_Status']
y = df_train.Loan_Status
y = y.astype(int)




In [12]:
#model building

# define vars
input_num_units = 16
hidden_num_units = 8
output_num_units = 1
#output_num_units = 2

epochs = 5
batch_size = 16

# import keras modules

from keras.models import Sequential
from keras.layers import Dense

# create model
model = Sequential([
  Dense(units=hidden_num_units, input_dim=input_num_units, activation='relu'),
  Dense(units=output_num_units, input_dim=hidden_num_units, activation='sigmoid'),
#  Dense(units=output_num_units, input_dim=hidden_num_units, activation='softmax'),
])

# compile the model with necessary attributes
#model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

W0726 18:27:45.480887 140128722429760 deprecation_wrapper.py:119] From /home/lemos/miniconda3/envs/av/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 18:27:45.828700 140128722429760 deprecation_wrapper.py:119] From /home/lemos/miniconda3/envs/av/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 18:27:45.850337 140128722429760 deprecation_wrapper.py:119] From /home/lemos/miniconda3/envs/av/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0726 18:27:45.880348 140128722429760 deprecation_wrapper.py:119] From /home/lemos/miniconda3/envs/av/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.tr

In [13]:
trained_model = model.fit(X, y, nb_epoch=epochs, batch_size=batch_size)


  """Entry point for launching an IPython kernel.
W0726 18:27:56.691157 140128722429760 deprecation_wrapper.py:119] From /home/lemos/miniconda3/envs/av/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# result
df_test = df_final.loc['test']
X_test_final = df_test.loc[:, df_test.columns != 'Loan_Status']
predictions = model.predict(X_test_final)
df_result = pd.DataFrame(data=predictions,columns=['Loan_Status']) 
df_result['Loan_Status'] = (df_result.Loan_Status > 0.5).astype(int)
df_result['Loan_Status'] = df_result['Loan_Status'].apply(str)
df_result['Loan_Status'] = df_result.Loan_Status.replace('0', 'N')
df_result['Loan_Status'] = df_result.Loan_Status.replace('1', 'Y')
df_result['Loan_ID'] = df_test_ori['Loan_ID']
df_result.to_csv('loanPredictionNeural.csv',index=False)
