# Read data
Data is available at https://www.analyticsvidhya.com/wp-content/uploads/2016/02/Dataset.rar (a RAR archive). The loading cell below expects the train and test CSV files under `./data/`.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from sklearn.ensemble import GradientBoostingClassifier   

# Read the train split (file is read with ISO-8859-1 encoding) and tag its rows.
df1 = pd.read_csv('./data/DataHackathon3x_train.csv', encoding="ISO-8859-1")
df1['src'] = 'train'

# Same for the test split.
df2 = pd.read_csv('./data/DataHackathon3x_test.csv', encoding="ISO-8859-1")
df2['src'] = 'test'

# Stack both splits into a single frame with a fresh 0..n-1 index; the 'src'
# column records which file each row came from.
df = pd.concat([df1, df2], sort=False, ignore_index=True)

# Data preprocessing

In [2]:
# --- Missing-value indicators -------------------------------------------------
# For each of these columns add a 0/1 `<column>_Missing` flag marking rows where
# the value is absent. `isna()` is the vectorised equivalent of testing every
# element with np.isnan.
for col in ['EMI_Loan_Submitted', 'Loan_Amount_Submitted', 'Loan_Tenure_Submitted',
            'Interest_Rate', 'Processing_Fee']:
    df[col + '_Missing'] = df[col].isna().astype('int64')

# --- Source -------------------------------------------------------------------
# Keep the codes S122 and S133 as-is; every other value (including missing) is
# pooled into the single category 'others'.
df['Source'] = df['Source'].where(df['Source'].isin(['S122', 'S133']), 'others')

# --- Age ----------------------------------------------------------------------
# Age in years (days / 365.0) relative to the day the notebook is run. The
# reference date is taken once so that every row uses the same value.
# NOTE(review): because it depends on the run date, this feature is not
# reproducible across days — consider pinning a fixed reference date.
reference_date = pd.Timestamp(date.today())
dob_parsed = pd.to_datetime(df['DOB'], format='%d-%b-%y')

# '%y' maps two-digit years 00-68 to 20xx, so e.g. '15-Nov-64' is parsed as 2064
# and would yield a negative age. A date of birth cannot lie in the future, so
# such values are moved back one century.
dob_in_future = dob_parsed > reference_date
dob_parsed = dob_parsed.mask(dob_in_future, dob_parsed - pd.DateOffset(years=100))

# A missing DOB gives a NaN age here.
df['Age'] = (reference_date - dob_parsed).dt.days / 365.0

# --- Imputation ---------------------------------------------------------------
# Missing existing EMI is filled with 0.
df['Existing_EMI'] = df['Existing_EMI'].fillna(0)

# Missing applied amount / tenure are filled with the column median.
# NOTE(review): the median is computed over the combined train+test frame.
for col in ['Loan_Amount_Applied', 'Loan_Tenure_Applied']:
    df[col] = df[col].fillna(df[col].median())

In [3]:
# Columns removed from the combined frame. DOB has been replaced by Age, and the
# five loan-offer columns (EMI_Loan_Submitted, Interest_Rate,
# Loan_Amount_Submitted, Loan_Tenure_Submitted, Processing_Fee) are represented
# by the *_Missing indicators created in the previous cell; the remaining
# columns are discarded outright.
columns_to_drop = [
    'City', 'DOB', 'EMI_Loan_Submitted', 'Employer_Name', 'Interest_Rate',
    'Lead_Creation_Date', 'Loan_Amount_Submitted', 'Loan_Tenure_Submitted',
    'LoggedIn', 'Salary_Account', 'Processing_Fee',
]
df = df.drop(columns=columns_to_drop)

## Label encoding + one-hot encoding

In [4]:
from sklearn.preprocessing import LabelEncoder
# Categorical columns to encode.
encode_cols = [
    'Device_Type', 'Filled_Form', 'Gender',
    'Var1', 'Var2', 'Mobile_Verified', 'Source',
]

# Step 1: replace each column's categories with integer codes. One encoder
# object is re-fitted per column, so afterwards `le` holds the fit for the last
# column in the list.
le = LabelEncoder()
integer_codes = {col: le.fit_transform(df[col]) for col in encode_cols}

# Step 2: expand every integer-coded column into 0/1 indicator columns. The
# integer codes become the suffixes of the generated column names
# (e.g. Source_0, Source_1, ...).
df = pd.get_dummies(df.assign(**integer_codes), columns=encode_cols)

In [5]:
# Split the combined frame back into its train and test parts using the 'src'
# tag, removing the tag in the same step. `drop` returns a new frame rather than
# deleting a column from a filtered slice of `df`, so later column assignments
# on `train` / `test` should not trigger pandas' SettingWithCopyWarning.
train = df.loc[df['src'] == 'train'].drop(columns='src')
test = df.loc[df['src'] == 'test'].drop(columns='src')

# Sanity check: every row of the combined frame lands in exactly one split.
assert len(train) + len(test) == len(df), "rows with an unexpected 'src' value"

In [6]:
# Preview the first 10 rows of the training split.
train.head(10)

Unnamed: 0,ID,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Var5,Var4,Disbursed,EMI_Loan_Submitted_Missing,Loan_Amount_Submitted_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,ID000002C20,20000,300000.0,5.0,0.0,0,1,0.0,1,1,...,0,0,0,0,1,1,0,1,0,0
1,ID000004E40,35000,200000.0,2.0,0.0,13,3,0.0,0,0,...,0,0,0,0,1,0,1,1,0,0
2,ID000007H20,22500,600000.0,4.0,0.0,0,1,0.0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,ID000008I30,35000,1000000.0,5.0,0.0,10,3,0.0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,ID000009J40,100000,500000.0,2.0,25000.0,17,3,0.0,1,0,...,0,0,0,0,0,0,1,0,0,1
5,ID000010K00,45000,300000.0,5.0,15000.0,17,3,0.0,0,0,...,0,0,0,0,0,0,1,0,0,1
6,ID000011L10,70000,6.0,5.0,0.0,0,1,0.0,1,1,...,0,0,0,0,0,1,0,0,1,0
7,ID000012M20,20000,200000.0,5.0,2597.0,3,3,0.0,1,0,...,0,0,0,0,0,0,1,0,0,1
8,ID000013N30,75000,0.0,0.0,0.0,13,5,0.0,0,0,...,1,0,0,0,0,0,1,1,0,0
9,ID000014O40,30000,300000.0,3.0,0.0,0,1,0.0,0,0,...,0,0,0,0,0,0,1,0,1,0
