In [1]:
import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt                  # plots
import seaborn as sns                            # more plots

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
trainxy = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_xy.csv')
trainx  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_x.csv')
trainy  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_y.csv')
testx   = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/test_x.csv')
trainxy.drop(['ID'],axis=1,inplace=True)
trainx.drop(['ID'],axis=1,inplace=True)
trainy.drop(['ID'],axis=1,inplace=True)
testx_model=testx.copy()
testx_model.drop(['ID_Test'],axis=1,inplace=True)

#Basic conversion of Categorical variables
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy=pd.get_dummies(trainxy, columns=["Occupation type"], prefix=["Occupation"])

#Imputing Age
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2>=194.9185),'Age']=1
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2<194.9185),'Age']=0

#Imputing Loan type
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47<trainxy.Score2),'Loan type']=1
trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47>trainxy.Score2),'Loan type']=0

#Imputing Occupation type
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(602.81<trainxy.Score4),'Occupation_X']=1
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(598.71<trainxy.Score4)&(trainxy.Score4<602.38),'Occupation_Y']=1
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(trainxy.Score4<598.67),'Occupation_Z']=1

#Imputing Score4
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1)].Score4.median()
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1)].Score4.median()
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1)].Score4.median()

#Imputing Expense and Score5
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score2 and Score4
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score3 and the remaining Expense
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score3.notna()),'Expense']=trainxy.Score3*11.3017+1628.15
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score3.isna()),'Score3']=(trainxy.Expense-1628.15)/11.3017

#Imputing Score1 and the remaining Score4 
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Score1.notna()),'Score4']=trainxy.Score1*17.7343+597.066
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Score1.isna()),'Score1']=(trainxy.Score4-597.066)/17.7343

#Imputing Score2 and Income
trainxy.loc[(trainxy.Income.isna())&(trainxy.Score2.notna()),'Income']=trainxy.Score2*20.9689+11613.7
trainxy.loc[(trainxy.Income.notna())&(trainxy.Score2.isna()),'Score2']=(trainxy.Income-11613.7)/20.9689


#Dropping Score5 since it has perfect correlation w Expense
trainxy.drop(['Score5'],axis=1,inplace=True)

#Dropping the remaining Null values
trainxy.dropna(inplace=True)
trainx=trainxy.copy()
trainx.drop(['Label'],axis=1,inplace=True)

In [3]:
loan_modeld1 = xgb.XGBClassifier(random_state=0,n_estimators=1000,learning_rate=0.05,max_depth=6, tree_method='auto')
crossd1=cross_val_score(estimator=loan_modeld1, X=trainx, y=trainxy.Label, error_score='raise',scoring='accuracy') 
print(crossd1.mean()*100)

98.32575907264705


In [4]:
trainxy = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_xy.csv')
trainx  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_x.csv')
trainy  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_y.csv')
testx   = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/test_x.csv')
trainxy.drop(['ID'],axis=1,inplace=True)
trainx.drop(['ID'],axis=1,inplace=True)
trainy.drop(['ID'],axis=1,inplace=True)
testx_model=testx.copy()
testx_model.drop(['ID_Test'],axis=1,inplace=True)

#Basic conversion of Categorical variables
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy=pd.get_dummies(trainxy, columns=["Occupation type"], prefix=["Occupation"])

#Imputing Age
#trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2>=194.9185),'Age']=1
#trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2<194.9185),'Age']=0

#Imputing Loan type
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47<trainxy.Score2),'Loan type']=1
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47>trainxy.Score2),'Loan type']=0

#Imputing Occupation type
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(602.81<trainxy.Score4),'Occupation_X']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(598.71<trainxy.Score4)&(trainxy.Score4<602.38),'Occupation_Y']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(trainxy.Score4<598.67),'Occupation_Z']=1

#Imputing Score4
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1)].Score4.median()

#Imputing Expense and Score5
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score2 and Score4
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score3 and the remaining Expense
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score3.notna()),'Expense']=trainxy.Score3*11.3017+1628.15
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score3.isna()),'Score3']=(trainxy.Expense-1628.15)/11.3017

#Imputing Score1 and the remaining Score4 
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Score1.notna()),'Score4']=trainxy.Score1*17.7343+597.066
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Score1.isna()),'Score1']=(trainxy.Score4-597.066)/17.7343

#Imputing Score2 and Income
#trainxy.loc[(trainxy.Income.isna())&(trainxy.Score2.notna()),'Income']=trainxy.Score2*20.9689+11613.7
#trainxy.loc[(trainxy.Income.notna())&(trainxy.Score2.isna()),'Score2']=(trainxy.Income-11613.7)/20.9689


#Dropping Score5 since it has perfect correlation w Expense
#trainxy.drop(['Score5'],axis=1,inplace=True)

#Dropping the remaining Null values
trainxy.dropna(inplace=True)
trainx=trainxy.copy()
trainx.drop(['Label'],axis=1,inplace=True)

In [5]:
loan_modeld2 = xgb.XGBClassifier(random_state=0,n_estimators=1000,learning_rate=0.05,max_depth=6, tree_method='auto')
crossd2=cross_val_score(estimator=loan_modeld2, X=trainx, y=trainxy.Label, error_score='raise',scoring='accuracy') 
print(crossd2.mean()*100)

98.45540125679804


In [6]:
trainxy = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_xy.csv')
trainx  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_x.csv')
trainy  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_y.csv')
testx   = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/test_x.csv')
trainxy.drop(['ID'],axis=1,inplace=True)
trainx.drop(['ID'],axis=1,inplace=True)
trainy.drop(['ID'],axis=1,inplace=True)
testx_model=testx.copy()
testx_model.drop(['ID_Test'],axis=1,inplace=True)

#Basic conversion of Categorical variables
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy=pd.get_dummies(trainxy, columns=["Occupation type"], prefix=["Occupation"])

#Imputing Age
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2>=194.9185),'Age']=1
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2<194.9185),'Age']=0

#Imputing Loan type
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47<trainxy.Score2),'Loan type']=1
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47>trainxy.Score2),'Loan type']=0

#Imputing Occupation type
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(602.81<trainxy.Score4),'Occupation_X']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(598.71<trainxy.Score4)&(trainxy.Score4<602.38),'Occupation_Y']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(trainxy.Score4<598.67),'Occupation_Z']=1

#Imputing Score4
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1)].Score4.median()

#Imputing Expense and Score5
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score2 and Score4
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score3 and the remaining Expense
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score3.notna()),'Expense']=trainxy.Score3*11.3017+1628.15
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score3.isna()),'Score3']=(trainxy.Expense-1628.15)/11.3017

#Imputing Score1 and the remaining Score4 
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Score1.notna()),'Score4']=trainxy.Score1*17.7343+597.066
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Score1.isna()),'Score1']=(trainxy.Score4-597.066)/17.7343

#Imputing Score2 and Income
#trainxy.loc[(trainxy.Income.isna())&(trainxy.Score2.notna()),'Income']=trainxy.Score2*20.9689+11613.7
#trainxy.loc[(trainxy.Income.notna())&(trainxy.Score2.isna()),'Score2']=(trainxy.Income-11613.7)/20.9689


#Dropping Score5 since it has perfect correlation w Expense
trainxy.drop(['Score5'],axis=1,inplace=True)

#Dropping the remaining Null values
trainxy.dropna(inplace=True)
trainx=trainxy.copy()
trainx.drop(['Label'],axis=1,inplace=True)

In [7]:
loan_modeld3 = xgb.XGBClassifier(random_state=0,n_estimators=1000,learning_rate=0.05,max_depth=6, tree_method='auto')
crossd3=cross_val_score(estimator=loan_modeld3, X=trainx, y=trainxy.Label, error_score='raise',scoring='accuracy') 
print(crossd3.mean()*100,crossd3)

98.46302015086964 [0.98562471 0.98470714 0.9839425  0.9838648  0.98501185]


In [8]:
trainxy = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_xy.csv')
trainx  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_x.csv')
trainy  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_y.csv')
testx   = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/test_x.csv')
trainxy.drop(['ID'],axis=1,inplace=True)
trainx.drop(['ID'],axis=1,inplace=True)
trainy.drop(['ID'],axis=1,inplace=True)
testx_model=testx.copy()
testx_model.drop(['ID_Test'],axis=1,inplace=True)

#Basic conversion of Categorical variables
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy=pd.get_dummies(trainxy, columns=["Occupation type"], prefix=["Occupation"])

#Imputing Age
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2>=194.9185),'Age']=1
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2<194.9185),'Age']=0

#Imputing Loan type
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47<trainxy.Score2),'Loan type']=1
#trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47>trainxy.Score2),'Loan type']=0

#Imputing Occupation type
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(602.81<trainxy.Score4),'Occupation_X']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(598.71<trainxy.Score4)&(trainxy.Score4<602.38),'Occupation_Y']=1
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(trainxy.Score4<598.67),'Occupation_Z']=1

#Imputing Score4
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1)].Score4.median()
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1)].Score4.median()

#Imputing Expense and Score5
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score2 and Score4
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score3 and the remaining Expense
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score3.notna()),'Expense']=trainxy.Score3*11.3017+1628.15
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score3.isna()),'Score3']=(trainxy.Expense-1628.15)/11.3017

#Imputing Score1 and the remaining Score4 
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Score1.notna()),'Score4']=trainxy.Score1*17.7343+597.066
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Score1.isna()),'Score1']=(trainxy.Score4-597.066)/17.7343

#Imputing Score2 and Income
#trainxy.loc[(trainxy.Income.isna())&(trainxy.Score2.notna()),'Income']=trainxy.Score2*20.9689+11613.7
#trainxy.loc[(trainxy.Income.notna())&(trainxy.Score2.isna()),'Score2']=(trainxy.Income-11613.7)/20.9689


#Dropping Score5 since it has perfect correlation w Expense
trainxy.drop(['Score5'],axis=1,inplace=True)

#Dropping the remaining Null values
trainxy.dropna(inplace=True)
trainx=trainxy.copy()
trainx.drop(['Label'],axis=1,inplace=True)

In [9]:
loan_modeld4 = xgb.XGBClassifier(random_state=0,n_estimators=1000,learning_rate=0.05,max_depth=6, tree_method='auto')
crossd4=cross_val_score(estimator=loan_modeld4, X=trainx, y=trainxy.Label, error_score='raise',scoring='accuracy') 
print(crossd4.mean()*100,crossd4)

98.46302015086964 [0.98562471 0.98470714 0.9839425  0.9838648  0.98501185]


In [10]:
trainxy = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_xy.csv')
trainx  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_x.csv')
trainy  = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/train_y.csv')
testx   = pd.read_csv('/home/kshitij/Acads/MS4610 project/IDA project dataset/test_x.csv')
trainxy.drop(['ID'],axis=1,inplace=True)
trainx.drop(['ID'],axis=1,inplace=True)
trainy.drop(['ID'],axis=1,inplace=True)
testx_model=testx.copy()
testx_model.drop(['ID_Test'],axis=1,inplace=True)

#Basic conversion of Categorical variables
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy=pd.get_dummies(trainxy, columns=["Occupation type"], prefix=["Occupation"])

#Imputing Age
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2>=194.9185),'Age']=1
trainxy.loc[(trainxy.Age.isna())&(trainxy.Score2<194.9185),'Age']=0

#Imputing Loan type
trainxy["Loan type"] = trainxy["Loan type"].astype('category')
trainxy["Loan type"] = trainxy["Loan type"].cat.codes
trainxy.loc[(trainxy['Loan type']==-1),'Loan type']=np.NaN
trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47<trainxy.Score2),'Loan type']=1
trainxy.loc[(trainxy.Score2.notna())&(trainxy['Loan type'].isna())&(192.47>trainxy.Score2),'Loan type']=0

#Imputing Expense and Score5
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Score2 and Score4
trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score5.notna()),'Expense']=trainxy.Score5*2.06976-5339.9
trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score5.isna()),'Score5']=(trainxy.Expense+5339.9)/2.06976

#Imputing Occupation type
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(602.81<trainxy.Score4),'Occupation_X']=1
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(598.71<trainxy.Score4)&(trainxy.Score4<602.38),'Occupation_Y']=1
trainxy.loc[(trainxy.Score4.notna())&(trainxy.Occupation_X==0)&(trainxy.Occupation_Y==0)&(trainxy.Occupation_Z==0)&(trainxy.Score4<598.67),'Occupation_Z']=1

#Imputing Score4
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_X==1)].Score4.median()
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Y==1)].Score4.median()
trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1),'Score4']=trainxy.loc[(trainxy.Score4.isna())&(trainxy.Occupation_Z==1)].Score4.median()


#Imputing Score3 and the remaining Expense
#trainxy.loc[(trainxy.Expense.isna())&(trainxy.Score3.notna()),'Expense']=trainxy.Score3*11.3017+1628.15
#trainxy.loc[(trainxy.Expense.notna())&(trainxy.Score3.isna()),'Score3']=(trainxy.Expense-1628.15)/11.3017

#Imputing Score1 and the remaining Score4 
#trainxy.loc[(trainxy.Score4.isna())&(trainxy.Score1.notna()),'Score4']=trainxy.Score1*17.7343+597.066
#trainxy.loc[(trainxy.Score4.notna())&(trainxy.Score1.isna()),'Score1']=(trainxy.Score4-597.066)/17.7343

#Imputing Score2 and Income
#trainxy.loc[(trainxy.Income.isna())&(trainxy.Score2.notna()),'Income']=trainxy.Score2*20.9689+11613.7
#trainxy.loc[(trainxy.Income.notna())&(trainxy.Score2.isna()),'Score2']=(trainxy.Income-11613.7)/20.9689


#Dropping Score5 since it has perfect correlation w Expense
trainxy.drop(['Score5'],axis=1,inplace=True)

#Dropping the remaining Null values
trainxy.dropna(inplace=True)
trainx=trainxy.copy()
trainx.drop(['Label'],axis=1,inplace=True)

In [11]:
loan_modeld5 = xgb.XGBClassifier(random_state=0,n_estimators=1000,learning_rate=0.05,max_depth=6, tree_method='auto')
crossd5=cross_val_score(estimator=loan_modeld5, X=trainx, y=trainxy.Label, error_score='raise',scoring='accuracy') 
print(crossd5.mean()*100)

98.43807993403931
