<a href="https://colab.research.google.com/github/abhranil-datascience/LoanDeliquencyPredictionHackathon/blob/master/LoanPredictionHackathon_AV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Public LeaderBoard Score: 0.33
######################### Suppress Warnings #######################################

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

############################## Mount Drive ########################################

# google.colab is only importable inside a Colab runtime; the Drive is mounted at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

############################## Change Directory ###################################

# The relative paths declared further down (Dataset/..., BestModel/...) resolve against this folder.
import os
os.chdir('/content/gdrive/My Drive/AnalyticsVidhya/MLHackathon2019')

################# Root Import Statements #####################

import pandas as pd
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as K
from sklearn.metrics import f1_score
from keras.optimizers import Adam
import random
#from numpy.random import seed
#seed(1)

################# Declare Functions ##########################
def recall_m(y_true, y_pred):
    """Batch-wise recall computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded, so predictions are
    effectively thresholded at 0.5 before counting.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the ratio finite when the batch has no positives.
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch-wise precision computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded, so predictions are
    effectively thresholded at 0.5 before counting.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the ratio finite when nothing is predicted positive.
    return K.sum(hit_mask) / (K.sum(flagged_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch-wise F1 score: harmonic mean of precision_m and recall_m."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # K.epsilon() guards against 0/0 when both precision and recall are zero.
    harmonic_half = (p*r)/(p+r+K.epsilon())
    return 2*harmonic_half

################# Declare Constants ##########################

# Input CSVs and checkpoint location, relative to the current working directory.
TrainingDatasetPath="Dataset/LoanDeliquencyTrain.csv"
TestDatasetPath="Dataset/LoanDeliquencyTest.csv"
BestClassifierModel="BestModel/best_model.hdf5"

# Checkpoint callback keyed on the validation F1 metric (higher is better).
SaveBestModel=ModelCheckpoint(
    filepath=BestClassifierModel,
    monitor='val_f1_m',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# NOTE(review): this rebinds the imported class name `EarlyStopping` to an instance,
# so executing this statement a second time in the same session will fail.
EarlyStopping=EarlyStopping(
    monitor='val_f1_m',
    min_delta=0.001,
    patience=12,
    verbose=1,
    mode='max',
    baseline=None,
    restore_best_weights=True,
)

###################################### Description #####################################################################################

#Index 0 : source || Index 1: financial_institution || Index 2: interest_rate || Index 3: unpaid_principal_bal || Index 4: loan_term || Index 5 : origination_date ||

#Index 6 : first_payment_date || Index 7: loan_to_value || Index 8: number_of_borrowers || Index 9: debt_to_income_ratio || Index 10: borrower_credit_score ||  

#Index 11 : loan_purpose || Index 12: insurance_percent || Index 13: co-borrower_credit_score || Index 14: insurance_type || 

#Index 15 : m1 || Index 16: m2 || Index 17: m3 || Index 18: m4 || Index 19: m5 || Index 20 : m6 || Index 21: m7 || Index 22: m8 || Index 23: m9 || Index 24 : m10 || 

#Index 25 : m11 || Index 26: m12 || Index 27: m13(Target Variable)

#####################################################################################################################################################################

#################################################################### Functions ######################################################################################
# 1. Format dates, since Train and Test use different date formats
def FormatDate(TrainDataset,TestDataset):
  """Replace the date strings in columns 5 and 6 with short month codes, in place.

  Train rows: column 5 is split on '-' (month = 2nd token) and column 6 on '/'
  (month = 1st token, numeric). Test rows: column 5 is split on '/' (month =
  2nd token) and column 6 on '-' (month = 1st token, three-letter name).
  Months outside the lookup tables become "NA".

  Returns the same two (mutated) arrays as a tuple.
  """
  origin_codes={"01":"J","02":"F","03":"M"}
  train_payment_codes={"02":"F","03":"M","04":"A","05":"MA"}
  test_payment_codes={"Feb":"F","Mar":"M","Apr":"A","May":"MA"}

  def _recode(data,row_total,origin_sep,payment_sep,payment_codes):
    # Both month tokens are extracted before either cell is overwritten.
    for idx in range(row_total):
      record=data[idx]
      origin_month=record[5].split(origin_sep)[1]
      payment_month=record[6].split(payment_sep)[0]
      record[5]=origin_codes.get(origin_month,"NA")
      record[6]=payment_codes.get(payment_month,"NA")

  train_total,test_total=TrainDataset.shape[0],TestDataset.shape[0]
  _recode(TrainDataset,train_total,'-','/',train_payment_codes)
  _recode(TestDataset,test_total,'/','-',test_payment_codes)
  return TrainDataset,TestDataset

# 2. Function to deal with invalid values
def DealWithMissingValues(dataset):
  """Repair invalid credit-score values in place and return the same array.

  * Column 10 (borrower credit score): a value of 0 is treated as missing and
    replaced by the mean of the non-zero scores. The whole column is written
    back as floats. If every score is 0 there is nothing to average, so the
    column is left at 0 instead of raising.
  * Column 13 (co-borrower credit score): forced to 0 on rows where column 8
    (number of borrowers) equals 1.

  Implemented with NumPy because sklearn.preprocessing.Imputer, which this
  function used to rely on, was removed in scikit-learn 0.22.
  """
  scores=dataset[:,10].astype(float)
  missing=scores==0
  if missing.any() and not missing.all():
    scores[missing]=scores[~missing].mean()
  dataset[:,10]=scores

  # A single-borrower loan cannot carry a co-borrower score.
  needs_reset=(dataset[:,8]==1) & (dataset[:,13]!=0)
  dataset[needs_reset,13]=0
  return dataset
  
# 3. Understand pattern in M1 to M12
def UnderstandPatternFromM1toM12(m1_to_m12):
  """Derive per-row delinquency features from the monthly columns m1..m12.

  m1_to_m12 is a 2-D array with one row per loan; columns 0..11 are read by
  position. Returns twelve lists (one entry per row), in this order:
  sum_of_defaults, regular_defaulter, odd_defaulter, last_month_defaulter,
  last_two_month_defaulter, num_of_defaults_in_last_two_month,
  last_three_month_defaulter, num_of_defaults_in_last_three_month,
  num_of_deliquency_till_m13, ProbabilityToDefaultInWhole,
  ProbabilityToDefaultInSixMon, ProbabilityToDefaultInThreeMon.
  The *_defaulter lists hold "Y"/"N" flags.
  """
  row_total=m1_to_m12.shape[0]
  column_total=m1_to_m12.shape[1]

  sum_of_defaults=[]
  regular_defaulter=[]
  odd_defaulter=[]
  last_month_defaulter=[]
  last_two_month_defaulter=[]
  num_of_defaults_in_last_two_month=[]
  last_three_month_defaulter=[]
  num_of_defaults_in_last_three_month=[]
  num_of_deliquency_till_m13=[]
  ProbabilityToDefaultInWhole=[]
  ProbabilityToDefaultInSixMon=[]
  ProbabilityToDefaultInThreeMon=[]

  def _flag(condition):
    return "Y" if condition else "N"

  for idx in range(row_total):
    months=m1_to_m12[idx]
    observed=[months[col] for col in range(column_total)]
    hit=[value != 0 for value in observed]

    ################## Totals and share of months with a default ##################
    sum_of_defaults.append(sum(observed))
    ProbabilityToDefaultInWhole.append(sum(1 for h in hit if h)/12)
    # Columns 6.. are the last six months, columns 9.. the last three.
    ProbabilityToDefaultInSixMon.append(sum(1 for h in hit[6:] if h)/6)
    ProbabilityToDefaultInThreeMon.append(sum(1 for h in hit[9:] if h)/3)

    ### Regular defaulter: a default in more than two of the six two-month windows ###
    active_windows=sum(1 for start in range(0,12,2) if months[start]+months[start+1] > 0)
    regular_defaulter.append(_flag(active_windows > 2))

    ########### Odd defaulter: default in more than half of m1,m3,...,m11 ###############
    odd_hits=sum(1 for pos in range(0,12,2) if months[pos] != 0)
    odd_defaulter.append(_flag(odd_hits/6 > 0.5))

    ############ Trailing one / two / three month flags and counts #############
    last_month_defaulter.append(_flag(months[11]>0))
    last_two_month_defaulter.append(_flag(months[10]>0 and months[11]>0))
    num_of_defaults_in_last_two_month.append(months[10]+months[11])
    last_three_month_defaulter.append(_flag(months[9]>0 and months[10]>0 and months[11]>0))
    num_of_defaults_in_last_three_month.append(months[9]+months[10]+months[11])

    ###### Number of deliquency till m13 ###############
    # Month k (0-based) counts when its value is at least 12-k.
    num_of_deliquency_till_m13.append(sum(1 for pos in range(12) if months[pos] >= 12-pos))

  return sum_of_defaults,regular_defaulter,odd_defaulter,last_month_defaulter,last_two_month_defaulter,num_of_defaults_in_last_two_month,last_three_month_defaulter,num_of_defaults_in_last_three_month,num_of_deliquency_till_m13,ProbabilityToDefaultInWhole,ProbabilityToDefaultInSixMon,ProbabilityToDefaultInThreeMon

# 4. Segment Credit Score
def SegmentScreditScore(dataset,credit_score_col_num):
  """Bucket the credit score in the given column into a band label per row.

  Bands: 300-579 "VP", 580-669 "P", 670-739 "A", 740-799 "G", 800-850 "VG";
  anything else (including 0) maps to "NA". Returns a list of labels.
  """
  # (exclusive lower bound, exclusive upper bound, label), checked in order.
  open_bands=((299,580,"VP"),(579,670,"P"),(669,740,"A"),(739,800,"G"))

  def _band(score):
    for lower,upper,label in open_bands:
      if lower < score < upper:
        return label
    # The top band is the only one with an inclusive upper bound.
    if 799 < score <= 850:
      return "VG"
    return "NA"

  return [_band(dataset[idx][credit_score_col_num]) for idx in range(dataset.shape[0])]

# 5. Parse Origin Date
def ParseOriginandfirstpaymentDate(dataset,origin_column_index,payment_column_index):
  """Derive interval / recurrence features from the origin and payment month codes.

  Reads the month codes produced by FormatDate ("J", "F", "M", "A", "MA", "NA")
  from the two given columns and returns five per-row lists:
  OriginMonth, PaymentMonth, Interval ("A".."E"), Recurrence ("A".."C") and
  EMIFrequency (1, 2 or 3).
  """
  interval_by_pair={
    ("J","F"):"A",
    ("J","M"):"B",
    ("F","A"):"C",
    ("F","M"):"D",
    ("M","A"):"D",
  }
  recurrence_by_pair={
    ("J","F"):("A",1),
    ("F","M"):("A",1),
    ("M","A"):("A",1),
    ("J","M"):("B",2),
    ("F","A"):("B",2),
    ("M","MA"):("B",2),
  }
  OriginMonth=[]
  PaymentMonth=[]
  Interval=[]
  Recurrence=[]
  EMIFrequency=[]
  for idx in range(dataset.shape[0]):
    origin=dataset[idx][origin_column_index]
    payment=dataset[idx][payment_column_index]
    OriginMonth.append(origin)
    PaymentMonth.append(payment)
    pair=(origin,payment)
    # Any combination missing from the tables falls into the catch-all class.
    Interval.append(interval_by_pair.get(pair,"E"))
    recurrence_code,frequency=recurrence_by_pair.get(pair,("C",3))
    Recurrence.append(recurrence_code)
    EMIFrequency.append(frequency)
  return OriginMonth,PaymentMonth,Interval,Recurrence,EMIFrequency
    
# 6. Parse Loan Term and Principal
def ParseLoanTermandPrincipal(dataset,interest_rate,loan_term_index,unpaid_principal_index,borrower_count,EMIFrequency,insurance_percent,debt_to_income_ratio):
  """Compute loan-size features for every row.

  interest_rate, loan_term_index and unpaid_principal_index are column indexes
  into dataset. borrower_count, EMIFrequency, insurance_percent and
  debt_to_income_ratio are indexed as [row][0]. A borrower_count of 0 is
  treated as one borrower, anything else as two.

  Returns nine per-row lists: LoanTerm, UnpaidPrincipal, PtoTRatio,
  InterestPending, EMI, EMIPerBorrower, UnpaidPrincipalNotInsured, Potential,
  LoanSize.
  """
  LoanTerm=[]
  UnpaidPrincipal=[]
  PtoTRatio=[]
  InterestPending=[]
  EMI=[]
  EMIPerBorrower=[]
  UnpaidPrincipalNotInsured=[]
  Potential=[]
  LoanSize=[]
  for idx in range(dataset.shape[0]):
    record=dataset[idx]
    term=record[loan_term_index]
    principal=record[unpaid_principal_index]
    rate=record[interest_rate]
    monthly_principal=principal/term
    borrowers=1 if borrower_count[idx][0]==0 else 2
    frequency=EMIFrequency[idx][0]
    insured_percent=insurance_percent[idx][0]
    income_ratio=debt_to_income_ratio[idx][0]

    installment=(monthly_principal+monthly_principal*(100+rate))/frequency

    LoanTerm.append(term)
    UnpaidPrincipal.append(principal)
    PtoTRatio.append(monthly_principal)
    InterestPending.append(term*principal*rate)
    EMI.append(installment)
    EMIPerBorrower.append(installment/borrowers)
    UnpaidPrincipalNotInsured.append(principal-(principal*insured_percent/100))
    Potential.append(principal/(term*income_ratio))
    LoanSize.append(principal*(100+rate)/term)
  return LoanTerm,UnpaidPrincipal,PtoTRatio,InterestPending,EMI,EMIPerBorrower,UnpaidPrincipalNotInsured,Potential,LoanSize


## 7. OneHotEncoding Data
def OneHotEncodeDataset(feature):
  """One-hot encode column 0 of a 2-D array and drop the first indicator column.

  Categories are ordered by sorting their values. The indicator for the first
  (smallest) category is removed to avoid the dummy-variable trap, so a column
  with k distinct values yields k-1 output columns. Any further columns of
  feature are appended unchanged (as floats) to the right of the indicators.

  Side effect kept from the previous implementation: column 0 of feature is
  overwritten in place with the integer category codes.

  Implemented with NumPy because OneHotEncoder(categorical_features=...),
  which this function used to rely on, was removed in scikit-learn 0.22.

  Returns a float ndarray of shape (n_rows, k - 1 + n_extra_columns).
  """
  categories,codes=np.unique(feature[:,0],return_inverse=True)
  codes=np.asarray(codes).reshape(-1)
  feature[:,0]=codes
  # Row i of the identity matrix is the indicator vector for category i.
  indicators=np.eye(len(categories))[codes]
  passthrough=feature[:,1:].astype(float)
  featureOHE=np.hstack((indicators,passthrough))
  return featureOHE[:,1:]

## 8. Preprocess Data
def PreprocessDataset(dataset):
  """Build the model's feature matrix from the raw 27-column dataset.

  Slices each raw column out of dataset, derives the engineered features via
  the helper functions above, and concatenates everything column-wise twice:
  once with categorical columns one-hot encoded and once with them left as-is.

  NOTE: several statements below assign through slices of dataset
  (loan_purpose, num_of_borrower) and OneHotEncodeDataset overwrites column 0
  of whatever it is given, so statement order matters and dataset is modified.
  Presumably dataset is a NumPy object array, in which case those slices are
  views -- confirm against the caller.

  Returns (DatasetPreProcessed, DatasetPreProcessedNoOHE).
  """
  ########## Get M1 to M12 insights ########################
  m1_to_m12=dataset[:,15:27]
  sum_of_defaults,regular_defaulter,odd_defaulter,last_month_defaulter,last_two_month_defaulter,num_of_defaults_in_last_two_month,last_three_month_defaulter,num_of_defaults_in_last_three_month,num_of_deliquency_till_m13,ProbabilityToDefaultInWhole,ProbabilityToDefaultInSixMon,ProbabilityToDefaultInThreeMon=UnderstandPatternFromM1toM12(m1_to_m12)
  # NOTE(review): each chained assignment below also rebinds m1_to_m12 to the
  # single-column slice; m1_to_m12 is not read again after this point.
  m1=m1_to_m12=dataset[:,15:16]
  m2=m1_to_m12=dataset[:,16:17]
  m3=m1_to_m12=dataset[:,17:18]
  m4=m1_to_m12=dataset[:,18:19]
  m5=m1_to_m12=dataset[:,19:20]
  m6=m1_to_m12=dataset[:,20:21]
  m7=m1_to_m12=dataset[:,21:22]
  m8=m1_to_m12=dataset[:,22:23]
  m9=m1_to_m12=dataset[:,23:24]
  m10=m1_to_m12=dataset[:,24:25]
  m11=m1_to_m12=dataset[:,25:26]
  m12=m1_to_m12=dataset[:,26:27]
  # Turn every per-row list into an (n, 1) column so it can be concatenated.
  sum_of_defaults=np.array(sum_of_defaults,dtype=object).reshape(-1,1)
  regular_defaulter=np.array(regular_defaulter,dtype=object).reshape(-1,1)
  odd_defaulter=np.array(odd_defaulter,dtype=object).reshape(-1,1)
  last_month_defaulter=np.array(last_month_defaulter,dtype=object).reshape(-1,1)
  last_two_month_defaulter=np.array(last_two_month_defaulter,dtype=object).reshape(-1,1)
  num_of_defaults_in_last_two_month=np.array(num_of_defaults_in_last_two_month,dtype=object).reshape(-1,1)
  last_three_month_defaulter=np.array(last_three_month_defaulter,dtype=object).reshape(-1,1)
  num_of_defaults_in_last_three_month=np.array(num_of_defaults_in_last_three_month,dtype=object).reshape(-1,1)
  num_of_deliquency_till_m13=np.array(num_of_deliquency_till_m13,dtype=object).reshape(-1,1)
  ProbabilityToDefaultInWhole=np.array(ProbabilityToDefaultInWhole,dtype=object).reshape(-1,1)
  ProbabilityToDefaultInSixMon=np.array(ProbabilityToDefaultInSixMon,dtype=object).reshape(-1,1)
  ProbabilityToDefaultInThreeMon=np.array(ProbabilityToDefaultInThreeMon,dtype=object).reshape(-1,1)

  ############## Insurance Type ####################
  insurance_type=dataset[:,14:15]

  ############## Co Borrower's Credit Score ####################
  co_borrower_credit_score=SegmentScreditScore(dataset,13)
  co_borrower_credit_score=np.array(co_borrower_credit_score,dtype=object).reshape(-1,1)

  ############## Insurance Percent #############################
  insurance_percent=dataset[:,12:13]

  ############### Loan Purpose #########################
  # Shorten the three known purpose codes to single letters.
  loan_purpose=dataset[:,11]
  loan_purpose[loan_purpose=="A23"]="A"
  loan_purpose[loan_purpose=="B12"]="B"
  loan_purpose[loan_purpose=="C86"]="C"
  loan_purpose=loan_purpose.reshape(-1,1)

  ################### Borrower's Credit Score ######################
  # The raw score is kept as a numeric feature alongside its band label.
  borrower_credit_score_number=dataset[:,10:11]
  borrower_credit_score=SegmentScreditScore(dataset,10)
  borrower_credit_score=np.array(borrower_credit_score,dtype=object).reshape(-1,1)

  ################### Debt to income ratio #########################
  debt_to_income_ratio=dataset[:,9:10]

  ################## Number Of Borrower #############################
  # Recode 1 -> 0 and 2 -> 1; the 1 -> 0 step must run first.
  num_of_borrower=dataset[:,8]
  num_of_borrower[num_of_borrower==1]=0
  num_of_borrower[num_of_borrower==2]=1
  num_of_borrower=num_of_borrower.reshape(-1,1)

  ################# Loan To Value #############################
  loan_to_value=dataset[:,7:8]

  ######### Origin, Payment and Interval Dates ################
  OriginMonth,PaymentMonth,Interval,Recurrence,EMIFrequency=ParseOriginandfirstpaymentDate(dataset,5,6)
  OriginMonth=np.array(OriginMonth,dtype=object).reshape(-1,1)
  PaymentMonth=np.array(PaymentMonth,dtype=object).reshape(-1,1)
  Interval=np.array(Interval,dtype=object).reshape(-1,1)
  Recurrence=np.array(Recurrence,dtype=object).reshape(-1,1)
  EMIFrequency=np.array(EMIFrequency,dtype=object).reshape(-1,1)

  ##### Unpaid Principal, Loan Term and PtoT ratio ############
  # Column 8 is read here after the 0/1 recode above, which is the encoding
  # ParseLoanTermandPrincipal tests for (0 -> one borrower, otherwise two).
  borrower_count=dataset[:,8:9]
  LoanTerm,UnpaidPrincipal,PtoTRatio,InterestPending,EMI,EMIPerBorrower,UnpaidPrincipalNotInsured,Potential,LoanSize=ParseLoanTermandPrincipal(dataset,2,4,3,borrower_count,EMIFrequency,insurance_percent,debt_to_income_ratio)
  LoanTerm=np.array(LoanTerm,dtype=object).reshape(-1,1)
  UnpaidPrincipal=np.array(UnpaidPrincipal,dtype=object).reshape(-1,1)
  PtoTRatio=np.array(PtoTRatio,dtype=object).reshape(-1,1)
  InterestPending=np.array(InterestPending,dtype=object).reshape(-1,1)
  EMI=np.array(EMI,dtype=object).reshape(-1,1)
  EMIPerBorrower=np.array(EMIPerBorrower,dtype=object).reshape(-1,1)
  UnpaidPrincipalNotInsured=np.array(UnpaidPrincipalNotInsured,dtype=object).reshape(-1,1)
  Potential=np.array(Potential,dtype=object).reshape(-1,1)
  LoanSize=np.array(LoanSize,dtype=object).reshape(-1,1)

  ################# Interest Rate #############################
  interest_rate=dataset[:,2:3]

  ################ Financial Institution #######################
  financial_institution=dataset[:,1:2]

  ###################### Source #################################
  source=dataset[:,0:1]
  
  # Built before the one-hot step below, which overwrites column 0 of each
  # categorical array with integer codes.
  DatasetPreProcessedNoOHE= np.concatenate((source,financial_institution,interest_rate,UnpaidPrincipal,LoanTerm,PtoTRatio,OriginMonth,PaymentMonth,Interval,Recurrence,
                                            loan_to_value,num_of_borrower,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co_borrower_credit_score,
                                            insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,sum_of_defaults,regular_defaulter,odd_defaulter,last_month_defaulter,
                                            last_two_month_defaulter,num_of_defaults_in_last_two_month,last_three_month_defaulter,num_of_defaults_in_last_three_month,
                                            num_of_deliquency_till_m13,InterestPending,ProbabilityToDefaultInWhole,ProbabilityToDefaultInSixMon,
                                            ProbabilityToDefaultInThreeMon,EMI,EMIPerBorrower,UnpaidPrincipalNotInsured,LoanSize,Potential,borrower_credit_score_number),axis=1)
  
  ################# OneHotEncodeDataset #########################
  regular_defaulterOHE=OneHotEncodeDataset(regular_defaulter)
  odd_defaulterOHE=OneHotEncodeDataset(odd_defaulter)
  last_month_defaulterOHE=OneHotEncodeDataset(last_month_defaulter)
  last_two_month_defaulterOHE=OneHotEncodeDataset(last_two_month_defaulter)
  last_three_month_defaulterOHE=OneHotEncodeDataset(last_three_month_defaulter)
  insurance_typeOHE=OneHotEncodeDataset(insurance_type)
  co_borrower_credit_scoreOHE=OneHotEncodeDataset(co_borrower_credit_score)
  loan_purposeOHE=OneHotEncodeDataset(loan_purpose)
  borrower_credit_scoreOHE=OneHotEncodeDataset(borrower_credit_score)
  OriginMonthOHE=OneHotEncodeDataset(OriginMonth)
  PaymentMonthOHE=OneHotEncodeDataset(PaymentMonth)
  IntervalOHE=OneHotEncodeDataset(Interval)
  RecurrenceOHE=OneHotEncodeDataset(Recurrence)
  financial_institutionOHE=OneHotEncodeDataset(financial_institution)
  sourceOHE=OneHotEncodeDataset(source)
  # The string literal below is disabled debugging output (column counts per feature).
  """
  print("sourceOHE: "+str(sourceOHE.shape[1]))
  print("financial_institutionOHE: "+str(financial_institutionOHE.shape[1]))
  print("interest_rate: "+str(interest_rate.shape[1]))
  print("UnpaidPrincipal: "+str(UnpaidPrincipal.shape[1]))
  print("LoanTerm: "+str(LoanTerm.shape[1]))
  print("PtoTRatio: "+str(PtoTRatio.shape[1]))
  print("OriginMonthOHE: "+str(OriginMonthOHE.shape[1]))
  print("PaymentMonthOHE: "+str(PaymentMonthOHE.shape[1]))
  print("IntervalOHE: "+str(IntervalOHE.shape[1]))
  print("RecurrenceOHE: "+str(RecurrenceOHE.shape[1]))
  print("loan_to_value: "+str(loan_to_value.shape[1]))
  print("num_of_borrower: "+str(num_of_borrower.shape[1]))
  print("debt_to_income_ratio: "+str(debt_to_income_ratio.shape[1]))
  print("borrower_credit_scoreOHE: "+str(borrower_credit_scoreOHE.shape[1]))
  print("loan_purposeOHE: "+str(loan_purposeOHE.shape[1]))
  print("insurance_percent: "+str(insurance_percent.shape[1]))
  print("co_borrower_credit_scoreOHE: "+str(co_borrower_credit_scoreOHE.shape[1]))
  print("insurance_typeOHE: "+str(insurance_typeOHE.shape[1]))
  print("m1: "+str(m1.shape[1]))
  print("m2: "+str(m2.shape[1]))
  print("m3: "+str(m3.shape[1]))
  print("m4: "+str(m4.shape[1]))
  print("m5: "+str(m5.shape[1]))
  print("m6: "+str(m6.shape[1]))
  print("m7: "+str(m7.shape[1]))
  print("m8: "+str(m8.shape[1]))
  print("m9: "+str(m9.shape[1]))
  print("m10: "+str(m10.shape[1]))
  print("m11: "+str(m11.shape[1]))
  print("m12: "+str(m12.shape[1]))
  print("sum_of_defaults: "+str(sum_of_defaults.shape[1]))
  print("regular_defaulterOHE: "+str(regular_defaulterOHE.shape[1]))
  print("odd_defaulterOHE: "+str(odd_defaulterOHE.shape[1]))
  print("last_month_defaulterOHE: "+str(last_month_defaulterOHE.shape[1]))
  print("last_two_month_defaulterOHE: "+str(last_two_month_defaulterOHE.shape[1]))
  print("num_of_defaults_in_last_two_month: "+str(num_of_defaults_in_last_two_month.shape[1]))
  print("last_three_month_defaulterOHE: "+str(last_three_month_defaulterOHE.shape[1]))
  print("num_of_defaults_in_last_three_month: "+str(num_of_defaults_in_last_three_month.shape[1]))
  """
  
  #################### Concat Columns ############################
  # Same column order as DatasetPreProcessedNoOHE, with each categorical
  # column replaced by its one-hot block.
  DatasetPreProcessed= np.concatenate((sourceOHE,financial_institutionOHE,interest_rate,UnpaidPrincipal,LoanTerm,PtoTRatio,OriginMonthOHE,PaymentMonthOHE,IntervalOHE,RecurrenceOHE,
                                       loan_to_value,num_of_borrower,debt_to_income_ratio,borrower_credit_scoreOHE,loan_purposeOHE,insurance_percent,co_borrower_credit_scoreOHE,
                                       insurance_typeOHE,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,sum_of_defaults,regular_defaulterOHE,odd_defaulterOHE,last_month_defaulterOHE,
                                       last_two_month_defaulterOHE,num_of_defaults_in_last_two_month,last_three_month_defaulterOHE,num_of_defaults_in_last_three_month,
                                       num_of_deliquency_till_m13,InterestPending,ProbabilityToDefaultInWhole,ProbabilityToDefaultInSixMon,
                                       ProbabilityToDefaultInThreeMon,EMI,EMIPerBorrower,UnpaidPrincipalNotInsured,LoanSize,Potential,borrower_credit_score_number),axis=1)
  
  
  return DatasetPreProcessed,DatasetPreProcessedNoOHE

## 9. Scale Dataset
def ScaleDataset(PreprocessedDataset):
  """Scale every column with a RobustScaler fitted on the given data.

  The scaler centres each column on its median and divides by the spread
  between the 5th and 95th percentiles. Returns the scaled array.
  """
  # The StandardScaler that used to be instantiated here was never applied, so it is gone.
  robust_scaler=RobustScaler(quantile_range=(5,95))
  return robust_scaler.fit_transform(PreprocessedDataset)
  
  
#####################################################################################################################################################################

###################################################################### Steps ########################################################################################
## 1. Import Dataset
TrainingDataset=pd.read_csv(TrainingDatasetPath)
TestDataset=pd.read_csv(TestDatasetPath)
# CSV columns 1..27 are the features described in the index table above and
# column 28 of the training file is the label (m13). Column 0 is skipped
# (presumably a loan identifier -- TODO confirm against the CSV header).
XTrain=TrainingDataset.iloc[:,1:28].values
num_of_rows_in_training_set=XTrain.shape[0]
YTrain=TrainingDataset.iloc[:,28].values
XTest=TestDataset.iloc[:,1:28].values

# Bring the two files' differing date formats onto the same month codes.
XTrain,XTest=FormatDate(XTrain,XTest)

# Train and test rows are stacked so imputation, encoding and scaling see both;
# num_of_rows_in_training_set is used later to split them apart again.
XTrainXTestConcatenated=np.concatenate((XTrain,XTest), axis=0)

## 2. Deal with Invalid Values
XTrainXTestImputed=DealWithMissingValues(XTrainXTestConcatenated)

## 3. Preprocess Data
XTrainXTestPreProcessed,DatasetPreProcessedNoOHE=PreprocessDataset(XTrainXTestImputed)

"""

0-1 source || 2-19 financial institution || 20 interest rate || 21 unpaid principal || 22 loan term || 23 prinicipal to term ratio || 24 - 25: origin month ||

26-28 pay month || 29-32 interval || 33-34 recurrence || 35 loan to value ratio || 36 num of borrowers || 37 debt to income || 38-41 borrower credit score ||

42-43 loan purpose || 44 insurance percent || 45-48 co borrower credit score || 49 insurance type || 50-61 m1 to m12 || 62 sum of defaults || 63 regular defaulter ||

64 odd defaulter || 65 last month defaulter || 66 last 2 month defaulter || 67 num of defaults in last 2 month || 68 last 3 month defaulter || 69 num of defaults in last 3 month ||

70 num_of_deliquency_till_m13 || 71 Interest Pending || 72 ProbabilityToDefaultInWhole || 73 ProbabilityToDefaultInSixMon || 74 ProbabilityToDefaultInThreeMon || 

75 EMI || 76 EMIPerBorrower || 77 UnpaidPrincipalNotInsured || 78 LoanSize || 79 Potential || 80 borrower_credit_score_number


"""
## 4. Scale Dataset
# The scaler is fitted on the stacked train+test rows.
XTrainXTestScaled=ScaleDataset(XTrainXTestPreProcessed)

## 5. Divide Dataset to Train and Test
# The first num_of_rows_in_training_set rows are the training rows; the rest are test rows.
XTrainScaled=XTrainXTestScaled[0:num_of_rows_in_training_set,:]
XTestScaled=XTrainXTestScaled[num_of_rows_in_training_set:XTrainXTestScaled.shape[0],:]

## 6. Feature Importance
"""
np.savetxt(fname="WholeTrainDataset.csv",X=XTrainScaled,fmt="%s",delimiter=',')
#After downloading give name to column names and reupload, similarly do for labels
np.savetxt(fname="TrainSetLabel.csv",X=YTrain.reshape(-1,1),fmt="%s",delimiter=',')
#After downloading give name to column name as Label and reupload
#Now read the datasets with columns back 
dataset=pd.read_csv('WholeTrainDataset.csv')
Label=pd.read_csv('TrainSetLabel.csv')
from FeatureSelector import FeatureSelector
fs = FeatureSelector(data = dataset, labels = Label)
fs.identify_zero_importance(task = 'classification', eval_metric = 'f1', n_iterations = 10, early_stopping = True)
importance=fs.feature_importances.iloc[:,0:2].values
importance
# based on the above findings features with zero importances are FinInst2,FinInst10,FinInst13 and BorCredScore4 which has indexes 3, 11, 14, 41
"""
#selected_columns=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78]
#selected_columns=[0,1,2,5,6,9,20,21,22,23,30,35,36,37,38,39,40,42,43,45,48,60,62,69,71,75,76,77]
#selected_columns=[0,1,2,4,5,6,7,8,9,10,12,13,15,16,17,18,19,23,29,30,31,32,33,34,35,37,38,39,40,42,43,45,46,47,48,49,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78]
#selected_columns=[0,1,2,5,6,7,9,13,17,18,19,23,30,34,35,37,38,39,40,42,43,45,46,48,61,62,67,69,70,71,72,73,75,76,77,78]
#selected_columns=[0,1,2,5,6,7,9,10,14,17,18,19,20,21,22,23,26,28,35,36,37,38,39,40,42,43,45,47,48,51,53,57,61,62,66,67,69,70,71,72,73,75,76,77,78]
#selected_columns=[0,1,2,5,6,7,9,10,14,17,18,19,20,21,22,23,26,35,36,37,38,39,40,42,43,45,47,48,53,62,67,69,70,71,72,73,75,76,77,78]
# Active feature subset (48 column indexes into the preprocessed matrix);
# the commented-out lists above are earlier candidates.
selected_columns=[0,1,2,5,6,7,9,10,14,17,18,19,20,21,22,23,26,28,35,36,37,38,39,40,42,43,45,47,48,51,53,55,57,61,62,66,67,69,70,71,72,73,75,76,77,78,79,80]

# Apply the same column selection to train and test.
XTrainScaled=XTrainScaled[:,selected_columns]
XTestScaled=XTestScaled[:,selected_columns]

## 7. Train / validation split
# Hold out a stratified 10% of the training rows for validation (fixed seed).
XTrainScaled_Train, XTrainScaled_Test, YTrain_Train, YTrain_Test = train_test_split(XTrainScaled,YTrain,test_size=.10,shuffle=True,stratify=YTrain,random_state=1)

## 8. Train ten networks and keep each one's thresholded test-set predictions
# Loop-invariant settings: the positive class is weighted far more heavily than
# the negative class, and the input width follows the selected columns rather
# than a hard-coded 48, so changing selected_columns no longer breaks the model.
class_weight = {0:1,1:181.48}
num_of_features=XTrainScaled_Train.shape[1]
TestSetPredictions=[]

for i in range(1,11):
  print("################################################### Starting Iteration "+str(i)+" ##################################################################")  
  
  # Three 800-unit ReLU layers, each followed by 50% dropout, and a sigmoid output.
  ClassifierANN=Sequential()
  ClassifierANN.add(Dense(input_shape=(num_of_features,),units=800,activation='relu',kernel_initializer='random_uniform'))
  ClassifierANN.add(Dropout(0.5))
  ClassifierANN.add(Dense(units=800,activation='relu',kernel_initializer='random_uniform'))
  ClassifierANN.add(Dropout(0.5))
  ClassifierANN.add(Dense(units=800,activation='relu',kernel_initializer='random_uniform'))
  ClassifierANN.add(Dropout(0.5))
  ClassifierANN.add(Dense(units=1,activation='sigmoid',kernel_initializer='random_uniform'))
  ClassifierANN.compile(optimizer='adam',loss='binary_crossentropy',metrics=[f1_m])
  # Early stopping monitors validation F1 and restores the best weights.
  ClassifierANN.fit(x=XTrainScaled_Train,y=YTrain_Train,batch_size=96,epochs=50,validation_data=(XTrainScaled_Test,YTrain_Test),callbacks=[EarlyStopping],class_weight=class_weight)
  # Threshold the sigmoid outputs at 0.5 to get hard 0/1 votes.
  YPred=ClassifierANN.predict(XTestScaled)
  YPred[YPred>0.5]=1
  YPred[YPred<=0.5]=0
  TestSetPredictions.append(YPred)
  # Clear the backend session so the next iteration starts from a fresh graph.
  K.clear_session()
  print("")
  print("################################################### Ending Iteration "+str(i)+" ##################################################################")

# Expose the ten result arrays under the names the prediction cell expects.
(TestSetPrediction1,TestSetPrediction2,TestSetPrediction3,TestSetPrediction4,TestSetPrediction5,
 TestSetPrediction6,TestSetPrediction7,TestSetPrediction8,TestSetPrediction9,TestSetPrediction10)=TestSetPredictions

In [0]:
##################################################################### Test Set Prediction ################################################################################################
# Majority vote across the ten trained networks: a test row is labelled 1 only
# when more than six of them predict 1.
# Fixes: FinalYPred was appended to without ever being created, and
# TestSetPrediction5 was missing from the vote.
AllTestSetPredictions=[TestSetPrediction1,TestSetPrediction2,TestSetPrediction3,TestSetPrediction4,TestSetPrediction5,
                       TestSetPrediction6,TestSetPrediction7,TestSetPrediction8,TestSetPrediction9,TestSetPrediction10]
FinalYPred=[]
for votes_for_row in zip(*AllTestSetPredictions):
  # np.sum collapses the ten per-model votes for this row to a single scalar.
  if np.sum(votes_for_row) > 6:
    FinalYPred.append(1)
  else:
    FinalYPred.append(0)
np.savetxt(fname="TestSetPrediction.txt",X=FinalYPred,fmt="%s")