In [None]:
import numpy as np
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score,make_scorer
from sklearn.decomposition import PCA
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
np.random.seed(25)
import os
import warnings 
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the raw tables: the train/test agreement lists are CSV files, the
# customer, LMS and RF tables are Excel workbooks.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_foreclosure.csv',
                     '../input/test_foreclosure.csv')
)
cust_31JAN, lms_31JAN, RF_Final = (
    pd.read_excel(xlsx_path)
    for xlsx_path in ('../input/Customers_31JAN2019.xlsx',
                      '../input/LMS_31JAN2019.xlsx',
                      '../input/RF_Final_Data.xlsx')
)

In [None]:
# Preview the first rows of the raw training table.
train.head()

In [None]:
# Label vector used by every model fit below. It is taken from the raw `train`
# frame, while the features come from `train_df` (train merged with LMS), so the
# two only line up if that merge keeps every training row in its original order.
target = train['FORECLOSURE']

In [None]:
# Preview the first rows of the customer table.
cust_31JAN.head()

In [None]:
# (rows, columns) of the customer table.
cust_31JAN.shape

In [None]:
# Missing-value count per customer column; these drive the imputation further down.
cust_31JAN.isnull().sum()

In [None]:
# Column dtypes of the customer table.
cust_31JAN.dtypes

In [None]:
# Bar chart of how many customers fall in each SEX category.
# The column is passed as `x=` explicitly: newer seaborn releases bind the first
# positional argument to `data`, which no longer yields per-category counts.
sns.countplot(x=cust_31JAN['SEX']);

In [None]:
# Impute missing customer attributes: AGE with the column mean, the remaining
# columns with their most frequent value. The filled Series is assigned back to
# the frame instead of calling fillna(inplace=True) on a column selection, which
# does not modify the parent frame when pandas Copy-on-Write is active.
cust_31JAN['AGE'] = cust_31JAN['AGE'].fillna(cust_31JAN['AGE'].mean())
for col in ['SEX', 'MARITAL_STATUS', 'QUALIFICATION', 'NO_OF_DEPENDENT', 'BRANCH_PINCODE']:
    # mode() returns the modes sorted; [0] picks the first one.
    cust_31JAN[col] = cust_31JAN[col].fillna(cust_31JAN[col].mode()[0])

from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes (the encoder is refit per column).
lb_make = LabelEncoder()
for col in ["SEX", "MARITAL_STATUS", "QUALIFICATION"]:
    cust_31JAN[col] = lb_make.fit_transform(cust_31JAN[col])

# Drop the employment columns; rebinding the name avoids mutating the frame in place.
cust_31JAN = cust_31JAN.drop(['PROFESSION', 'OCCUPATION', 'POSITION', 'PRE_JOBYEARS'], axis=1)

In [None]:
# Preview the first rows of the RF table.
RF_Final.head()

In [None]:
# Rename the masked key columns to the CUSTOMERID / AGREEMENTID names and
# convert the index labels to strings.
RF_Final = RF_Final.rename(
    index=str,
    columns={"Masked_CustomerID": "CUSTOMERID", "Masked_AgreementID": "AGREEMENTID"},
)

In [None]:
# Missing-value count per RF column.
RF_Final.isnull().sum()

In [None]:
# Preview the first rows of the LMS table.
lms_31JAN.head()

In [None]:
# (rows, columns) of the LMS table before de-duplication.
lms_31JAN.shape

In [None]:
# Keep one LMS row per agreement: order by LAST_RECEIPT_DATE descending, then
# retain the first row seen for each AGREEMENTID.
lms_31JAN = (
    lms_31JAN
    .sort_values(by='LAST_RECEIPT_DATE', ascending=False)
    .drop_duplicates(subset=['AGREEMENTID'], keep='first')
)

In [None]:
# Attach the LMS attributes to every training agreement.
# `validate` guards against duplicate AGREEMENTID rows on the LMS side, and the
# length check catches agreements with no LMS match: an inner merge would drop
# them silently and the features would no longer line up row-for-row with the
# labels in `target`, which are taken from `train`.
train_df = pd.merge(train, lms_31JAN, how='inner', on='AGREEMENTID', validate='many_to_one')
if len(train_df) != len(train):
    raise ValueError(
        'train/LMS merge changed the row count from {} to {}; '
        'features would be misaligned with the target'.format(len(train), len(train_df))
    )

In [None]:
# Attach the LMS attributes to every test agreement.
# `validate` guards against duplicate AGREEMENTID rows on the LMS side, and the
# length check catches agreements with no LMS match: an inner merge would drop
# them silently and the predictions would no longer line up row-for-row with the
# AGREEMENTIDs of `test` used for the submission.
test_df = pd.merge(test, lms_31JAN, how='inner', on='AGREEMENTID', validate='many_to_one')
if len(test_df) != len(test):
    raise ValueError(
        'test/LMS merge changed the row count from {} to {}; '
        'predictions would be misaligned with the submission IDs'.format(len(test), len(test_df))
    )

In [None]:
# Missing-value count per column of the merged training frame.
train_df.isnull().sum()

In [None]:
# Preview the first rows of the merged training frame.
train_df.head()

In [None]:
# Column dtypes of the merged training frame; the log-transform cell further
# down selects columns by the float64 dtype shown here.
train_df.dtypes

In [None]:
# Correlation matrix of the numeric / boolean columns of the merged training frame.
# The columns are selected explicitly because DataFrame.corr() raises on string
# and datetime columns in recent pandas instead of dropping them silently.
corrmat = train_df.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, ax=ax);

In [None]:
# Correlation of each column in corrmat with the FORECLOSURE label, sorted by value.
corrmat['FORECLOSURE'].sort_values()

In [None]:
# Class balance of the FORECLOSURE label in the merged training frame.
# The column is passed as `x=` explicitly: newer seaborn releases bind the first
# positional argument to `data`, which no longer yields per-class counts.
sns.countplot(x=train_df['FORECLOSURE']);

In [None]:
# Density of ORIGNAL_TENOR on the raw scale and after log(x + 1), overlaid for comparison.
tenor_raw = train_df['ORIGNAL_TENOR']
tenor_logged = np.log(tenor_raw + 1)
sns.kdeplot(tenor_raw, label='non-scaled', shade=True)
sns.kdeplot(tenor_logged, label='SCALED', shade=True)
plt.legend();

In [None]:
# Density of the ROI change (original minus current) on the raw scale and after
# log(x + 1), overlaid for comparison.
roi_change = train_df['ORIGNAL_ROI'] - train_df['CURRENT_ROI']
roi_change_logged = np.log(roi_change + 1)
sns.kdeplot(roi_change, label='non-scaled', shade=True)
sns.kdeplot(roi_change_logged, label='SCALED', shade=True)
plt.legend();

In [None]:
def _add_loan_features(frame, receipt_fill):
    """Return a copy of `frame` with the derived loan features and imputed columns.

    Adds:
      diff_LOAN_AMT    LOAN_AMT - NET_DISBURSED_AMT
      diff_tenor       ORIGNAL_TENOR - CURRENT_TENOR
      diff_ROI         ORIGNAL_ROI - CURRENT_ROI
      total_PRINCIPAL  OUTSTANDING_PRINCIPAL + PAID_PRINCIPAL
    and fills missing SCHEMEID with the sentinel 999 and missing
    LAST_RECEIPT_AMOUNT with `receipt_fill`.

    The result is built with assign() rather than fillna(inplace=True) on a
    column selection, which does not update the parent frame when pandas
    Copy-on-Write is active.
    """
    return frame.assign(
        diff_LOAN_AMT=frame['LOAN_AMT'] - frame['NET_DISBURSED_AMT'],
        diff_tenor=frame['ORIGNAL_TENOR'] - frame['CURRENT_TENOR'],
        diff_ROI=frame['ORIGNAL_ROI'] - frame['CURRENT_ROI'],
        total_PRINCIPAL=frame['OUTSTANDING_PRINCIPAL'] + frame['PAID_PRINCIPAL'],
        SCHEMEID=frame['SCHEMEID'].fillna(999),
        LAST_RECEIPT_AMOUNT=frame['LAST_RECEIPT_AMOUNT'].fillna(receipt_fill),
    )


# The imputation value is computed on the training rows only and reused for the
# test rows, so both frames are filled with the same statistic.
receipt_mean = train_df['LAST_RECEIPT_AMOUNT'].mean()
train_df = _add_loan_features(train_df, receipt_mean)
test_df = _add_loan_features(test_df, receipt_mean)

In [None]:
# Date Features
import datetime as dt

# Break each date column into <PREFIX>_DAY / _MONTH / _YEAR columns on both the
# training and the test frame.
for prefix, date_col in (('AUTH', 'AUTHORIZATIONDATE'),
                         ('INTEREST', 'INTEREST_START_DATE'),
                         ('LAST_RECEIPT', 'LAST_RECEIPT_DATE')):
    for part in ('day', 'month', 'year'):
        feature = '{}_{}'.format(prefix, part.upper())
        for frame in (train_df, test_df):
            # `part=part` binds the attribute name at definition time.
            frame[feature] = frame[date_col].map(lambda stamp, part=part: getattr(stamp, part))

In [None]:
def _signed_log1p(values):
    """Return sign(x) * log(1 + |x|) element-wise.

    Equals log(x + 1) for x >= 0. For negative inputs it stays finite and
    order-preserving, whereas log(x + 1) yields NaN (x < -1) or -inf (x == -1).
    """
    return np.sign(values) * np.log1p(np.abs(values))


# Log-compress every float64 column of the training frame and the matching
# column of the test frame. The derived diff_* features are differences and can
# be negative, hence the signed variant. The label column is left untouched, and
# columns that exist only in the training frame are skipped on the test side.
# NOTE: the transform is not idempotent; re-running this cell applies it again.
for col in train_df.columns:
    if train_df[col].dtype != 'float64' or col == 'FORECLOSURE':
        continue
    train_df[col] = _signed_log1p(train_df[col])
    if col in test_df.columns:
        test_df[col] = _signed_log1p(test_df[col])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode CITY and PRODUCT with ONE mapping per column shared by train and
# test. Fitting the encoder separately on each frame assigns codes by the sorted
# values present in that frame only, so the same city/product could receive
# different integers in train and test and the models would see unrelated features.
for col in ("CITY", "PRODUCT"):
    lb_make = LabelEncoder()
    lb_make.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = lb_make.transform(train_df[col])
    test_df[col] = lb_make.transform(test_df[col])

In [None]:
# Model inputs: every column of the merged training frame except the identifiers,
# the label, the two NPA flags and the raw date columns.
feature_names = [
    column
    for column in train_df.columns
    if column not in (
        'AGREEMENTID',
        'FORECLOSURE',
        'NPA_IN_LAST_MONTH',
        'NPA_IN_CURRENT_MONTH',
        'LAST_RECEIPT_DATE',
        'CUSTOMERID',
        'INTEREST_START_DATE',
        'AUTHORIZATIONDATE',
    )
]

In [None]:
# Ensemble member 1: CatBoost, depth 10, 10000 iterations, trained on the GPU.
model = CatBoostClassifier(
    iterations=10000,
    eval_metric='AUC',
    random_seed=42,
    verbose=False,
    max_depth=10,
    learning_rate=0.01,
    task_type='GPU',
)
# Fit on the full training frame, then score the test frame (one probability column per class).
model.fit(train_df[feature_names], target)
pred1 = model.predict_proba(test_df[feature_names])

In [None]:
# Ensemble member 2: LightGBM, depth 10, 10000 trees, all CPU cores.
model = lgb.LGBMClassifier(
    n_estimators=10000,
    n_jobs=-1,
    max_depth=10,
    learning_rate=0.01,
)
# Fit on the full training frame, then score the test frame (one probability column per class).
model.fit(train_df[feature_names], target)
pred2 = model.predict_proba(test_df[feature_names])

In [None]:
# Ensemble member 3: XGBoost, depth 10, 10000 trees, all CPU cores.
model = xgb.XGBClassifier(
    n_estimators=10000,
    n_jobs=-1,
    max_depth=10,
    learning_rate=0.01,
)
# Fit on the full training frame, then score the test frame (one probability column per class).
model.fit(train_df[feature_names], target)
pred3 = model.predict_proba(test_df[feature_names])

In [None]:
# Ensemble member 4: CatBoost, depth 12, 9000 iterations, trained on the GPU.
model = CatBoostClassifier(
    iterations=9000,
    eval_metric='AUC',
    random_seed=42,
    verbose=False,
    max_depth=12,
    learning_rate=0.01,
    task_type='GPU',
)
# Fit on the full training frame, then score the test frame (one probability column per class).
model.fit(train_df[feature_names], target)
pred4 = model.predict_proba(test_df[feature_names])

In [None]:
# Ensemble member 5: CatBoost, depth 8, 8000 iterations, trained on the GPU.
model = CatBoostClassifier(
    iterations=8000,
    eval_metric='AUC',
    random_seed=42,
    verbose=False,
    max_depth=8,
    learning_rate=0.01,
    task_type='GPU',
)
# Fit on the full training frame, then score the test frame (one probability column per class).
model.fit(train_df[feature_names], target)
pred5 = model.predict_proba(test_df[feature_names])

In [None]:
# Blend the five models with an unweighted mean of column 1 of each
# predict_proba output, computed row-wise in one vectorised expression and
# stored as a list with one entry per test row.
pred = list(
    (pred1[:, 1] + pred2[:, 1] + pred3[:, 1] + pred4[:, 1] + pred5[:, 1]) / 5
)

In [None]:
## make submission
# The AGREEMENTIDs are taken from test_df, the frame the predictions were
# computed on, so each ID is paired with its own score even if the merge that
# built test_df changed the row set relative to `test`.
if len(pred) != len(test_df):
    raise ValueError(
        'got {} predictions for {} test rows'.format(len(pred), len(test_df))
    )
sub = pd.DataFrame({
    'AGREEMENTID': test_df['AGREEMENTID'].values,
    'FORECLOSURE': pred,
})
sub.to_csv('result.csv', index=False)

In [None]:
# Sanity-check the first rows of the submission that was written to result.csv.
sub.head()