# IMPORTS

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#Run this to install version 1.5.2 of xgboost
!pip install 'xgboost==1.5.2'

# READING DATA

In [4]:
# Load the three training tables: previous loans, loan performance (holds the
# target) and customer demographics.
train_prev, train_perf, train_demog = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/trainprevloans/trainprevloans.csv',
        './data/trainperf.csv',
        './data/traindemographics.csv',
    )
)

# FEATURE ENGINEERING AND PREPROCESSING

In [3]:
# Making new features with better insights from the previous loans data
# Parse the two date columns needed to compute the repayment delay.
train_prev['firstrepaiddate'] = pd.to_datetime(train_prev['firstrepaiddate'])
train_prev['firstduedate'] = pd.to_datetime(train_prev['firstduedate'])

# Per-customer totals over all of the customer's previous loans.
sum_loans = train_prev.groupby('customerid').loanamount.sum()
sum_dues = train_prev.groupby('customerid').totaldue.sum()

# score: (total due) / (total borrowed) of the customer, written on each of the
# customer's rows. Vectorised lookup instead of a row-wise apply(axis=1).
train_prev['score'] = train_prev['customerid'].map(sum_dues / sum_loans)
# delay: whole days between the first due date and the first repayment
# (negative when the repayment came before the due date).
train_prev['delay'] = (train_prev['firstrepaiddate'] - train_prev['firstduedate']).dt.days

# Collapse to one value per customer.
score = train_prev.groupby('customerid').score.mean()
delay = train_prev.groupby('customerid').delay.mean()

# One row per customerid with the columns 'score' and 'delay'.
sc_del = pd.concat([score, delay], axis=1)


In [4]:
#Merging the train dataframes
# Left-join the previous-loan aggregates and the demographics directly onto the
# performance table, one after the other. Joining the demographics onto sc_del
# first would discard the demographics of every customer that has no previous
# loans (they are absent from sc_del), and would differ from how the test
# frames are merged.
train = (
    train_perf.set_index('customerid')
    .join(sc_del)
    .join(train_demog.set_index('customerid'))
)
train.head()

In [5]:
#Creating an age feature
# Age is taken as 2022 minus the birth year; rows without a birthdate stay NaN.
birth_year = pd.to_datetime(train['birthdate']).map(lambda born: born.year)
train['birthdate'] = 2022 - birth_year
# The column keeps its position in the frame, only its name changes.
train = train.rename(columns={'birthdate': 'age'})

In [6]:
#Preprocessing the target feature
# Integer-encode the label: 'Bad' -> 0, every other value -> 1.
is_not_bad = train['good_bad_flag'] != 'Bad'
train['good_bad_flag'] = is_not_bad.astype('int64')

In [7]:
# Display the merged training frame (still indexed by customerid at this point).
train

In [8]:
#Visualizing the proportions of target classes
# One bar per value of the encoded target (0 = 'Bad', 1 = otherwise).
sns.countplot(data=train, x='good_bad_flag')

In [9]:
#Resetting the index
# Moves customerid out of the index and back into a regular column.
train=train.reset_index()

In [10]:
#Dropping unnecessary columns and dealing with categorical ones
# customerid is an identifier, not a feature.
train = train.drop(columns=['customerid'])

# Target vector, and the feature matrix without identifiers, dates and the target.
y = train['good_bad_flag']
X = train.drop(columns=['loannumber', 'approveddate', 'creationdate', 'systemloanid', 'good_bad_flag'])

# Cast the text columns to pandas' category dtype in a single astype call.
categorical_cols = ['bank_account_type', 'bank_name_clients', 'bank_branch_clients', 'employment_status_clients', 'level_of_education_clients', 'referredby']
X = X.astype(dict.fromkeys(categorical_cols, 'category'))

In [11]:
#param_grid={
#    'max_depth': range(2, 10, 2),
#    'n_estimators': range(26, 86, 2),
#    'learning_rate': [0.2, 0.1, 0.03,0.05,  0.01]
#}

#xg = XGBClassifier(eval_metric='error', n_jobs = -1, tree_method='gpu_hist', use_label_encoder = False, enable_categorical='True')
#gs_xg = GridSearchCV(xg, param_grid, cv = 5, n_jobs = -1, verbose = 1)
#gs_xg.fit(X, y)

#xg_best = gs_xg.best_estimator_
#gs_xg.best_params_, gs_xg.score(X, y)

In [16]:
# Final classifier with fixed hyper-parameters (each value lies inside the
# commented-out search grid above).
xgb_params = dict(
    eval_metric='error',
    learning_rate=0.05,
    n_estimators=58,
    tree_method='gpu_hist',
    max_depth=2,
    use_label_encoder=False,
    enable_categorical=True,
)
xg_best = XGBClassifier(**xgb_params)
# Fit on the full training set; the fitted estimator is the cell's output.
xg_best.fit(X, y)

# Preprocessing test data and preparing submission

In [17]:
# Load the submission template and the three test tables (previous loans,
# loan performance, demographics).
submission, test_prev, test_perf, test_demog = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/SampleSubmission.csv',
        './data/testprevloans/testprevloans.csv',
        './data/testperf.csv',
        './data/testdemographics.csv',
    )
)

In [18]:
# Build the per-customer 'score' and 'delay' features from the test customers'
# previous loans.
# Parse the two date columns needed to compute the repayment delay.
test_prev['firstrepaiddate'] = pd.to_datetime(test_prev['firstrepaiddate'])
test_prev['firstduedate'] = pd.to_datetime(test_prev['firstduedate'])

# Per-customer totals over all of the customer's previous loans.
sum_loans = test_prev.groupby('customerid').loanamount.sum()
sum_dues = test_prev.groupby('customerid').totaldue.sum()

# score: (total due) / (total borrowed) of the customer, written on each of the
# customer's rows. Vectorised lookup instead of a row-wise apply(axis=1).
test_prev['score'] = test_prev['customerid'].map(sum_dues / sum_loans)
# delay: whole days between the first due date and the first repayment
# (negative when the repayment came before the due date).
test_prev['delay'] = (test_prev['firstrepaiddate'] - test_prev['firstduedate']).dt.days

# Collapse to one value per customer.
score = test_prev.groupby('customerid').score.mean()
delay = test_prev.groupby('customerid').delay.mean()

# One row per customerid with the columns 'score' and 'delay'.
sc_del = pd.concat([score, delay], axis=1)

In [19]:
# Merge the test frames: start from the performance table and left-join the
# previous-loan aggregates, then the demographics, all keyed on customerid.
test_perf_by_customer = test_perf.set_index('customerid')
test_demog_by_customer = test_demog.set_index('customerid')
test = test_perf_by_customer.join(sc_del).join(test_demog_by_customer)

In [20]:
# Age feature: 2022 minus the birth year; rows without a birthdate stay NaN.
birth_year = pd.to_datetime(test['birthdate']).map(lambda born: born.year)
test['birthdate'] = 2022 - birth_year
# The column keeps its position in the frame, only its name changes.
test = test.rename(columns={'birthdate': 'age'})

In [21]:
# Move customerid out of the index and back into a regular column.
test=test.reset_index()

In [22]:
# Drop the identifier/date columns so X_test has the same feature columns as X.
test = test.drop(['customerid'], axis=1)
X_test = test.drop(['loannumber', 'approveddate', 'creationdate', 'systemloanid'], axis=1)
for c in ['bank_account_type', 'bank_name_clients', 'bank_branch_clients', 'employment_status_clients', 'level_of_education_clients', 'referredby']:
    # Reuse the category set of the training features instead of deriving a new
    # one from the test values: the model consumes the integer category codes,
    # so a label must get the same code here as it had when the model was fit.
    # Labels that never occurred in training become NaN (missing).
    X_test[c] = pd.Categorical(X_test[c], categories=X[c].cat.categories)

In [23]:
# Predicted class per test row, using the training target encoding
# (0 = 'Bad', 1 = any other label).
pred=xg_best.predict(X_test)

In [24]:
# Store the predictions in the sample-submission frame and write it to disk
# without the index column.
# NOTE(review): assumes the rows of SampleSubmission.csv are in the same order
# as the rows of testperf.csv, which is the order pred follows — confirm.
submission['Good_Bad_flag'] = pred
submission.to_csv('sub15.csv', index = False)