In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Apply the seaborn "whitegrid" look to every Matplotlib figure in the notebook.
# NOTE(review): newer Matplotlib releases renamed the seaborn styles to
# 'seaborn-v0_8-*' — confirm the installed version still accepts this name.
plt.style.use('seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Never truncate cell contents when displaying frames. `None` means "no limit";
# the former sentinel -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

import warnings
# Silence all warnings from here on (this also hides deprecation warnings).
warnings.simplefilter('ignore')

In [3]:
# Lift the pandas display limits so wide/long frames are shown in full.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

In [4]:
# Competition inputs: labelled training set, unlabelled test set and the
# submission template, read in that order.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/delloite-machinehack/train.csv',
        '/kaggle/input/delloite-machinehack/test.csv',
        '/kaggle/input/delloite-machinehack/submission.csv',
    )
)

In [5]:
# Feature-importance file produced after generating a large number of derived
# features (ranked with LGBM feature importance).
fets = pd.read_csv(filepath_or_buffer='/kaggle/input/impftds/importantfrs.csv')

In [6]:
# Stack train and test (train rows first) so preprocessing and feature
# engineering are applied to both at once; the index is renumbered from 0.
df = pd.concat((train, test), axis=0, ignore_index=True)
df.shape

In [7]:
# Columns stored with object dtype, i.e. the categorical candidates.
cat_cols = df.select_dtypes(include='object').columns
cat_cols

In [8]:
# Identifier and label columns; neither is used as a model input.
ID_COL = 'ID'
TARGET_COL = 'Loan Status'

# Every other column is a candidate feature (original column order is kept).
features = [col for col in df.columns if col != ID_COL and col != TARGET_COL]

# Categorical columns, listed explicitly (this replaces the dtype-derived list).
cat_cols = [
    'Batch Enrolled',
    'Grade',
    'Sub Grade',
    'Employment Duration',
    'Verification Status',
    'Payment Plan',
    'Loan Title',
    'Initial List Status',
    'Application Type',
]

# Whatever is not categorical is treated as numerical.
num_cols = [col for col in features if col not in cat_cols]

In [9]:
# Inspect the list of columns treated as numerical.
num_cols

**Binning for numerical columns**

In [10]:
# Equal-width binning (pd.cut) of selected numerical columns.
# Maps each new feature name to (source column, number of bins); the insertion
# order is the order in which the columns are appended to `df`.
# A duplicated "revuti_bin_5" assignment in the earlier version was dropped.
bin_specs = {
    "loan_bin_10": ("Loan Amount", 10),
    "fund_bin_10": ("Funded Amount", 10),
    "fundinv_bin_10": ("Funded Amount Investor", 10),
    "inter_bin_5": ("Interest Rate", 5),
    "home_bin_10": ("Home Ownership", 10),
    "debit_bin_5": ("Debit to Income", 5),
    "rev_bin_5": ("Revolving Balance", 5),
    "revuti_bin_5": ("Revolving Utilities", 5),
    "totint_bin_5": ("Total Received Interest", 5),
    "totlatfe_bin_5": ("Total Received Late Fee", 5),
    "recov_bin_5": ("Recoveries", 5),
    "collrecov_bin_5": ("Collection Recovery Fee", 5),
    "tocam_bin_5": ("Total Collection Amount", 5),
    "tocbal_bin_5": ("Total Current Balance", 5),
    "tocredlim_bin_5": ("Total Revolving Credit Limit", 5),
}

for bin_col, (src_col, n_bins) in bin_specs.items():
    # labels=False stores the integer bin index instead of an Interval label.
    df[bin_col] = pd.cut(df[src_col], bins=n_bins, labels=False)

**Generating arithmetic features from numerical features**

In [11]:
# Set 1: sum and pairwise ratios of loan amount, funded amount and
# investor-funded amount.
df = df.assign(
    l_f_fin_sum=df['Loan Amount'] + df['Funded Amount'] + df['Funded Amount Investor'],
    loan_fun_div=df['Loan Amount'] / df['Funded Amount'],
    loan_funinve_div=df['Loan Amount'] / df['Funded Amount Investor'],
    fund_funinv_div=df['Funded Amount'] / df['Funded Amount Investor'],
)

In [12]:
# Set 2: interactions between interest rate, debit-to-income and home ownership.
df = df.assign(
    inte_home_div=df['Home Ownership'] / df['Interest Rate'],
    inte_debit_sum=df['Interest Rate'] + df['Debit to Income'],
    inte_debit_divi=df['Interest Rate'] / df['Debit to Income'],
    inte_debit_mult=df['Interest Rate'] * df['Debit to Income'],
    Home_debit_divi=df['Home Ownership'] / df['Debit to Income'],
)

In [13]:
# Set 3: sums over delinquencies, inquiries, open accounts and public records
# (all four together, then selected pairs).
df = df.assign(
    dl_in_op_pb=(
        df['Delinquency - two years']
        + df['Inquires - six months']
        + df['Open Account']
        + df['Public Record']
    ),
    dl_in_sum=df['Delinquency - two years'] + df['Inquires - six months'],
    dl_op_sum=df['Delinquency - two years'] + df['Open Account'],
    dl_pb_sum=df['Delinquency - two years'] + df['Public Record'],
    in_op_sum=df['Inquires - six months'] + df['Open Account'],
    op_pb_sum=df['Open Account'] + df['Public Record'],
)

In [14]:
# Set 4: revolving balance combined with revolving utilities.
# (The ratio feature 'revb_ut_div' = balance / utilities is intentionally left out.)
df = df.assign(
    revb_ut_sum=df['Revolving Balance'] + df['Revolving Utilities'],
    revb_ut_mult=df['Revolving Balance'] * df['Revolving Utilities'],
    revb_ut_sub=df['Revolving Balance'] - df['Revolving Utilities'],
)

In [15]:
# Set 5: open accounts versus total accounts (sum, ratio, product).
df = df.assign(
    opac_totac_sum=df['Open Account'] + df['Total Accounts'],
    opac_totac_div=df['Open Account'] / df['Total Accounts'],
    opac_totac_mult=df['Open Account'] * df['Total Accounts'],
)

In [16]:
# Set 6: received interest and late fees, in total and per account.
df = df.assign(
    to_rec_late=(
        df['Total Accounts']
        + df['Total Received Interest']
        + df['Total Received Late Fee']
    ),
    to_rec=df['Total Received Interest'] / df['Total Accounts'],
    to_latefee=df['Total Received Late Fee'] / df['Total Accounts'],
)

In [17]:
# Set 7: recoveries combined with the collection recovery fee.
# (The ratio feature 'reco_colle_divi' = recoveries / fee is intentionally left out.)
df = df.assign(
    reco_colle_sum=df['Recoveries'] + df['Collection Recovery Fee'],
    reco_colle_mult=df['Recoveries'] * df['Collection Recovery Fee'],
)

In [18]:
# Set 8: last week pay, total collection amount, current balance and
# revolving credit limit.
# (The ratio feature 'last_total_divi' = last week pay / collection amount is
# intentionally left out.)
df = df.assign(
    lastweek_sum=(
        df['Last week Pay']
        + df['Total Collection Amount']
        + df['Total Current Balance']
        + df['Total Revolving Credit Limit']
    ),
    last_total_mult=df['Last week Pay'] * df['Total Collection Amount'],
    totalc_limit=df['Total Current Balance'] + df['Total Revolving Credit Limit'],
    totalc_limit_div=df['Total Current Balance'] / df['Total Revolving Credit Limit'],
)

**Augmenting the numerical features with PCA components and a two-group cluster label**

In [19]:
from sklearn import decomposition

Principal component analysis on numerical columns

In [20]:
%%time

# Project the original numerical columns onto 12 principal components and
# store each component as its own `pca_<k>` column (k starts at 1).
pca = decomposition.IncrementalPCA(n_components=12)
emb = pca.fit_transform(df[num_cols].values)
for i, component in enumerate(emb.T):
    df[f'pca_{i+1}'] = component

print(df.shape)

In [21]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [22]:
# Two-cluster KMeans on the numerical columns; the cluster label is stored as
# a new feature.
# NOTE(review): the values are passed as-is (no scaling step in this cell) —
# confirm that clustering on unscaled data is intended.
kmeans = KMeans(n_clusters=2, n_init=50, random_state=0)
cluster_labels = kmeans.fit_predict(df[num_cols].values)
df["Cluster"] = cluster_labels

**Combination of Categorical features**

In [23]:
# Pairwise combinations of categorical columns: the two source values are
# concatenated into one new categorical feature.
combo_specs = {
    'Batch_empduration': ('Batch Enrolled', 'Employment Duration'),
    'Grade_subgrade': ('Grade', 'Sub Grade'),
    'status_plan': ('Verification Status', 'Payment Plan'),
    'plan_loantitle': ('Payment Plan', 'Loan Title'),
    'inital_aptype': ('Initial List Status', 'Application Type'),
    'verify_initial': ('Verification Status', 'Initial List Status'),
}

for combo_col, (left_col, right_col) in combo_specs.items():
    df[combo_col] = df[left_col] + df[right_col]

In [24]:
# Names of the combined categorical columns, kept in a list for later steps.
new_cat_cols = [
    'Batch_empduration',
    'Grade_subgrade',
    'status_plan',
    'plan_loantitle',
    'inital_aptype',
    'verify_initial',
]

***Loading libraries***

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor

import gc
from tqdm import *

In [26]:
# 'Term' and 'Accounts Delinquent' are treated as categorical features rather
# than as discrete numeric values.
cat_ord_cols = [
    'Term',
    'Accounts Delinquent',
]

In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

**Aggregation features for categorical and numerical features**

1. It will take 15 minutes to get new features

In [28]:
# Grouping keys: the original categorical columns plus their pairwise combinations.
cat_fts =['Batch Enrolled', 'Grade', 'Sub Grade', 'Employment Duration',
       'Verification Status', 'Payment Plan', 'Loan Title',
       'Initial List Status', 'Application Type',
          'Batch_empduration','Grade_subgrade',
          'status_plan','plan_loantitle','inital_aptype','verify_initial']
# Numerical columns that are summarised within each group.
num_fts_2=['Loan Amount', 'Funded Amount', 'Open Account',
       'Revolving Balance', 'Total Accounts', 'Collection 12 months Medical',
       'Last week Pay', 'Total Collection Amount',
       'Total Current Balance', 'Total Revolving Credit Limit','Funded Amount Investor', 'Interest Rate', 'Home Ownership',
       'Debit to Income', 'Revolving Utilities', 'Total Received Interest',
       'Total Received Late Fee', 'Recoveries', 'Collection Recovery Fee','Public Record']

# Per-group statistics, in the order their columns are appended to `df`.
agg_stats = ['mean', 'std', 'min', 'max']

for g in tqdm_notebook(cat_fts):

  num_fts = [c for c in num_fts_2 if c != g]

  # One groupby pass computes all statistics; the result has (column, stat)
  # MultiIndex columns. Earlier this was four groupby + four merge rounds per
  # key, each merge copying the ever-widening frame.
  grp = df.groupby(g)[num_fts].agg(agg_stats)

  # Flatten to the '<column>_grpd_by_<key>_<stat>' naming scheme.
  grp.columns = [f'{c}_grpd_by_{g}_{stat}' for c, stat in grp.columns]

  # Reorder stat-major (all means, then all stds, ...) so the appended columns
  # come out in the same order as with one merge per statistic.
  grp = grp[[f'{c}_grpd_by_{g}_{stat}' for stat in agg_stats for c in num_fts]]

  # Single left merge per key; row order of `df` is preserved.
  df = pd.merge(df, grp, on = g, how = 'left')

In [29]:
# Frame size after the group-wise aggregation features were merged in.
df.shape

Concatenating all categorical feature lists

In [30]:
# Every column to be one-hot encoded: original categoricals, combined
# categoricals, and the discrete columns treated as categorical.
fts = [*cat_cols, *new_cat_cols, *cat_ord_cols]

**Dummy encoding for categorical features**

In [31]:
# One-hot encode the listed categorical columns; all other columns pass through.
df = pd.get_dummies(data=df, columns=fts)

*Dividing train and test data*

In [32]:
# The first len(train) rows of the combined frame are the training rows, the
# rest are the test rows; both parts get a fresh 0-based index.
n_train_rows = train.shape[0]
train1 = df.iloc[:n_train_rows].reset_index(drop=True)
test1 = df.iloc[n_train_rows:].reset_index(drop=True)

In [33]:
# An LGBM model was first built on ~2.1k features; its feature importance was
# then used to keep only the most useful ones, stored in this file.
fets = pd.read_csv(filepath_or_buffer='/kaggle/input/impftds/importantfrs.csv')

In [34]:
# Selected feature names, as a plain Python list.
features = fets['imp_fts'].tolist()

In [35]:
# Number of selected features.
len(features)

In [36]:
# Training labels.
y = train1.loc[:, 'Loan Status']

**Hyperparameter tuning using optuna**

In [37]:
import optuna

In [43]:
def objective(trial,data=train1[features],target=y):
    """Optuna objective: hold-out log loss of one sampled LGBMClassifier setup.

    The data is split 80/20 with a fixed seed (42). The model is fit on the
    larger part with early stopping (100 rounds) monitored on the smaller
    part, and the log loss of its positive-class probabilities on that same
    smaller part is returned.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters.
    data : feature frame; defaults to ``train1[features]`` as evaluated when
        this function is defined.
    target : labels aligned with ``data``; defaults to ``y``.

    Returns
    -------
    The hold-out log loss (lower is better).
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # Fixed settings are interleaved with sampled ones; the suggest_* calls are
    # issued in a fixed order so the sampling sequence stays the same.
    lgbm_kwargs = {'device': 'gpu'}  # train on the GPU to speed up each trial
    lgbm_kwargs['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-3, 10.0)
    lgbm_kwargs['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-3, 10.0)
    lgbm_kwargs['colsample_bytree'] = trial.suggest_categorical(
        'colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]
    )
    # NOTE(review): presumably `subsample` only takes effect together with a
    # bagging frequency — confirm against the LightGBM parameter docs.
    lgbm_kwargs['subsample'] = trial.suggest_categorical(
        'subsample', [0.4,0.5,0.6,0.7,0.8,1.0]
    )
    lgbm_kwargs['learning_rate'] = trial.suggest_categorical(
        'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]
    )
    lgbm_kwargs['n_estimators'] = 1000
    lgbm_kwargs['num_leaves'] = trial.suggest_int('num_leaves', 34, 300)
    lgbm_kwargs['min_child_samples'] = trial.suggest_int('min_child_samples', 100, 500)
    lgbm_kwargs['max_depth'] = trial.suggest_categorical(
        'max_depth', [5,7,9,11,13,15,17,20]
    )
    lgbm_kwargs['random_state'] = trial.suggest_categorical('random_state', [24, 48,2020])
    lgbm_kwargs['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 300)

    booster = LGBMClassifier(**lgbm_kwargs)
    booster.fit(
        X_fit,
        y_fit,
        eval_set=[(X_hold, y_hold)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Score the probability of the positive class on the hold-out part.
    hold_proba = booster.predict_proba(X_hold)[:, 1]
    return log_loss(y_hold, hold_proba)

In [44]:
# Search for the configuration with the lowest hold-out log loss (20 trials).
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=20)

finished_trials = len(study.trials)
best_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', best_params)

In [45]:
# Hyperparameters used for the final LightGBM model, hard-coded here
# (sampled values plus the fixed n_estimators / device settings).
Best_trial = {
    'reg_lambda': 0.23300679680038147,
    'reg_alpha': 0.13109870110522936,
    'colsample_bytree': 0.3,
    'subsample': 0.5,
    'learning_rate': 0.016,
    'num_leaves': 177,
    'min_child_samples': 244,
    'max_depth': 7,
    'random_state': 2020,
    'min_child_weight': 117,
    'n_estimators': 1000,
    'device': 'gpu',
}

In [46]:
# Extra keyword arguments intended for the estimator's fit() call.
fit_params = dict(verbose=0, early_stopping_rounds=100)

In [47]:
def run_clf_kfold(clf,fit_params, train, test, features):
  """Run 5-fold stratified cross-validation and average the test predictions.

  For every fold a StandardScaler is fit on the training part only, ``clf`` is
  refit on the scaled training part, and positive-class probabilities are
  predicted for the validation part and for ``test``.

  Parameters
  ----------
  clf : estimator with ``fit`` and ``predict_proba``; the same object is
      refit in every fold.
  fit_params : dict or None
      Extra keyword arguments forwarded to ``clf.fit``. When it contains
      ``'early_stopping_rounds'`` and no ``'eval_set'``, the fold's validation
      split is supplied as ``eval_set`` so early stopping has data to monitor.
  train : DataFrame holding ``features`` and the ``TARGET_COL`` column.
  test : DataFrame holding ``features``.
  features : list of column names used as model inputs.

  Returns
  -------
  (oofs, preds) : out-of-fold probabilities for ``train`` and the fold-averaged
      probabilities for ``test``.
  """
  N_SPLITS = 5

  # Labels are read from the frame that was passed in, not from a notebook global.
  target = train[TARGET_COL]

  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ############# Get train, validation and test sets along with targets ################

    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]

    ############# Scaling Data ################
    # Fit on the training part only so validation/test statistics do not leak in.
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)


    ############# Fitting and Predicting ################

    # Previously fit_params was accepted but never used, so the requested
    # early stopping was silently ignored.
    fit_kwargs = dict(fit_params or {})
    if 'early_stopping_rounds' in fit_kwargs and 'eval_set' not in fit_kwargs:
      fit_kwargs['eval_set'] = [(X_val, y_val)]

    _ = clf.fit(X_trn, y_trn, **fit_kwargs)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
    preds_val = clf.predict_proba(X_val)[:, 1]
    preds_test = clf.predict_proba(X_test)[:, 1]

    fold_score = log_loss(y_val, preds_val)
    print(f'\nloss score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = log_loss(target, oofs)
  print(f'\n\nloss score for oofs is {oofs_score}')

  return oofs, preds

In [48]:
# LightGBM classifier configured with the hard-coded best hyperparameters.
lgbm = LGBMClassifier(**Best_trial)

In [49]:
# Cross-validated LightGBM run: out-of-fold and averaged test probabilities.
lgb_oofs, lgb_preds = run_clf_kfold(
    clf=lgbm,
    fit_params=fit_params,
    train=train1,
    test=test1,
    features=features,
)

**Predictions using Gradient Boosting (takes around 40 minutes to produce predictions)**

In [None]:
# Write the LightGBM test probabilities into the submission template and save it.
ss['Loan Status'] = lgb_preds
ss.to_csv(path_or_buf='/kaggle/working/lgbmfinl.csv', index=False)

In [50]:
from sklearn.ensemble import GradientBoostingClassifier
# scikit-learn gradient boosting classifier with its default settings.
grade = GradientBoostingClassifier()

In [51]:
def run_clf_kfold(clf, train, test, features):
  """Run 5-fold stratified cross-validation and average the test predictions.

  For every fold a StandardScaler is fit on the training part only, ``clf`` is
  refit on the scaled training part, and positive-class probabilities are
  predicted for the validation part and for ``test``.

  Parameters
  ----------
  clf : estimator with ``fit`` and ``predict_proba``; the same object is
      refit in every fold.
  train : DataFrame holding ``features`` and the ``TARGET_COL`` column.
  test : DataFrame holding ``features``.
  features : list of column names used as model inputs.

  Returns
  -------
  (oofs, preds) : out-of-fold probabilities for ``train`` and the fold-averaged
      probabilities for ``test``.
  """
  N_SPLITS = 5

  # Labels are read from the frame that was passed in, not from a notebook global.
  target = train[TARGET_COL]

  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ############# Get train, validation and test sets along with targets ################

    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]

    ############# Scaling Data ################
    # Fit on the training part only so validation/test statistics do not leak in.
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)


    ############# Fitting and Predicting ################

    _ = clf.fit(X_trn, y_trn)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
    preds_val = clf.predict_proba(X_val)[:, 1]
    preds_test = clf.predict_proba(X_test)[:, 1]

    fold_score = log_loss(y_val, preds_val)
    print(f'\nloss score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = log_loss(target, oofs)
  print(f'\n\nloss score for oofs is {oofs_score}')

  return oofs, preds

In [52]:
# Cross-validated gradient boosting run: out-of-fold and averaged test probabilities.
grade_oofs, grade_preds = run_clf_kfold(
    clf=grade,
    train=train1,
    test=test1,
    features=features,
)

In [None]:
# Write the gradient boosting test probabilities into the submission template and save it.
ss['Loan Status'] = grade_preds
ss.to_csv(path_or_buf='/kaggle/working/gradefinl.csv', index=False)

**Predictions using CatBoostClassifier**

In [55]:
from catboost import CatBoostClassifier
# CatBoost classifier trained on the GPU, plus the extra keyword arguments
# intended for its fit() call.
cat = CatBoostClassifier(task_type="GPU")
fit_params = dict(verbose=0, early_stopping_rounds=100)

In [56]:
def run_clf_kfold(clf,fit_params, train, test, features):
  """Run 5-fold stratified cross-validation and average the test predictions.

  For every fold a StandardScaler is fit on the training part only, ``clf`` is
  refit on the scaled training part, and positive-class probabilities are
  predicted for the validation part and for ``test``.

  Parameters
  ----------
  clf : estimator with ``fit`` and ``predict_proba``; the same object is
      refit in every fold.
  fit_params : dict or None
      Extra keyword arguments forwarded to ``clf.fit``. When it contains
      ``'early_stopping_rounds'`` and no ``'eval_set'``, the fold's validation
      split is supplied as ``eval_set`` so early stopping has data to monitor.
  train : DataFrame holding ``features`` and the ``TARGET_COL`` column.
  test : DataFrame holding ``features``.
  features : list of column names used as model inputs.

  Returns
  -------
  (oofs, preds) : out-of-fold probabilities for ``train`` and the fold-averaged
      probabilities for ``test``.
  """
  N_SPLITS = 5

  # Labels are read from the frame that was passed in, not from a notebook global.
  target = train[TARGET_COL]

  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  folds = StratifiedKFold(n_splits = N_SPLITS)

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ############# Get train, validation and test sets along with targets ################

    ### Training Set
    X_trn, y_trn = train[features].iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train[features].iloc[val_idx], target.iloc[val_idx]

    ### Test Set
    X_test = test[features]

    ############# Scaling Data ################
    # Fit on the training part only so validation/test statistics do not leak in.
    scaler = StandardScaler()
    _ = scaler.fit(X_trn)

    X_trn = scaler.transform(X_trn)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)


    ############# Fitting and Predicting ################

    # Previously fit_params was accepted but never used, so the requested
    # early stopping was silently ignored.
    fit_kwargs = dict(fit_params or {})
    if 'early_stopping_rounds' in fit_kwargs and 'eval_set' not in fit_kwargs:
      fit_kwargs['eval_set'] = [(X_val, y_val)]

    _ = clf.fit(X_trn, y_trn, **fit_kwargs)

    ### Instead of directly predicting the classes we will obtain the probability of positive class.
    preds_val = clf.predict_proba(X_val)[:, 1]
    preds_test = clf.predict_proba(X_test)[:, 1]

    fold_score = log_loss(y_val, preds_val)
    print(f'\nloss score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / N_SPLITS


  oofs_score = log_loss(target, oofs)
  print(f'\n\nloss score for oofs is {oofs_score}')

  return oofs, preds

In [None]:
# Cross-validated CatBoost run: out-of-fold and averaged test probabilities.
cat_oofs, cat_preds = run_clf_kfold(
    clf=cat,
    fit_params=fit_params,
    train=train1,
    test=test1,
    features=features,
)

In [58]:
# Write the CatBoost test probabilities into the submission template and save it.
ss['Loan Status'] = cat_preds
ss.to_csv(path_or_buf='/kaggle/working/catfinl.csv', index=False)