In [1]:
# importing the libraries

import os
import numpy as np
import pandas as pd
from itertools import combinations
import gc

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import xgboost as xgb
from xgboost import plot_importance

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

In [2]:
# setting up the parameters

root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", None)

In [3]:
# importing the dataset

trainpath = os.path.join(root_dir, "data", "train.csv")
testpath = os.path.join(root_dir, "data", "test.csv")

traindf = pd.read_csv(trainpath)
testdf = pd.read_csv(testpath)

In [4]:
traindf.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,Z,"Turner, Baldwin and Rhodes",4.25,214000,360,2012-03-01,05/2012,95,1.0,22.0,694.0,C86,30.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,672831657627,Y,"Swanson, Newton and Miller",4.875,144000,360,2012-01-01,03/2012,72,1.0,44.0,697.0,B12,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,742515242108,Z,Thornton-Davis,3.25,366000,180,2012-01-01,03/2012,49,1.0,33.0,780.0,B12,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,601385667462,X,OTHER,4.75,135000,360,2012-02-01,04/2012,46,2.0,44.0,633.0,B12,0.0,638.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1
4,273870029961,X,OTHER,4.75,124000,360,2012-02-01,04/2012,80,1.0,43.0,681.0,C86,0.0,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1


In [5]:
# categorical columns
categorical_columns = ["source", "number_of_borrowers", "loan_purpose", "insurance_type"]
datetime_columns = ["origination_date", "first_payment_date"]

In [6]:
traindf.financial_institution.unique()

array(['Turner, Baldwin and Rhodes', 'Swanson, Newton and Miller',
       'Thornton-Davis', 'OTHER', 'Browning-Hart', 'Richardson Ltd',
       'Edwards-Hoffman', 'Richards-Walters', 'Martinez, Duffy and Bird',
       'Miller, Mcclure and Allen', 'Anderson-Taylor',
       'Taylor, Hunt and Rodriguez', 'Nicholson Group',
       'Cole, Brooks and Vincent', 'Sanchez, Hays and Wilkerson',
       'Sanchez-Robinson', 'Suarez Inc', 'Romero, Woods and Johnson',
       'Chapman-Mcmahon'], dtype=object)

In [7]:
# transformation 01
## encoding all the categorical columns

def encoding_fn(dataframe):
    """
    this function encodes the categorical variables
    
    inputs: dataframe and categorical_column_list
    output: dataframe with encoded variables
    """
    
    # encoding source
    dataframe["source"] = dataframe["source"].map({
        "X": 0,
        "Y": 1,
        "Z": 2
    })
    
    # encoding loan_purpose
    dataframe["loan_purpose"] = dataframe["loan_purpose"].map({
        "C86": 0,
        "B12": 1,
        "A23": 2
    })
    
    # encoding_financial_institution
    dataframe["financial_institution"] = dataframe["financial_institution"].map({
        "OTHER": 0,
        "Turner, Baldwin and Rhodes": 1, 
        "Swanson, Newton and Miller": 2,
        "Thornton-Davis": 3, 
        "Browning-Hart": 4, 
        "Richardson Ltd": 5,
        'Edwards-Hoffman': 6, 
        'Richards-Walters': 7, 
        'Martinez, Duffy and Bird': 8,
        'Miller, Mcclure and Allen': 9, 
        'Anderson-Taylor': 10,
        'Taylor, Hunt and Rodriguez': 11, 
        'Nicholson Group': 12,
        'Cole, Brooks and Vincent': 13, 
        'Sanchez, Hays and Wilkerson': 14,
        'Sanchez-Robinson': 15, 
        'Suarez Inc': 16, 
        'Romero, Woods and Johnson': 17,
        'Chapman-Mcmahon': 18
    })
    
    return dataframe

In [8]:
traindf = encoding_fn(traindf)
traindf.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,2,1,4.25,214000,360,2012-03-01,05/2012,95,1.0,22.0,694.0,0,30.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,672831657627,1,2,4.875,144000,360,2012-01-01,03/2012,72,1.0,44.0,697.0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,742515242108,2,3,3.25,366000,180,2012-01-01,03/2012,49,1.0,33.0,780.0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,601385667462,0,0,4.75,135000,360,2012-02-01,04/2012,46,2.0,44.0,633.0,1,0.0,638.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1
4,273870029961,0,0,4.75,124000,360,2012-02-01,04/2012,80,1.0,43.0,681.0,0,0.0,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1


In [9]:
# transformation 02
## deleting all the date columns
traindf = traindf.drop(columns = ["origination_date", "first_payment_date"])
traindf.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,2,1,4.25,214000,360,95,1.0,22.0,694.0,0,30.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,672831657627,1,2,4.875,144000,360,72,1.0,44.0,697.0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,742515242108,2,3,3.25,366000,180,49,1.0,33.0,780.0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,601385667462,0,0,4.75,135000,360,46,2.0,44.0,633.0,1,0.0,638.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1
4,273870029961,0,0,4.75,124000,360,80,1.0,43.0,681.0,0,0.0,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1


In [10]:
# Transformation 03
## Generating all the possible pairs of interaction between columns

def column_pairs(dataframe, polynomial_object = None):
    """
    this function creates all the possible pairs of columns
    
    input: dataframe
    output: dataframe with new features
    """
    
    combination_list = list(combinations(list(dataframe.columns), 2))
    colnames = list(dataframe.columns) + ["_".join(i) for i in combination_list]
    
    if polynomial_object == None:
        poly = PolynomialFeatures(interaction_only = True, include_bias = False)
        polynomial_object = poly.fit(dataframe)
    
    dataframe = polynomial_object.transform(dataframe)
    dataframe = pd.DataFrame(dataframe)
    dataframe.columns = colnames
    
    indices = [i for i, j in enumerate(list((dataframe == 0).all())) if j]
    dataframe = dataframe.drop(dataframe.columns[indices], axis = 1)
    
    if polynomial_object == None:
        return dataframe, polynomial_object
    else:
        return dataframe

In [11]:
x_train = traindf.drop(columns = ["m13", "loan_id"])   # predictors
y_train = traindf["m13"]                    # label

x_train = column_pairs(x_train)
print("Shape of x_train after Feature Engineering = {}".format(x_train.shape))

Shape of x_train after Feature Engineering = (116058, 324)


In [12]:
x_train.head()

Unnamed: 0,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,source_financial_institution,source_interest_rate,source_unpaid_principal_bal,source_loan_term,source_loan_to_value,source_number_of_borrowers,source_debt_to_income_ratio,source_borrower_credit_score,source_loan_purpose,source_insurance_percent,source_co-borrower_credit_score,source_insurance_type,source_m1,source_m2,source_m3,source_m4,source_m5,source_m6,source_m7,source_m8,source_m9,source_m10,source_m11,source_m12,financial_institution_interest_rate,financial_institution_unpaid_principal_bal,financial_institution_loan_term,financial_institution_loan_to_value,financial_institution_number_of_borrowers,financial_institution_debt_to_income_ratio,financial_institution_borrower_credit_score,financial_institution_loan_purpose,financial_institution_insurance_percent,financial_institution_co-borrower_credit_score,financial_institution_insurance_type,financial_institution_m1,financial_institution_m2,financial_institution_m3,financial_institution_m4,financial_institution_m5,financial_institution_m6,financial_institution_m7,financial_institution_m8,financial_institution_m9,financial_institution_m10,financial_institution_m11,financial_institution_m12,interest_rate_unpaid_principal_bal,interest_rate_loan_term,interest_rate_loan_to_value,interest_rate_number_of_borrowers,interest_rate_debt_to_income_ratio,interest_rate_borrower_credit_score,interest_rate_loan_purpose,interest_rate_insurance_percent,interest_rate_co-borrower_credit_score,interest_rate_insurance_type,interest_rate_m1,interest_rate_m2,interest_rate_m3,interest_rate_m4,interest_rate_m5,interest_rate_m6,interest_rate_m7,interest_rate_m8,interest_rate_m9,interest_rate_m10,interest_rate_m11,interest_rate_m12,unpaid_principal_bal_loan_term,unpaid_principal_bal_loan_to_value,unpaid_principal_bal_number_of_borrowers,unpaid_principal_bal_debt_to_income_ratio,unpaid_principal_bal_borrower_credit_score,unpaid_principal_bal_loan_purpose,unpaid_principal_bal_insurance_percent,unpaid_principal_bal_co-borrower_credit_score,unpaid_principal_bal_insurance_type,unpaid_principal_bal_m1,unpaid_principal_bal_m2,unpaid_principal_bal_m3,unpaid_principal_bal_m4,unpaid_principal_bal_m5,unpaid_principal_bal_m6,unpaid_principal_bal_m7,unpaid_principal_bal_m8,unpaid_principal_bal_m9,unpaid_principal_bal_m10,unpaid_principal_bal_m11,unpaid_principal_bal_m12,loan_term_loan_to_value,loan_term_number_of_borrowers,loan_term_debt_to_income_ratio,loan_term_borrower_credit_score,loan_term_loan_purpose,loan_term_insurance_percent,loan_term_co-borrower_credit_score,loan_term_insurance_type,loan_term_m1,loan_term_m2,loan_term_m3,loan_term_m4,loan_term_m5,loan_term_m6,loan_term_m7,loan_term_m8,loan_term_m9,loan_term_m10,loan_term_m11,loan_term_m12,loan_to_value_number_of_borrowers,loan_to_value_debt_to_income_ratio,loan_to_value_borrower_credit_score,loan_to_value_loan_purpose,loan_to_value_insurance_percent,loan_to_value_co-borrower_credit_score,loan_to_value_insurance_type,loan_to_value_m1,loan_to_value_m2,loan_to_value_m3,loan_to_value_m4,loan_to_value_m5,loan_to_value_m6,loan_to_value_m7,loan_to_value_m8,loan_to_value_m9,loan_to_value_m10,loan_to_value_m11,loan_to_value_m12,number_of_borrowers_debt_to_income_ratio,number_of_borrowers_borrower_credit_score,number_of_borrowers_loan_purpose,number_of_borrowers_insurance_percent,number_of_borrowers_co-borrower_credit_score,number_of_borrowers_insurance_type,number_of_borrowers_m1,number_of_borrowers_m2,number_of_borrowers_m3,number_of_borrowers_m4,number_of_borrowers_m5,number_of_borrowers_m6,number_of_borrowers_m7,number_of_borrowers_m8,number_of_borrowers_m9,number_of_borrowers_m10,number_of_borrowers_m11,number_of_borrowers_m12,debt_to_income_ratio_borrower_credit_score,debt_to_income_ratio_loan_purpose,debt_to_income_ratio_insurance_percent,debt_to_income_ratio_co-borrower_credit_score,debt_to_income_ratio_insurance_type,debt_to_income_ratio_m1,debt_to_income_ratio_m2,debt_to_income_ratio_m3,debt_to_income_ratio_m4,debt_to_income_ratio_m5,debt_to_income_ratio_m6,debt_to_income_ratio_m7,debt_to_income_ratio_m8,debt_to_income_ratio_m9,debt_to_income_ratio_m10,debt_to_income_ratio_m11,debt_to_income_ratio_m12,borrower_credit_score_loan_purpose,borrower_credit_score_insurance_percent,borrower_credit_score_co-borrower_credit_score,borrower_credit_score_insurance_type,borrower_credit_score_m1,borrower_credit_score_m2,borrower_credit_score_m3,borrower_credit_score_m4,borrower_credit_score_m5,borrower_credit_score_m6,borrower_credit_score_m7,borrower_credit_score_m8,borrower_credit_score_m9,borrower_credit_score_m10,borrower_credit_score_m11,borrower_credit_score_m12,loan_purpose_insurance_percent,loan_purpose_co-borrower_credit_score,loan_purpose_insurance_type,loan_purpose_m1,loan_purpose_m2,loan_purpose_m3,loan_purpose_m4,loan_purpose_m5,loan_purpose_m6,loan_purpose_m7,loan_purpose_m8,loan_purpose_m9,loan_purpose_m10,loan_purpose_m11,loan_purpose_m12,insurance_percent_co-borrower_credit_score,insurance_percent_insurance_type,insurance_percent_m1,insurance_percent_m2,insurance_percent_m3,insurance_percent_m4,insurance_percent_m5,insurance_percent_m6,insurance_percent_m7,insurance_percent_m8,insurance_percent_m9,insurance_percent_m10,insurance_percent_m11,insurance_percent_m12,co-borrower_credit_score_insurance_type,co-borrower_credit_score_m1,co-borrower_credit_score_m2,co-borrower_credit_score_m3,co-borrower_credit_score_m4,co-borrower_credit_score_m5,co-borrower_credit_score_m6,co-borrower_credit_score_m7,co-borrower_credit_score_m8,co-borrower_credit_score_m9,co-borrower_credit_score_m10,co-borrower_credit_score_m11,co-borrower_credit_score_m12,insurance_type_m1,insurance_type_m2,insurance_type_m3,insurance_type_m4,insurance_type_m5,insurance_type_m6,insurance_type_m8,insurance_type_m9,insurance_type_m10,insurance_type_m11,insurance_type_m12,m1_m2,m1_m3,m1_m4,m1_m5,m1_m6,m1_m7,m1_m8,m1_m9,m1_m10,m1_m11,m1_m12,m2_m3,m2_m4,m2_m5,m2_m6,m2_m7,m2_m8,m2_m9,m2_m10,m2_m11,m2_m12,m3_m4,m3_m5,m3_m6,m3_m7,m3_m8,m3_m9,m3_m10,m3_m11,m3_m12,m4_m5,m4_m6,m4_m7,m4_m8,m4_m9,m4_m10,m4_m11,m4_m12,m5_m6,m5_m7,m5_m8,m5_m9,m5_m10,m5_m11,m5_m12,m6_m7,m6_m8,m6_m9,m6_m10,m6_m11,m6_m12,m7_m8,m7_m9,m7_m10,m7_m11,m7_m12,m8_m9,m8_m10,m8_m11,m8_m12,m9_m10,m9_m11,m9_m12,m10_m11,m10_m12,m11_m12
0,2.0,1.0,4.25,214000.0,360.0,95.0,1.0,22.0,694.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,8.5,428000.0,720.0,190.0,2.0,44.0,1388.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.25,214000.0,360.0,95.0,1.0,22.0,694.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,909500.0,1530.0,403.75,4.25,93.5,2949.5,0.0,127.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.25,0.0,0.0,0.0,0.0,0.0,77040000.0,20330000.0,214000.0,4708000.0,148516000.0,0.0,6420000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,214000.0,0.0,0.0,0.0,0.0,0.0,34200.0,360.0,7920.0,249840.0,0.0,10800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,360.0,0.0,0.0,0.0,0.0,0.0,95.0,2090.0,65930.0,0.0,2850.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,22.0,694.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,15268.0,0.0,660.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,20820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,694.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,4.875,144000.0,360.0,72.0,1.0,44.0,697.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,4.875,144000.0,360.0,72.0,1.0,44.0,697.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.75,288000.0,720.0,144.0,2.0,88.0,1394.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,702000.0,1755.0,351.0,4.875,214.5,3397.875,4.875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.875,0.0,51840000.0,10368000.0,144000.0,6336000.0,100368000.0,144000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144000.0,0.0,25920.0,360.0,15840.0,250920.0,360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,360.0,0.0,72.0,3168.0,50184.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,0.0,44.0,697.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,30668.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,697.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,697.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,3.0,3.25,366000.0,180.0,49.0,1.0,33.0,780.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.5,732000.0,360.0,98.0,2.0,66.0,1560.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.75,1098000.0,540.0,147.0,3.0,99.0,2340.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1189500.0,585.0,159.25,3.25,107.25,2535.0,3.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65880000.0,17934000.0,366000.0,12078000.0,285480000.0,366000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8820.0,180.0,5940.0,140400.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,1617.0,38220.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,780.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25740.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,4.75,135000.0,360.0,46.0,2.0,44.0,633.0,1.0,0.0,638.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,641250.0,1710.0,218.5,9.5,209.0,3006.75,4.75,0.0,3030.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.75,4.75,4.75,4.75,48600000.0,6210000.0,270000.0,5940000.0,85455000.0,135000.0,0.0,86130000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135000.0,135000.0,135000.0,135000.0,16560.0,720.0,15840.0,227880.0,360.0,0.0,229680.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,360.0,360.0,360.0,360.0,92.0,2024.0,29118.0,46.0,0.0,29348.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,46.0,46.0,46.0,88.0,1266.0,2.0,0.0,1276.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,27852.0,44.0,0.0,28072.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,44.0,44.0,44.0,633.0,0.0,403854.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,633.0,633.0,633.0,633.0,0.0,638.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,638.0,638.0,638.0,638.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,4.75,124000.0,360.0,80.0,1.0,43.0,681.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,589000.0,1710.0,380.0,4.75,204.25,3234.75,0.0,0.0,0.0,0.0,0.0,4.75,9.5,14.25,19.0,23.75,28.5,33.25,38.0,42.75,47.5,52.25,44640000.0,9920000.0,124000.0,5332000.0,84444000.0,0.0,0.0,0.0,0.0,0.0,124000.0,248000.0,372000.0,496000.0,620000.0,744000.0,868000.0,992000.0,1116000.0,1240000.0,1364000.0,28800.0,360.0,15480.0,245160.0,0.0,0.0,0.0,0.0,0.0,360.0,720.0,1080.0,1440.0,1800.0,2160.0,2520.0,2880.0,3240.0,3600.0,3960.0,80.0,3440.0,54480.0,0.0,0.0,0.0,0.0,0.0,80.0,160.0,240.0,320.0,400.0,480.0,560.0,640.0,720.0,800.0,880.0,43.0,681.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,29283.0,0.0,0.0,0.0,0.0,0.0,43.0,86.0,129.0,172.0,215.0,258.0,301.0,344.0,387.0,430.0,473.0,0.0,0.0,0.0,0.0,0.0,681.0,1362.0,2043.0,2724.0,3405.0,4086.0,4767.0,5448.0,6129.0,6810.0,7491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,6.0,8.0,10.0,12.0,14.0,16.0,18.0,20.0,22.0,12.0,15.0,18.0,21.0,24.0,27.0,30.0,33.0,20.0,24.0,28.0,32.0,36.0,40.0,44.0,30.0,35.0,40.0,45.0,50.0,55.0,42.0,48.0,54.0,60.0,66.0,56.0,63.0,70.0,77.0,72.0,80.0,88.0,90.0,99.0,110.0


### Training the model

In [28]:
def hyperparameter_tuning(params):
    """
    hypertunes the XGB model
    """
    params = {
        "max_depth": int(params["max_depth"]),
        "gamma": "{:.3f}".format(params["gamma"]),
        "subsample": "{:.2f}".format(params["subsample"]),
        "reg_alpha": "{:.3f}".format(params["reg_alpha"]),
        "reg_lambda": "{:.3f}".format(params["reg_lambda"]),
        "learning_rate": "{:.3f}".format(params["learning_rate"]),
        "num_leaves": "{:.3f}".format(params["num_leaves"]),
        "colsample_bytree": "{:.3f}".format(params["colsample_bytree"]),
        "min_child_samples": "{:.3f}".format(params["min_child_samples"]),
        "feature_fraction": "{:.3f}".format(params["feature_fraction"]),
        "bagging_fraction": "{:.3f}".format(params["bagging_fraction"])
    }
    
    print("###################################################")
    print("Params = {}".format(params))
    FOLDS = 10
    count = 1
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_preds = np.zeros(testdf.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score_mean = 0
    for trn_idx, val_idx in skf.split(x_train, y_train):
        clf = xgb.XGBClassifier(random_state = 42, 
                                verbose = True, 
                                tree_method = "gpu_hist", 
                                **params)
        train_x, valid_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
        train_y, valid_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
        
        clf.fit(train_x, train_y)
        score = make_scorer(roc_auc_score, needs_proba = True)(clf, valid_x, valid_y)
        score_mean += score
        print("Count = {} ... score = {}".format(count, score))
        count += 1
    gc.collect()
    print("Mean ROC_AUC = {}".format(score_mean / FOLDS))
    del train_x, valid_x, train_y, valid_y, clf, score
    
    return -(score_mean/FOLDS)

In [29]:
%%time
space = {
    "max_depth": hp.quniform("max_depth", 3, 8, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.01, 0.05),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 0.05),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.2),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.3, 0.9),
    "gamma": hp.uniform("gamma", 0.01, 0.7),
    "num_leaves": hp.choice("num_leaves", list(range(20, 250, 10))),
    "min_child_samples": hp.choice("min_child_samples", list(range(100, 250, 10))),
    "subsample": hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

CPU times: user 280 µs, sys: 62 µs, total: 342 µs
Wall time: 345 µs


In [30]:
%%time

# Set algoritm parameters
best = fmin(fn = hyperparameter_tuning,
            space = space,
            algo = tpe.suggest,
            max_evals = 27)

# Print best parameters
best_params = space_eval(space, best)

################################################### 
Params = {'max_depth': 8, 'gamma': '0.650', 'subsample': '0.40', 'reg_alpha': '0.021', 'reg_lambda': '0.038', 'learning_rate': '0.060', 'num_leaves': '240.000', 'colsample_bytree': '0.468', 'min_child_samples': '180.000', 'feature_fraction': '0.595', 'bagging_fraction': '0.686'}
Count = 1 ... score = 0.8784192800831674            
Count = 2 ... score = 0.9171995473447111            
Count = 3 ... score = 0.8983684586726737            
Count = 4 ... score = 0.8730140465257321            
Count = 5 ... score = 0.8893308135505111            
Count = 6 ... score = 0.9064923215214               
Count = 7 ... score = 0.8481405384888316            
Count = 8 ... score = 0.9121655073396538            
Count = 9 ... score = 0.9072950961705076            
Count = 10 ... score = 0.8584520852758593           
Mean ROC_AUC = 0.8888877694973047                   
###################################################                           
Param