In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import pathlib
import time
import tree as miptree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from model.threshold_guess import compute_thresholds, cut
from model.gosdt import GOSDT
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Data Preprocessing

### Default Data

In [2]:
# NOTE(review): machine-specific absolute path; point this at your local copy of the dataset.
default_data_csv = pathlib.Path('/Users/andrew/Documents/MBS/Practicum/Data/Default/default_data.csv')
default_data = pd.read_csv(default_data_csv)

In [3]:
# Raise pandas' displayed-column cap so wide frames are not truncated when previewed.
MAX_DISPLAY_COLUMNS = 500
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

In [4]:
def _yrs_mon_to_months(duration):
    """Turn a Series of '<Y>yrs <M>mon' strings into a Series of total months.

    The text before the first 'yrs' is read as whole years; the text between
    'yrs' and 'mon' is read as the remaining months.
    """
    pieces = duration.str.split(pat='yrs', n=1)
    whole_years = pieces.str[0].astype('int')
    extra_months = pieces.str[1].str.split(pat='mon').str[0].astype('int')
    return (whole_years * 12) + extra_months


# Express average account age and credit history length as month counts,
# then discard the original text columns.
default_data['AVG_ACCT_AGE_MOS'] = _yrs_mon_to_months(default_data['AVERAGE_ACCT_AGE'])
default_data['CREDIT_HIST_MOS'] = _yrs_mon_to_months(default_data['CREDIT_HISTORY_LENGTH'])

default_data = default_data.drop(columns=['CREDIT_HISTORY_LENGTH', 'AVERAGE_ACCT_AGE'])

In [5]:
# Borrower age at disbursal, in days: DISBURSAL_DATE minus DATE_OF_BIRTH.
# Both columns are parsed with the same day-month-year format string.
date_format = '%d-%m-%Y'
born, disbursed = (
    pd.to_datetime(default_data[date_column], format=date_format)
    for date_column in ('DATE_OF_BIRTH', 'DISBURSAL_DATE')
)

age_at_disbursal = disbursed - born
default_data['AGE_IN_DAY_AT_DISBURSE'] = age_at_disbursal.dt.days

# The raw date columns are no longer needed once the age feature exists.
default_data = default_data.drop(columns=['DATE_OF_BIRTH', 'DISBURSAL_DATE'])


In [6]:
# Remove the identifier columns (plus PERFORM_CNS_SCORE_DESCRIPTION) so they are
# not fed to the models.
columns_to_drop = [
    'PERFORM_CNS_SCORE_DESCRIPTION',
    'SUPPLIER_ID',
    'BRANCH_ID',
    'UNIQUEID',
    'CURRENT_PINCODE_ID',
    'EMPLOYEE_CODE_ID',
]
default_data = default_data.drop(columns=columns_to_drop)
# Alternative: take SUPPLIER_ID and BRANCH_ID out of the list above to keep them as
# features; they would then need to be cast to 'object' before encoding.

In [7]:
# Encoding prep: these columns hold numeric codes but represent categories,
# so they are cast to 'object' instead of being left as numbers.
categorical_id_columns = ['MANUFACTURER_ID', 'STATE_ID']
default_data = default_data.astype({column: 'object' for column in categorical_id_columns})
# If BRANCH_ID and SUPPLIER_ID are kept as features, add them to the list above.

In [8]:
# One-hot encode the frame; the generated dummy columns are stored as float32.
default_model_data = pd.get_dummies(data=default_data, dtype='float32')

# Preview the first rows of the model-ready frame.
default_model_data.head(5)

Unnamed: 0,DISBURSED_AMOUNT,ASSET_COST,LTV,MOBILENO_AVL_FLAG,AADHAR_FLAG,PAN_FLAG,VOTERID_FLAG,DRIVING_FLAG,PASSPORT_FLAG,PERFORM_CNS_SCORE,PRI_NO_OF_ACCTS,PRI_ACTIVE_ACCTS,PRI_OVERDUE_ACCTS,PRI_CURRENT_BALANCE,PRI_SANCTIONED_AMOUNT,PRI_DISBURSED_AMOUNT,SEC_NO_OF_ACCTS,SEC_ACTIVE_ACCTS,SEC_OVERDUE_ACCTS,SEC_CURRENT_BALANCE,SEC_SANCTIONED_AMOUNT,SEC_DISBURSED_AMOUNT,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,NO_OF_INQUIRIES,LOAN_DEFAULT,AVG_ACCT_AGE_MOS,CREDIT_HIST_MOS,AGE_IN_DAY_AT_DISBURSE,MANUFACTURER_ID_45,MANUFACTURER_ID_48,MANUFACTURER_ID_49,MANUFACTURER_ID_51,MANUFACTURER_ID_67,MANUFACTURER_ID_86,MANUFACTURER_ID_120,MANUFACTURER_ID_145,MANUFACTURER_ID_152,MANUFACTURER_ID_153,MANUFACTURER_ID_156,EMPLOYMENT_TYPE_Salaried,EMPLOYMENT_TYPE_Self employed,STATE_ID_1,STATE_ID_2,STATE_ID_3,STATE_ID_4,STATE_ID_5,STATE_ID_6,STATE_ID_7,STATE_ID_8,STATE_ID_9,STATE_ID_10,STATE_ID_11,STATE_ID_12,STATE_ID_13,STATE_ID_14,STATE_ID_15,STATE_ID_16,STATE_ID_17,STATE_ID_18,STATE_ID_19,STATE_ID_20,STATE_ID_21,STATE_ID_22
0,50578,58400,89.55,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12633,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,47145,65550,73.23,1,1,0,0,0,0,598,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,0,1,23,23,12110,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,53278,61360,89.63,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12030,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57513,66113,88.48,1,1,0,0,0,0,305,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,1,1,8,15,9066,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52378,60300,88.39,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,14901,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Number of columns after encoding (target column included).
default_model_data.shape[1]

66

## Testing GOSDT on Dataset

In [10]:
# Separate the LOAN_DEFAULT target from the feature columns.
y = default_model_data['LOAN_DEFAULT']
x = default_model_data.drop(columns='LOAN_DEFAULT')

# NOTE(review): test_size=0.9 leaves only 10% of the rows for training;
# presumably deliberate to keep GOSDT training time manageable -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.9,
    random_state=42,
)

# Number of training rows.
X_train.shape[0]

23315

In [11]:
# GBDT hyper-parameters shared by the threshold guess and the lower-bound guess.
n_est = 10
max_depth = 5

# Guess thresholds on the training split, then apply the same thresholds to the
# test split and align its columns with the training header.
# X_train is passed as a copy, exactly like X_test below: it is a slice produced by
# train_test_split, and handing it over directly made pandas emit
# SettingWithCopyWarning when the helper assigned new columns to it.
X_train_guessed, thresholds, header, threshold_guess_time = compute_thresholds(
    X_train.copy(), y_train, n_est, max_depth
)
X_test_guessed = cut(X_test.copy(), thresholds)
X_test_guessed = X_test_guessed[header]
print("train set column names == test set column names: {}".format(list(X_train_guessed.columns)==list(X_test_guessed.columns)))

# Guess the lower bound: fit a small GBDT on the thresholded features and keep its
# training-set predictions as warm-start labels. The fit + predict is timed.
start_time = time.perf_counter()
clf = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, random_state=42)
clf.fit(X_train_guessed, y_train.values.flatten())
warm_labels = clf.predict(X_train_guessed)
elapsed_time = time.perf_counter() - start_time
lb_time = elapsed_time

# Persist the warm-start labels to ./labels/warm_label.tmp; `labelpath` (a str)
# is what the GOSDT config later receives as "path_to_labels".
labelsdir = pathlib.Path('./labels')
labelsdir.mkdir(exist_ok=True, parents=True)
labelpath = labelsdir / 'warm_label.tmp'
labelpath = str(labelpath)
# header=True / index=False are the documented spellings; the file written is the
# same as before (a single "class_labels" header line, no index column).
pd.DataFrame(warm_labels, columns=["class_labels"]).to_csv(labelpath, header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[colnames[j]+'<='+str(ts[j][s])] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


train set column names == test set column names: True


In [12]:
# Train the GOSDT model on the thresholded training features, warm-started with the
# labels file written by the lower-bound guess (`labelpath`).
config = {
            "regularization": 0.0001,
            "depth_budget": 5,
            "warm_LB": True,
            "reference_LB": True,
            "path_to_labels": labelpath,
            "time_limit": -1,
            "similar_support": False,
            "feature_transform": False,
            "allow_small_reg": True,
        }

model = GOSDT(config)

model.fit(pd.DataFrame(X_train_guessed), pd.DataFrame(y_train))
print("evaluate the model, extracting tree and scores", flush=True)

# Collect the results.
train_acc = model.score(pd.DataFrame(X_train_guessed), pd.DataFrame(y_train))
n_leaves = model.leaves()
n_nodes = model.nodes()
# Stored as `train_time`, not `time`: rebinding `time` would shadow the imported
# `time` module and make any later `time.perf_counter()` call fail on re-run.
train_time = model.utime

print("Model training time: {}".format(train_time))
print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(model.tree)

gosdt reported successful execution
training completed. 0.000/0.000/190.178 (user, system, wall), mem=0 MB
bounds: [0.215640..0.215640] (0.000000) loss=0.215140, iterations=373204
evaluate the model, extracting tree and scores
Model training time: 0.0
Training accuracy: 0.7848595324898134
# of leaves: 5
if STATE_ID_13<=0.5 = 1 then:
    predicted class: 0
    misclassification penalty: 0.191
    complexity penalty: 0.0

else if LTV<=70.36999893188477 = 1 and STATE_ID_13<=0.5 != 1 then:
    predicted class: 0
    misclassification penalty: 0.009
    complexity penalty: 0.0

else if DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS<=0.5 = 1 and LTV<=70.36999893188477 != 1 and NO_OF_INQUIRIES<=1.5 = 1 and STATE_ID_13<=0.5 != 1 then:
    predicted class: 0
    misclassification penalty: 0.013
    complexity penalty: 0.0

else if DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS<=0.5 = 1 and LTV<=70.36999893188477 != 1 and NO_OF_INQUIRIES<=1.5 != 1 and STATE_ID_13<=0.5 != 1 then:
    predicted class: 1
    misclassifi

In [13]:
# Accuracy on the first 10000 rows of the held-out split (not the full test set).
n_eval_rows = 10000
eval_features = X_test_guessed.iloc[:n_eval_rows]
eval_labels = y_test.iloc[:n_eval_rows]
test_score = model.score(eval_features, eval_labels)
print(test_score)

0.7812
