In [301]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import pathlib
import time
import tree as miptree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from model.threshold_guess import compute_thresholds, cut
from model.gosdt import GOSDT
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

# Data Preprocessing

### Default Data

In [291]:
default_data = pd.read_csv('/Users/andrew/Documents/MBS/Practicum/Data/Default/default_data.csv') 

In [292]:
# Setting column limit for easy viewing to bypass limit

pd.set_option('display.max_columns', 500)

In [293]:
#Convert Account Age and Credit History Length columns to months

default_data['AVG_ACCT_AGE_MOS']= (default_data['AVERAGE_ACCT_AGE'].str.split(pat='yrs',n=1).str[0].astype('int')*12)+(default_data['AVERAGE_ACCT_AGE'].str.split(pat='yrs',n=1).str[1].str.split(pat='mon').str[0].astype('int'))

default_data['CREDIT_HIST_MOS']= (default_data['CREDIT_HISTORY_LENGTH'].str.split(pat='yrs',n=1).str[0].astype('int')*12)+(default_data['CREDIT_HISTORY_LENGTH'].str.split(pat='yrs',n=1).str[1].str.split(pat='mon').str[0].astype('int'))

default_data.drop(columns=['CREDIT_HISTORY_LENGTH', 'AVERAGE_ACCT_AGE'], inplace=True)

In [294]:
#Calculate (DOB - Disbursal Date) and convert to Days

born = pd.to_datetime(default_data['DATE_OF_BIRTH'],format = '%d-%m-%Y')
disbursed = pd.to_datetime(default_data['DISBURSAL_DATE'],format = '%d-%m-%Y')

default_data['AGE_IN_DAY_AT_DISBURSE'] = (disbursed - born).dt.days

default_data.drop(columns=['DATE_OF_BIRTH','DISBURSAL_DATE'], inplace=True)


In [295]:
#Drop unique identifiers

default_data.drop(columns=['PERFORM_CNS_SCORE_DESCRIPTION','SUPPLIER_ID','BRANCH_ID','UNIQUEID','CURRENT_PINCODE_ID','EMPLOYEE_CODE_ID'], inplace=True)
# default_data.drop(columns=['PERFORM_CNS_SCORE_DESCRIPTION','UNIQUEID','CURRENT_PINCODE_ID','EMPLOYEE_CODE_ID'], inplace=True)

In [296]:
# Time for encoding

# First some features are numeric that should be treated as categorical...

default_data = default_data.astype({'MANUFACTURER_ID': 'object', 'STATE_ID' : 'object'})
# default_data = default_data.astype({'BRANCH_ID': 'object', 'SUPPLIER_ID': 'object','MANUFACTURER_ID': 'object', 'STATE_ID' : 'object'})

In [297]:
# Ready to run in the models!

default_model_data = pd.get_dummies(default_data, dtype='float32')

default_model_data.head()

Unnamed: 0,DISBURSED_AMOUNT,ASSET_COST,LTV,MOBILENO_AVL_FLAG,AADHAR_FLAG,PAN_FLAG,VOTERID_FLAG,DRIVING_FLAG,PASSPORT_FLAG,PERFORM_CNS_SCORE,PRI_NO_OF_ACCTS,PRI_ACTIVE_ACCTS,PRI_OVERDUE_ACCTS,PRI_CURRENT_BALANCE,PRI_SANCTIONED_AMOUNT,PRI_DISBURSED_AMOUNT,SEC_NO_OF_ACCTS,SEC_ACTIVE_ACCTS,SEC_OVERDUE_ACCTS,SEC_CURRENT_BALANCE,SEC_SANCTIONED_AMOUNT,SEC_DISBURSED_AMOUNT,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,NO_OF_INQUIRIES,LOAN_DEFAULT,AVG_ACCT_AGE_MOS,CREDIT_HIST_MOS,AGE_IN_DAY_AT_DISBURSE,MANUFACTURER_ID_45,MANUFACTURER_ID_48,MANUFACTURER_ID_49,MANUFACTURER_ID_51,MANUFACTURER_ID_67,MANUFACTURER_ID_86,MANUFACTURER_ID_120,MANUFACTURER_ID_145,MANUFACTURER_ID_152,MANUFACTURER_ID_153,MANUFACTURER_ID_156,EMPLOYMENT_TYPE_Salaried,EMPLOYMENT_TYPE_Self employed,STATE_ID_1,STATE_ID_2,STATE_ID_3,STATE_ID_4,STATE_ID_5,STATE_ID_6,STATE_ID_7,STATE_ID_8,STATE_ID_9,STATE_ID_10,STATE_ID_11,STATE_ID_12,STATE_ID_13,STATE_ID_14,STATE_ID_15,STATE_ID_16,STATE_ID_17,STATE_ID_18,STATE_ID_19,STATE_ID_20,STATE_ID_21,STATE_ID_22
0,50578,58400,89.55,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12633,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,47145,65550,73.23,1,1,0,0,0,0,598,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,0,1,23,23,12110,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,53278,61360,89.63,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12030,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57513,66113,88.48,1,1,0,0,0,0,305,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,1,1,8,15,9066,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52378,60300,88.39,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,14901,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [298]:
len(default_model_data.columns)

66

#### Set TTS

In [339]:
y = default_model_data.LOAN_DEFAULT
x = default_model_data.drop(columns=['LOAN_DEFAULT'])

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=42)

len(X_train)

46630

## GOSDT

In [None]:
# GBDT parameters for threshold and lower bound guesses
n_est = 10
max_depth = 5

# guess thresholds
# X = pd.DataFrame(X, columns=h)
# print("X:", X.shape)
# print("y:",y.shape)
X_train_guessed, thresholds, header, threshold_guess_time = compute_thresholds(X_train, y_train, n_est, max_depth)
X_test_guessed = cut(X_test.copy(), thresholds)
X_test_guessed = X_test_guessed[header]
print("train set column names == test set column names: {}".format(list(X_train_guessed.columns)==list(X_test_guessed.columns)))

# guess lower bound
start_time = time.perf_counter()
clf = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, random_state=42)
clf.fit(X_train_guessed, y_train.values.flatten())
warm_labels = clf.predict(X_train_guessed)
elapsed_time = time.perf_counter() - start_time
lb_time = elapsed_time

# save the labels from lower bound guesses as a tmp file and return the path to it.
labelsdir = pathlib.Path('./labels')
labelsdir.mkdir(exist_ok=True, parents=True)
labelpath = labelsdir / 'warm_label.tmp'
labelpath = str(labelpath)
pd.DataFrame(warm_labels, columns=["class_labels"]).to_csv(labelpath, header="class_labels",index=None)

In [None]:
# train GOSDT model
config = {
            "regularization": 0.0001,
            "depth_budget": 5,
            "warm_LB": True,
            "reference_LB": True,
            "path_to_labels": labelpath,
            "time_limit": -1,
            "similar_support": False,
            "feature_transform": False,
            "allow_small_reg": True,
            
        }

model = GOSDT(config)

model.fit(pd.DataFrame(X_train_guessed), pd.DataFrame(y_train))
print("evaluate the model, extracting tree and scores", flush=True)

# get the results
train_acc = model.score(pd.DataFrame(X_train_guessed), pd.DataFrame(y_train))
n_leaves = model.leaves()
n_nodes = model.nodes()
time = model.utime

print("Model training time: {}".format(time))
print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(model.tree)

In [None]:
accuracy_score(model.predict(X_test_guessed), y_test)

In [None]:
confusion_matrix(model.predict(X_test_guessed), y_test)

In [None]:
# TRAINING accuracy of GB...worse (slightly) than GOSDT training accuracy

accuracy_score(y_train, warm_labels)

### OCT

In [None]:
# Reshaping for OCT

X_train_OCT = X_train.to_numpy()
y_train_OCT = y_train.to_numpy()

X_test_OCT = X_test.to_numpy()
y_test_OCT = y_test.to_numpy()

In [None]:
octree = miptree.optimalDecisionTreeClassifier(max_depth=5, min_samples_split=10, alpha=0.05, warmstart=True, timelimit=300, output=True)
octree.fit(X_train_OCT, y_train_OCT)

In [None]:
# Training accuracy
y_train_pred = octree.predict(X_train_OCT)
accuracy_score(y_train_OCT, y_train_pred)

In [None]:
# Test accuracy
y_test_pred = octree.predict(X_test_OCT)
accuracy_score(y_test_OCT, y_test_pred)

In [None]:
confusion_matrix(y_test_pred, y_test_OCT)

In [None]:
boctree = miptree.binOptimalDecisionTreeClassifier(max_depth=3, min_samples_split=2, warmstart=True, timelimit=600, output=True)
boctree.fit(X_train_OCT, y_train_OCT)

# Training accuracy
y_train_pred = boctree.predict(X_train_OCT)
accuracy_score(y_train_OCT, y_train_pred)

In [None]:
mfoctree = miptree.maxFlowOptimalDecisionTreeClassifier(max_depth=5, alpha=0.005, warmstart=True, timelimit=300, output=True)
mfoctree.fit(X_train_OCT, y_train_OCT)

In [None]:
# Training accuracy
y_train_pred = mfoctree.predict(X_train_OCT)
print(accuracy_score(y_train_OCT, y_train_pred))

# Test accuracy
y_test_pred = mfoctree.predict(X_test_OCT)
print(accuracy_score(y_test_OCT, y_test_pred))


confusion_matrix(y_test_pred, y_test_OCT)

##### SMOTE Testing (Unsuccessful)

In [None]:
from imblearn.over_sampling import SMOTE

x = default_model_data.drop(columns= ['LOAN_DEFAULT'])
y = default_model_data['LOAN_DEFAULT']

sm = SMOTE(random_state = 2)
x, y = sm.fit_resample(x, y.ravel())


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.999, random_state=42)

len(X_train)

In [None]:
# Reshaping for OCT

X_train_OCT = X_train.to_numpy()
y_train_OCT = y_train

X_test_OCT = X_test.to_numpy()
y_test_OCT = y_test

len(X_train_OCT[0])

In [None]:
mfoctree = miptree.maxFlowOptimalDecisionTreeClassifier(max_depth=5, alpha=0.05, warmstart=True, timelimit=20, output=True)
mfoctree.fit(X_train_OCT, y_train_OCT)

### Logistic (Lasso)

In [342]:
clf = LogisticRegression(random_state=0, max_iter=10000, penalty="l1", solver="saga").fit(X_train, y_train)

In [343]:
y_train_pred = clf.predict(X_train)
print(accuracy_score(y_train, y_train_pred))

# Test accuracy
y_test_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_test_pred))


confusion_matrix(y_test_pred, y_test)

0.7836371434698692
0.7826499538933328


array([[145980,  40520],
       [    21,      3]])

In [None]:
y_pred = model.predict(X_test_guess)
matrix = confusion_matrix(y_test, y_pred)

matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot
class_names = ['Non Fraud', 'Fraud',]
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()