In [13]:
# Import basic libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from cuml.ensemble import RandomForestClassifier as cuRFC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split

import optuna

In [2]:
# Load the data

df = pd.read_csv('UCI_Credit_Card.csv')
df.sample(5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
6159,6160,330000.0,2,2,2,40,2,2,0,0,...,132754.0,135691.0,139646.0,0.0,4725.0,4752.0,5077.0,6366.0,8550.0,0
26505,26506,120000.0,2,3,2,53,0,0,0,0,...,120457.0,90843.0,91267.0,4210.0,4435.0,4592.0,3171.0,3256.0,3260.0,0
10170,10171,330000.0,2,1,2,28,1,-2,-2,-2,...,-2.0,1683.0,1382.0,1056.0,2014.0,0.0,1685.0,1388.0,967.0,0
25866,25867,160000.0,2,1,1,35,1,-2,-1,-1,...,0.0,0.0,0.0,0.0,6371.0,0.0,0.0,0.0,8399.0,0
7602,7603,10000.0,2,3,1,47,0,0,0,0,...,5053.0,5275.0,0.0,1069.0,1081.0,251.0,375.0,0.0,0.0,1


In [3]:
df = df.rename(columns={'default.payment.next.month': 'def_pay', 
                        'PAY_0': 'PAY_1'})
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,def_pay
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [8]:
features = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
X = df[features].copy()
X.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [10]:
# create the target variable
y = df['def_pay'].copy()
y.sample(5)

8523     1
16085    1
5767     0
29637    0
12445    0
Name: def_pay, dtype: int64

In [11]:
# split the df into train and test, it is important these two do not communicate during the training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# this means we will train on 80% of the data and test on the remaining 20%.

In [18]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1200)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    max_depth = trial.suggest_int('max_depth', 3, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 0, 100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 0, 100)
    max_features = trial.suggest_float('max_features', 0.1, 0.9)
    bootstrap = trial.suggest_categorical("bootstrap", ["True", "False"])
    clf = cuRFC(n_estimators=n_estimators, criterion=criterion, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf,max_features = max_features, bootstrap = bootstrap)
    
    gbm = clf.fit(X_train, y_train)
    predictions = gbm.predict(X_test) 
    accuracy = accuracy_score(y_test, y_pred = predictions)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Number of finished trials: ', len(study.trials))
print("Best trial: ", study.best_trial.params)

[32m[I 2021-07-23 17:33:19,437][0m A new study created in memory with name: no-name-bcecaba8-8ad9-4681-bcf6-bb30d96cef61[0m
[32m[I 2021-07-23 17:36:33,914][0m Trial 0 finished with value: 0.8151666666666667 and parameters: {'n_estimators': 1130, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8151666666666667.[0m
[32m[I 2021-07-23 17:37:23,636][0m Trial 1 finished with value: 0.815 and parameters: {'n_estimators': 353, 'criterion': 'gini'}. Best is trial 0 with value: 0.8151666666666667.[0m
[32m[I 2021-07-23 17:38:45,722][0m Trial 2 finished with value: 0.8165 and parameters: {'n_estimators': 419, 'criterion': 'entropy'}. Best is trial 2 with value: 0.8165.[0m
[32m[I 2021-07-23 17:41:20,792][0m Trial 3 finished with value: 0.8158333333333333 and parameters: {'n_estimators': 870, 'criterion': 'entropy'}. Best is trial 2 with value: 0.8165.[0m
[32m[I 2021-07-23 17:43:46,144][0m Trial 4 finished with value: 0.818 and parameters: {'n_estimators': 695, 'criterion': '

[32m[I 2021-07-23 18:52:31,407][0m Trial 44 finished with value: 0.8151666666666667 and parameters: {'n_estimators': 1124, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 18:53:27,819][0m Trial 45 finished with value: 0.8163333333333334 and parameters: {'n_estimators': 756, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 18:54:37,773][0m Trial 46 finished with value: 0.8171666666666667 and parameters: {'n_estimators': 951, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 18:55:54,096][0m Trial 47 finished with value: 0.8155 and parameters: {'n_estimators': 1039, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 18:56:39,266][0m Trial 48 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 620, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 18:57:14,376][0m Trial 49 finished with 

[32m[I 2021-07-23 19:30:11,666][0m Trial 89 finished with value: 0.8165 and parameters: {'n_estimators': 623, 'criterion': 'gini'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 19:30:34,110][0m Trial 90 finished with value: 0.8173333333333334 and parameters: {'n_estimators': 430, 'criterion': 'gini'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 19:31:08,624][0m Trial 91 finished with value: 0.8156666666666667 and parameters: {'n_estimators': 663, 'criterion': 'gini'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 19:31:36,136][0m Trial 92 finished with value: 0.817 and parameters: {'n_estimators': 535, 'criterion': 'gini'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 19:32:06,491][0m Trial 93 finished with value: 0.8178333333333333 and parameters: {'n_estimators': 587, 'criterion': 'gini'}. Best is trial 11 with value: 0.8185.[0m
[32m[I 2021-07-23 19:32:39,326][0m Trial 94 finished with value: 0.8178333333333333 and 

Number of finished trials:  100
Best trial:  {'n_estimators': 1175, 'criterion': 'entropy'}
