In [11]:
import pandas as pd
import numpy as np
from itertools import product
import random
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

import hyperopt.hp as hp
import hyperopt.fmin as fmin
import hyperopt.tpe as tpe

from tpot import TPOTClassifier




In [3]:
# Name of the label column, kept in one place so the feature/label split
# cannot drift apart.
TARGET = 'default payment next month'

# Load the credit-card CSV, using its ID column as the index.
data = pd.read_csv('data/credit-card-full.csv', index_col='ID')

# Separate predictors from the label. `columns=` already selects the axis,
# so no additional `axis=1` argument is needed.
X = data.drop(columns=TARGET)
y = data[TARGET]

# Keep 75% of the rows for training. The split is seeded so that re-running
# the notebook yields the same train/test partition and comparable scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.75, random_state=42
)

In [10]:
# Search space handed to hyperopt:
#   max_depth     - hp.quniform over [2, 10] in steps of 2 (sampled as floats)
#   learning_rate - hp.uniform over [0.001, 0.9]
space = dict(
    max_depth=hp.quniform('max_depth', 2, 10, 2),
    learning_rate=hp.uniform('learning_rate', 0.001, 0.9),
)

# Set up objective function
def objective(params):
    """Return the loss hyperopt should minimise for one hyperparameter sample.

    Parameters
    ----------
    params : dict
        Sampled values with the keys 'max_depth' and 'learning_rate'.

    Returns
    -------
    float
        ``1 - mean accuracy`` of a 100-tree GradientBoostingClassifier under
        2-fold cross-validation on ``X_train`` / ``y_train``.
    """
    # hp.quniform produces floats (e.g. 2.0); the tree depth must be an int.
    model_params = {
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
    }
    # Seed the estimator so the same hyperparameters always map to the same
    # loss; otherwise the search is optimising a noisy target.
    gbm_clf = GradientBoostingClassifier(
        n_estimators=100,
        random_state=42,
        **model_params
    )
    mean_accuracy = cross_val_score(
        gbm_clf,
        X_train,
        y_train,
        scoring='accuracy',
        cv=2,
        n_jobs=4
    ).mean()
    # hyperopt minimises, so turn accuracy into an error rate.
    return 1 - mean_accuracy

# Run the TPE search: at most 40 evaluations of `objective`, with a seeded
# generator for the sampler and a loss threshold of 0.05 passed to fmin.
search_rng = np.random.default_rng(42)
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=40,
    loss_threshold=.05,
    rstate=search_rng,
)
print(best)

100%|██████████| 40/40 [06:09<00:00,  9.23s/trial, best loss: 0.17666666666666664]
{'learning_rate': 0.038093061276450534, 'max_depth': 2.0}


In [13]:
# Settings for the TPOT search, collected in one place for easy tuning.
tpot_settings = {
    'generations': 3,
    'population_size': 4,
    'offspring_size': 3,
    'scoring': 'accuracy',
    'verbosity': 2,
    'random_state': 92,
    'cv': 2,
}

# Build the TPOT classifier from those settings
tpot_clf = TPOTClassifier(**tpot_settings)

# Run the pipeline search on the training split
tpot_clf.fit(X_train, y_train)

# Evaluate the fitted classifier on the held-out test split
test_score = tpot_clf.score(X_test, y_test)
print(test_score)

                                                                           
Generation 1 - Current best internal CV score: 0.8231555555555556
                                                                            
Generation 2 - Current best internal CV score: 0.8231555555555556
                                                                            
Generation 3 - Current best internal CV score: 0.8231999999999999
                                                                            
Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.6000000000000001, min_samples_leaf=13, min_samples_split=6, n_estimators=100)
0.8137333333333333
