In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (precision_score,
                             recall_score,
                             accuracy_score,
                             confusion_matrix)
from xgboost import XGBClassifier


In [2]:
def test_xgb_model(param_grid, X_train, y_train, X_test, y_test):
    """
    Grid-search an XGBClassifier on one preparation of the train data and
    report how the selected model scores on the train and test data.

    Candidate models are ranked by cross-validated ROC AUC. The metrics that
    are returned, however, are computed from the hard class labels produced by
    ``GridSearchCV.predict``.

    Parameters
    ----------
        param_grid: dict
            param_grid input to sklearn GridSearchCV
        X_train: pd.DataFrame
            X / predictive features of training data
        y_train: pd.DataFrame
            y / target feature of training data
        X_test: pd.DataFrame
            X / predictive features of test data
        y_test: pd.DataFrame
            y / target feature of test data

    Returns
    -------
        (best_params, results)
        best_params: dict
            parameters of best model from grid search
        results: dict
            train / test results from best model, including:
            accuracy, precision, recall, true & false positives and negatives

    """
    # n_jobs=-2 is passed straight through to GridSearchCV (joblib convention:
    # use all CPUs but one).
    model_search = GridSearchCV(estimator=XGBClassifier(),
                                param_grid=param_grid,
                                n_jobs=-2,
                                scoring='roc_auc')
    model_search.fit(X_train, y_train)

    # Keep predictions 1-D: that is the label shape the sklearn metrics expect,
    # so no reshape to a column vector is needed.
    train_pred = model_search.predict(X_train)
    test_pred = model_search.predict(X_test)

    # All metrics are called as (y_true, y_pred), matching sklearn's signature.
    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    train_recall = recall_score(y_train, train_pred)
    test_recall = recall_score(y_test, test_pred)

    train_precision = precision_score(y_train, train_pred)
    test_precision = precision_score(y_test, test_pred)

    # For a binary problem the 2x2 matrix (rows = true class, columns =
    # predicted class) flattens to: true -, false +, false -, true +.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(
        y_test, test_pred).ravel()

    results = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Train Precision": train_precision,
        "Train Recall": train_recall,
        "Test Precision": test_precision,
        "Test Recall": test_recall,
        "Test True -": true_neg,
        "Test False -": false_neg,
        "Test True +": true_pos,
        "Test False +": false_pos,
    }

    return model_search.best_params_, results

## Plain train / test:

In [3]:
# Baseline run: the plain X_train, i.e. no resampling and no one hot / target
# encoding of the categorical features.

data_dir = "../data/train_test_data/"
plain_X_train, y_train, plain_X_test, y_test = (
    pd.read_pickle(data_dir + file_name)
    for file_name in ("X_train.pkl", "y_train.pkl", "X_test.pkl", "y_test.pkl")
)

# Replace every categorical column (as detected on the train frame) with its
# integer category codes, in both the train and the test frame.
# NOTE(review): the codes are taken per frame, so they only line up if train
# and test share the same category list for each column - confirm upstream.
categorical_cols = [col for col in plain_X_train.columns
                    if plain_X_train[col].dtype.name == "category"]
for frame in (plain_X_train, plain_X_test):
    for col in categorical_cols:
        frame[col] = frame[col].cat.codes

# The data is not resampled here, so scale_pos_weight is required,
# i.e. the ratio of negative / positive examples.
plain_param_grid = {
    "scale_pos_weight": [4.85],
    "n_estimators": [10, 30, 100, 300, 1000, 1300, 3000],
    "max_depth": [2, 3, 10, 13, 20, 23],
    "learning_rate": [0.1, 0.03, 0.01, 0.003],
    "reg_lambda": [10, 50, 100],
    "gamma": [0.25, 1.0, 10],
}

plain_best_params, plain_results = test_xgb_model(
    param_grid=plain_param_grid,
    X_train=plain_X_train,
    y_train=y_train,
    X_test=plain_X_test,
    y_test=y_test,
)

In [4]:
plain_best_params

{'gamma': 1.0,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 3000,
 'reg_lambda': 10,
 'scale_pos_weight': 4.85}

In [5]:
plain_results

{'Train Accuracy': 0.9449275362318841,
 'Test Accuracy': 0.704225352112676,
 'Train Precision': 0.7564102564102564,
 'Train Recall': 1.0,
 'Test Precision': 0.14814814814814814,
 'Test Recall': 0.17391304347826086,
 'Test True -': 96,
 'Test False -': 19,
 'Test True +': 4,
 'Test False +': 23}

## One Hot Encoded train / test:

In [6]:
# One hot encoded features, original (not resampled) class balance.
# Reuses the targets and the parameter grid from the plain run.

X_train_ohe = pd.read_pickle(data_dir + "X_train_ohe.pkl")
X_test_ohe = pd.read_pickle(data_dir + "X_test_ohe.pkl")

ohe_best_params, ohe_results = test_xgb_model(
    param_grid=plain_param_grid,
    X_train=X_train_ohe,
    y_train=y_train,
    X_test=X_test_ohe,
    y_test=y_test,
)

In [7]:
ohe_best_params

{'gamma': 1.0,
 'learning_rate': 0.03,
 'max_depth': 3,
 'n_estimators': 100,
 'reg_lambda': 100,
 'scale_pos_weight': 4.85}

In [8]:
%%time
ohe_results

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


{'Train Accuracy': 0.7594202898550725,
 'Test Accuracy': 0.676056338028169,
 'Train Precision': 0.39473684210526316,
 'Train Recall': 0.7627118644067796,
 'Test Precision': 0.23255813953488372,
 'Test Recall': 0.43478260869565216,
 'Test True -': 86,
 'Test False -': 13,
 'Test True +': 10,
 'Test False +': 33}

## Target Encoded train / test:

In [9]:
%%time

# Target encoded features, original (not resampled) class balance.
# Reuses the targets and the parameter grid from the plain run.

X_train_target = pd.read_pickle(data_dir + "X_train_target.pkl")
X_test_target = pd.read_pickle(data_dir + "X_test_target.pkl")

target_best_params, target_results = test_xgb_model(
    param_grid=plain_param_grid,
    X_train=X_train_target,
    y_train=y_train,
    X_test=X_test_target,
    y_test=y_test,
)

CPU times: user 34.7 s, sys: 962 ms, total: 35.7 s
Wall time: 18min 17s


In [10]:
target_best_params

{'gamma': 1.0,
 'learning_rate': 0.003,
 'max_depth': 3,
 'n_estimators': 3000,
 'reg_lambda': 100,
 'scale_pos_weight': 4.85}

In [11]:
target_results

{'Train Accuracy': 0.8115942028985508,
 'Test Accuracy': 0.6901408450704225,
 'Train Precision': 0.47115384615384615,
 'Train Recall': 0.8305084745762712,
 'Test Precision': 0.23076923076923078,
 'Test Recall': 0.391304347826087,
 'Test True -': 89,
 'Test False -': 14,
 'Test True +': 9,
 'Test False +': 30}

## Resampled train / test:

In [12]:
%%time

# Resampled training data, evaluated against the plain (integer-coded) test set.
X_train_res = pd.read_pickle(data_dir + "X_train_res.pkl")
y_train_res = pd.read_pickle(data_dir + "y_train_res.pkl")

# Swap each categorical column for its integer category codes.
# NOTE(review): plain_X_test was coded separately in the plain run; the codes
# only agree if both frames share the same category lists - confirm upstream.
for col in X_train_res.select_dtypes(include="category").columns:
    X_train_res[col] = X_train_res[col].cat.codes

# Same search space as the plain grid, minus scale_pos_weight.
res_param_grid = {
    "n_estimators": [10, 30, 100, 300, 1000, 1300, 3000],
    "max_depth": [2, 3, 10, 13, 20, 23],
    "learning_rate": [0.1, 0.03, 0.01, 0.003],
    "reg_lambda": [10, 50, 100],
    "gamma": [0.25, 1.0, 10],
}

res_best_params, res_results = test_xgb_model(
    param_grid=res_param_grid,
    X_train=X_train_res,
    y_train=y_train_res,
    X_test=plain_X_test,
    y_test=y_test,
)

CPU times: user 52.5 s, sys: 933 ms, total: 53.4 s
Wall time: 28min 24s


In [13]:
res_best_params

{'gamma': 0.25,
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 1000,
 'reg_lambda': 10}

In [14]:
res_results

{'Train Accuracy': 0.9947552447552448,
 'Test Accuracy': 0.795774647887324,
 'Train Precision': 1.0,
 'Train Recall': 0.9895104895104895,
 'Test Precision': 0.125,
 'Test Recall': 0.043478260869565216,
 'Test True -': 112,
 'Test False -': 22,
 'Test True +': 1,
 'Test False +': 7}

## Resampled One Hot Encoded train / test:

In [15]:
%%time


# Resampled + one hot encoded training data, scored on the one hot encoded
# test set with the resampled targets and the resampled parameter grid.
X_train_res_ohe = pd.read_pickle(data_dir + "X_train_res_ohe.pkl")

res_ohe_best_params, res_ohe_results = test_xgb_model(
    param_grid=res_param_grid,
    X_train=X_train_res_ohe,
    y_train=y_train_res,
    X_test=X_test_ohe,
    y_test=y_test,
)

CPU times: user 28.9 s, sys: 1.42 s, total: 30.3 s
Wall time: 57min 51s


In [16]:
res_ohe_best_params

{'gamma': 0.25,
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 300,
 'reg_lambda': 10}

In [17]:
res_ohe_results

{'Train Accuracy': 0.9947552447552448,
 'Test Accuracy': 0.8028169014084507,
 'Train Precision': 1.0,
 'Train Recall': 0.9895104895104895,
 'Test Precision': 0.14285714285714285,
 'Test Recall': 0.043478260869565216,
 'Test True -': 113,
 'Test False -': 22,
 'Test True +': 1,
 'Test False +': 6}

## Resampled Target Encoded train / test:

In [18]:
%%time

# Resampled + target encoded training data, scored on the target encoded
# test set with the resampled targets and the resampled parameter grid.

X_train_res_target = pd.read_pickle(data_dir + "X_train_res_target.pkl")

res_target_best_params, res_target_results = test_xgb_model(
    param_grid=res_param_grid,
    X_train=X_train_res_target,
    y_train=y_train_res,
    X_test=X_test_target,
    y_test=y_test,
)

CPU times: user 24.4 s, sys: 856 ms, total: 25.3 s
Wall time: 24min 24s


In [19]:
res_target_best_params

{'gamma': 0.25,
 'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 300,
 'reg_lambda': 10}

In [20]:
res_target_results

{'Train Accuracy': 0.965034965034965,
 'Test Accuracy': 0.7887323943661971,
 'Train Precision': 0.975,
 'Train Recall': 0.9545454545454546,
 'Test Precision': 0.23076923076923078,
 'Test Recall': 0.13043478260869565,
 'Test True -': 109,
 'Test False -': 20,
 'Test True +': 3,
 'Test False +': 10}

In [None]:
# Pair each data preparation's label with its metrics dict; the insertion
# order of this mapping fixes the row order of the summary table.
labelled_results = {
    "No Manipulation": plain_results,
    "One Hot Encoded Categories": ohe_results,
    "Feature Encoded Categories": target_results,
    "SMOTE Resampled": res_results,
    "SMOTE & One Hot Encoded": res_ohe_results,
    "SMOTE & Feature Encoded": res_target_results,
}

results_index = list(labelled_results)

# One row per preparation, one column per metric; persisted as CSV.
all_results = pd.DataFrame(list(labelled_results.values()),
                           index=results_index)
all_results.to_csv("../data/XGB_Results")

In [None]:
all_results