# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder

from xgboost import XGBClassifier

In [6]:
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.inspection import permutation_importance

# Read in Data

In [3]:
df = pd.read_csv("../DATA/clean.csv")

# TTS

In [7]:
X, X_test, y, y_test = train_test_split(df.drop('class',axis=1),
                                        df['class'],test_size =.1,
                                       random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=.09,
                                                 random_state=42)

# Baseline

In [8]:
baseline = y_train.value_counts(normalize=True).max()

# XGB Classifier

In [29]:
xgb = make_pipeline(OneHotEncoder(use_cat_names=True),
                    XGBClassifier(n_jobs=-1,random_state=42))

In [30]:
xgb.fit(X_train,y_train);

# Metrics

In [31]:
print(f"Baseline: {baseline}\n")
print(f"Train Score: {xgb.score(X_train,y_train)}")
print(f"Val score: {xgb.score(X_val,y_val)}")
print(f"Test Score: {xgb.score(X_test,y_test)}")

Baseline: 0.43687834736036724

Train Score: 0.8126434583014537
Val score: 0.6888888888888889
Test Score: 0.6812842599843383


# Hyperparameter Tuning

In [32]:
params = {
    'xgbclassifier__booster':['gbtree','gblinear','dart'],
    'xgbclassifier__eta':[.1,.3,.5,.7,.9,1,2,3],
    'xgbclassifier__gamma':[0,.5,1,3,5,10],
    'xgbclassifier__max_depth':[2,4,6,10,30,50,100,None],
    'xgbclassifier__min_child_weight':[0,1,3,5,7,15,30],
    'xgbclassifier__max_delta_step':[0,2,4,6,8,10],
    'xgbclassifier__sampling_method':['uniform','gradient_based']
}

In [33]:
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    n_iter = 333,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    cv=3
)

In [34]:
random_search.fit(X_train,y_train);

Fitting 3 folds for each of 333 candidates, totalling 999 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 999 out of 999 | elapsed: 12.9min finished


In [35]:
random_search.best_params_

{'xgbclassifier__sampling_method': 'uniform',
 'xgbclassifier__min_child_weight': 15,
 'xgbclassifier__max_depth': None,
 'xgbclassifier__max_delta_step': 0,
 'xgbclassifier__gamma': 0,
 'xgbclassifier__eta': 2,
 'xgbclassifier__booster': 'gbtree'}

In [40]:
hyper_xgb = random_search.best_estimator_

In [41]:
hyper_xgb.fit(X_train,y_train);

In [42]:
print(f"Baseline: {baseline}\n")
print(f"Train Score: {hyper_xgb.score(X_train,y_train)}")
print(f"Val score: {hyper_xgb.score(X_val,y_val)}")
print(f"Test Score: {hyper_xgb.score(X_test,y_test)}")

Baseline: 0.43687834736036724

Train Score: 0.7486610558530987
Val score: 0.6801932367149759
Test Score: 0.6851996867658575


In [43]:
import pickle

with open('hyper_xgb',"wb") as myfile:
    pickle.dump(hyper_xgb,myfile)