# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder

from xgboost import XGBClassifier

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.inspection import permutation_importance

# Read in Data

In [4]:
df = pd.read_csv("../DATA/clean.csv")

# TTS

In [5]:
X, X_test, y, y_test = train_test_split(df.drop('class',axis=1),
                                        df['class'],test_size =.1,
                                       random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=.09,
                                                 random_state=42)

# Baseline

In [6]:
baseline = y_train.value_counts(normalize=True).max()

# XGB Classifier

In [7]:
xgb = make_pipeline(OneHotEncoder(use_cat_names=True),
                    XGBClassifier(n_jobs=-1,random_state=42))

In [8]:
xgb.fit(X_train,y_train);

# Metrics

In [9]:
print(f"Baseline: {baseline}\n")
print(f"Train Score: {xgb.score(X_train,y_train)}")
print(f"Val score: {xgb.score(X_val,y_val)}")
print(f"Test Score: {xgb.score(X_test,y_test)}")

Baseline: 0.43687834736036724

Train Score: 0.8126434583014537
Val score: 0.6888888888888889
Test Score: 0.6812842599843383


# Hyperparameter Tuning

In [10]:
params = {
    'xgbclassifier__booster':['gbtree','gblinear','dart'],
    'xgbclassifier__eta':[.1,.3,.5,.7,.9,1,2,3],
    'xgbclassifier__gamma':[0,.5,1,3,5,10],
    'xgbclassifier__max_depth':[2,4,6,10,30,50,100,None],
    'xgbclassifier__min_child_weight':[0,1,3,5,7,15,30],
    'xgbclassifier__max_delta_step':[0,2,4,6,8,10],
    'xgbclassifier__sampling_method':['uniform','gradient_based']
}

In [11]:
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    n_iter = 333,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    cv=3
)

In [12]:
#random_search.fit(X_train,y_train);

In [13]:
#random_search.best_params_

# Lost my local copy of the tuned, but it was tahnkfully still on github

In [14]:
hyper_xgb = make_pipeline(OneHotEncoder(use_cat_names=True),
                    XGBClassifier(n_jobs=-1,random_state=42,sampling_method='uniform',
                                  min_child_weight=15,max_depth=None,max_delta_step=0,
                                  gamma=0,eta=2,booster='gbtree'))

In [15]:
hyper_xgb.fit(X_train,y_train);

In [16]:
print(f"Baseline: {baseline}\n")
print(f"Train Score: {hyper_xgb.score(X_train,y_train)}")
print(f"Val score: {hyper_xgb.score(X_val,y_val)}")
print(f"Test Score: {hyper_xgb.score(X_test,y_test)}")

Baseline: 0.43687834736036724

Train Score: 0.43687834736036724
Val score: 0.41642512077294686
Test Score: 0.44322631166797183


In [17]:
import pickle

with open('../Models/xgb',"wb") as myfile:
    pickle.dump(xgb,myfile)

In [18]:
import pickle

with open('../Models/hyper_xgb',"wb") as myfile:
    pickle.dump(hyper_xgb,myfile)