# XGBoost Classifier without PCA

# XGBoost Classifier with PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [2]:
#loading in the datasets
df_training = pd.read_csv('../data/archive/train_pca.csv')
df_testing = pd.read_csv('../data/archive/test_pca.csv')

In [3]:
#Separating X and y training and testing
x_train_full = df_training.iloc[:, :-1]
y_train_full = df_training.iloc[:,-1]

x_test = df_testing.iloc[:, :-1]
y_test = df_testing.iloc[:,-1]

In [4]:
#getting all of our randomized samples from the training
all_samples = []

for sample in range(0,5):
    all_samples.append(df_training.sample(n=5000, replace= True))

In [5]:
#Making a grid of values we want our grid search to test to find the best parameters
grid_values =[{'n_estimators': [10, 100, 1000], 'learning_rate': [0.15, 0.2, 0.1, 0.01], 'max_depth':list(range(1,20,4))}]

In [6]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
f1_best_params = []
precision_best_params = []


for sample in all_samples:
    xgb = XGBClassifier(eval_metric='mlogloss')
    xgb_classifier = GridSearchCV(estimator = xgb, param_grid = grid_values, cv = StratifiedKFold(n_splits = 3),
                      scoring = ['accuracy', 'precision_micro', 'f1_micro'], refit = False, verbose = 0)
    
    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    xgb_model = xgb_classifier.fit(x_train, y_train)
    
    accuracy_best_params.append(xgb_model.cv_results_['params'][ np.argmin(xgb_model.cv_results_['rank_test_accuracy'])])
    f1_best_params.append(xgb_model.cv_results_['params'][ np.argmin(xgb_model.cv_results_['rank_test_f1_micro'])])
    precision_best_params.append(xgb_model.cv_results_['params'][ np.argmin(xgb_model.cv_results_['rank_test_precision_micro'])])

In [7]:
accuracy_best_params

[{'learning_rate': 0.15, 'max_depth': 17, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 9, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}]

In [8]:
f1_best_params

[{'learning_rate': 0.15, 'max_depth': 17, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 9, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}]

In [9]:
precision_best_params

[{'learning_rate': 0.15, 'max_depth': 17, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 9, 'n_estimators': 1000},
 {'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 1000},
 {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}]

In [11]:
accuracy_test_score = []
f1_test_score = []
precision_test_score = []

for i in range(5):
    xgb_clf = XGBClassifier(learning_rate= f1_best_params[i]['learning_rate'], max_depth= f1_best_params[i]['max_depth'],
                        n_estimators= f1_best_params[i]['n_estimators'])
    
    model = xgb_clf.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    
    accuracy_test_score.append(accuracy_score(y_test, y_pred))
    f1_test_score.append(f1_score(y_test, y_pred, average = 'micro'))
    precision_test_score.append(precision_score(y_test, y_pred, average = 'micro'))



In [12]:
f1_test_score

[0.8998982015609094,
 0.9015948422124194,
 0.9043094672548354,
 0.8968442483881914,
 0.9009161859518154]

In [13]:
precision_test_score

[0.8998982015609094,
 0.9015948422124194,
 0.9043094672548354,
 0.8968442483881914,
 0.9009161859518154]

In [14]:
accuracy_test_score

[0.8998982015609094,
 0.9015948422124194,
 0.9043094672548354,
 0.8968442483881914,
 0.9009161859518154]