# XGBoost

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1


## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
#breast cancer data set where 0 is benigin(not cancerous tumor) and 1 is malignant(cancerous tumor)
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training XGBoost on the Training set

In [4]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Making the Confusion Matrix

In [5]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[85  2]
 [ 1 49]]


0.9781021897810219

## Applying k-Fold Cross Validation

In [6]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))



Accuracy: 96.53 %
Standard Deviation: 2.63 %


### Grid search CV to find better parameters

In [8]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 10,
    verbose=True,
    refit=True
)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    3.2s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    6.5s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   12.2s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:   21.2s




[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:   26.1s finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1,
                                     enable_categorical=False, gamma=0,
                                     gpu_id=-1, importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=8,
                                     num_parallel_tree=1, predictor='auto',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
          

In [9]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=140, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
max(grid_search.cv_results_, key=lambda x: x[1])

'std_fit_time'

In [13]:
grid_search.cv_results_

{'mean_fit_time': array([0.22139878, 0.17319968, 0.19159856, 0.23499925, 0.13129957,
        0.17399914, 0.23459985, 0.28449864, 0.14869874, 0.1910991 ,
        0.245999  , 0.29819889, 0.15659924, 0.196099  , 0.25529902,
        0.30979891, 0.16439986, 0.20199928, 0.26390004, 0.31339917,
        0.16259894, 0.20509963, 0.26329877, 0.31369922, 0.16519918,
        0.203599  , 0.26149955, 0.31679945, 0.16160018, 0.20329938,
        0.26099927, 0.33689952, 0.09889958, 0.15399957, 0.19049885,
        0.23689923, 0.13299932, 0.17729924, 0.24209883, 0.29959919,
        0.15479925, 0.20499914, 0.27479863, 0.35449891, 0.16169939,
        0.22569892, 0.30109947, 0.39119895, 0.16469944, 0.23879883,
        0.31669865, 0.41119938, 0.1698998 , 0.24259903, 0.32249899,
        0.41689878, 0.17379882, 0.23959899, 0.32599947, 0.4264992 ,
        0.16199925, 0.24349937, 0.32559888, 0.4364995 , 0.09619908,
        0.15029919, 0.19369843, 0.23829904, 0.12319942, 0.17179871,
        0.22639928, 0.28509872,