# Libs:

In [1]:
import pandas as pd
import numpy as np


from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from xgboost import XGBClassifier

# Format data

In [2]:
# Load the breast cancer dataset that ships with scikit-learn. The cells below
# read its 'data', 'target' and 'feature_names' entries.
data = load_breast_cancer()

In [3]:
# Build one exploration frame: the feature matrix with the target appended as
# the last column. Stacking the two into a single array stores the target as
# float alongside the features.
df = pd.DataFrame(
    data=np.column_stack((data['data'], data['target'])),
    columns=np.concatenate([data['feature_names'], ['target']]),
)

In [4]:
# Preview the first rows of the combined feature/target frame.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [5]:
# Class balance: number of rows per target value, shown as a small table.
(
    df['target']
    .value_counts()
    .reset_index()
)

Unnamed: 0,index,target
0,1.0,357
1,0.0,212


In [6]:
# Inspect the dtype of every column in the exploration frame.
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

## Prepare the data for modeling

In [7]:
# Load the dataset again, this time directly as a (features, target) pair of
# pandas objects for modeling.
X, y = load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

In [8]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): no `stratify=y` is passed, so class proportions may differ
# between the two parts, and X_test / y_test are not used by any of the cells
# shown in this notebook - confirm whether a final hold-out evaluation is
# intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Create a basic model

In [9]:
# Baseline classifier: no arguments are passed, so every hyper-parameter is
# left at the library default.
model = XGBClassifier()

## To determine the effectiveness of XGBoost on our dataset, we can use K fold cross validation
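We use the cross_val_score() function to compute the ROC/AUC score for every split: 10 folds repeated 3 times gives 30 scores. A score of 1.0 means the model ranks the two classes perfectly, while 0.5 is no better than random guessing, so the scores below (all above 0.92) are pretty decent.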

In [10]:
# Evaluation protocol: 10 stratified folds repeated 3 times (30 splits in
# total), seeded so the fold assignment is reproducible.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [11]:
# Score the baseline model on the training portion only: one ROC AUC value per
# split produced by `cv`, computed in parallel on all available cores.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

In [12]:
# Show the individual per-split scores, one per line.
print(*scores, sep='\n')

1.0
0.9253333333333333
0.9946666666666667
0.9973333333333334
1.0
1.0
0.9973333333333333
1.0
0.9771428571428572
1.0
1.0
1.0
0.9440000000000001
0.9946666666666667
0.9973333333333334
0.9893333333333334
0.9973333333333334
0.9893333333333334
0.9971428571428571
0.9944444444444445
0.9946666666666667
0.9786666666666668
0.9546666666666667
0.984
1.0
0.9946666666666667
0.9733333333333334
0.9866666666666667
1.0
1.0


In [13]:
# Summarise the baseline: average ROC AUC over all cross-validation splits.
print('Mean ROC/AUC = ', np.mean(scores))

Mean ROC/AUC =  0.9887354497354498


The mean ROC/AUC score across all the folds was 0.9887354497354498, which is not bad for an unoptimised model. Next, we’ll tune the model and see what improvements we can generate.

## Tuning your model hyper-parameters

In [14]:
# Identify the tunable hyper-parameters: list every parameter the estimator
# exposes together with its current value.
model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

## Creating a param grid

In [15]:
# Search space: each key is an XGBClassifier hyper-parameter and each value is
# the list of candidate settings to try for it.
param_grid = {
    'colsample_bytree': [0.3, 0.5, 1.0],
    'gamma': [0.1, 1, 1.5],
    'learning_rate': [0.001, 0.01],
    'min_child_weight': [1, 5, 10],
    'scale_pos_weight': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0],
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10],
}

# Also expose each candidate list under its own name. The unpacking order
# follows the key order of the dict literal above.
(
    colsample_bytree,
    gamma,
    learning_rate,
    min_child_weight,
    scale_pos_weight,
    subsample,
    n_estimators,
    max_depth,
) = param_grid.values()

Once you have your param_grid, the next step is to run GridSearchCV() on your model, pass in the parameters to test, and define how you’ll determine what is “best”. We’re going to use ROC/AUC again. If you run this, GridSearchCV will test every combination of the parameters in your param_grid and return the details of the combination which yields the highest ROC/AUC score. Note that the grid above contains 3 × 3 × 2 × 3 × 3 × 3 × 3 × 2 = 2,916 combinations, each of which is cross-validated, so the search can take a long time to run.

# Base estimator handed to the search; the seed is fixed so repeated runs of
# the search use the same random state for every candidate.
model = XGBClassifier(
            random_state=42,
            verbosity=1)

# Exhaustive search over every combination in param_grid, ranking candidates
# by ROC AUC. No `cv` argument is given, so GridSearchCV falls back to its
# default cross-validation scheme.
# n_jobs=-1 runs the candidate fits in parallel on all CPU cores, as the
# cross_val_score calls in this notebook already do. It only affects run time,
# not the scores, and matters here because the grid holds thousands of
# combinations.
grid_search = GridSearchCV(
                           estimator=model,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           n_jobs=-1,
                           )

# fit() returns the fitted search object itself, so `best_model` is the
# GridSearchCV instance (same object as `grid_search`), not a bare classifier.
best_model = grid_search.fit(X_train, y_train)

print('Optimum parameters', best_model.best_params_)

In [16]:
# Model with hand-specified hyper-parameters, evaluated the same way as the
# baseline so the two mean scores can be compared.
# NOTE(review): learning_rate=0.05, max_depth=20 and subsample=0.6 are not
# among the candidate values listed in param_grid, and random_state=1 differs
# from the 42 used elsewhere in this notebook - confirm where these settings
# came from.
tuned_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=20,
    min_child_weight=1,
    subsample=0.6,
    colsample_bytree=1,
    random_state=1,
)

# 10 stratified folds x 3 repeats, seeded for reproducible fold assignment.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# One ROC AUC value per split, computed in parallel on the training portion.
scores = cross_val_score(
    estimator=tuned_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
print('Mean ROC/AUC = ', np.mean(scores))

Mean ROC/AUC =  0.9884724867724868
