## Hyperparameter Optimization For Xgboost using K Fold and Stratified Cross Validation

In [2]:
import pandas as pd

In [6]:
#https://raw.githubusercontent.com/brahzr/Machine-Learning/refs/heads/main/Churn_Modelling.csv

In [7]:
## Read the Dataset
url= "https://raw.githubusercontent.com/brahzr/Machine-Learning/refs/heads/main/Churn_Modelling.csv"
df=pd.read_csv(url)

In [5]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
## Correlation
import seaborn as sns
import matplotlib.pyplot as plt
#get correlations of each features in dataset
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="RdYlGn")

ValueError: could not convert string to float: 'Hargrave'

In [9]:
#Get the Independent and Dependent Features
X=df.iloc[:,3:13]
Y=df.iloc[:,13]

In [10]:
geography=pd.get_dummies(X['Geography'],drop_first=True)

In [11]:
geography.head()

Unnamed: 0,Germany,Spain
0,False,False
1,False,True
2,False,False
3,False,False
4,False,True


In [12]:
gender=pd.get_dummies(X['Gender'],drop_first=True)

In [13]:
gender.head()

Unnamed: 0,Male
0,False
1,False
2,False
3,False
4,False


In [14]:
## Drop Categorical Features
X=X.drop(['Geography','Gender'],axis=1)

In [15]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.0,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.8,3,1,0,113931.57
3,699,39,1,0.0,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.1


In [16]:
X=pd.concat([X,geography,gender],axis=1)

In [17]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,False,True,False


In [25]:
## Hyper Parameter Optimization

params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}

In [26]:
#pip install xgboost

In [27]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

In [28]:

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [29]:
classifier=xgboost.XGBClassifier()

In [30]:
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [31]:
from datetime import datetime
# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X,Y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0 hours 0 minutes and 14.27 seconds.


In [32]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,False,True,False


In [54]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=7, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [44]:
random_search.best_params_

{'min_child_weight': 7,
 'max_depth': 6,
 'learning_rate': 0.1,
 'gamma': 0.4,
 'colsample_bytree': 0.5}

In [36]:
classifier=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=7, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [37]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(classifier,X,Y,cv=10)

Parameters: { "silent" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Traceback (most recent call last):
  File "C:\Anaconda\Lib\site-packages\sklearn\metrics\_scorer.py", line 156, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
  File "C:\Anaconda\Lib\site-packages\sklearn\metrics\_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Anaconda\Lib\site-packages\sklearn\base.py", line 548, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ~~~~~~~~~~~~^^^
  File "C:\Anaconda\Lib\site-packages\xgboost\core.py", line 774, in inner_f
    return func(**kwargs)
  File "C:\Anaconda\Lib\site-packages\xgboost\sklearn.py", line 1839, in predict
    class_probs = super().predict(
        X=X,
    ...<3 lines>...
        iteration_range=iteration_range,
    )
  File "C:\Anaconda\Lib\site-packages\xgboost\core.

In [35]:
score

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [59]:
score.mean()

0.8646989201989201