In [3]:
import optuna

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,roc_curve,precision_recall_curve
import statsmodels.formula.api as sm
from xgboost import XGBClassifier
import datetime
from datetime import *

In [7]:
# Read the two CSV files from the current working directory.
# NOTE(review): `test` is not referenced by the cells visible here — confirm it is used later.
train = pd.read_csv(filepath_or_buffer="Train.csv")
test = pd.read_csv(filepath_or_buffer="Test.csv")

In [8]:
# Target is the "Disbursed" column; every other column of `train` is a feature.
Y = train["Disbursed"]
X = train.drop(columns="Disbursed")

In [9]:
# Hold out 30% of the rows for evaluation (fixed seed for repeatability).
# stratify=Y keeps the class proportions of `Y` the same in both partitions,
# matching the stratified folds used for cross-validation; without it a rare
# class can end up under-represented in one of the splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
    stratify=Y,
)

In [11]:
import xgboost as xgb

In [12]:
# Baseline model: XGBoost classifier constructed with no explicit hyperparameters.
clf = xgb.XGBClassifier()

In [15]:
# Shuffled, stratified 10-fold splitter with a fixed seed so folds are repeatable.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [17]:
# Per-fold macro-F1 of the baseline classifier over the stratified folds,
# with folds evaluated in parallel on all available cores (n_jobs=-1).
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=Y,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
scores

array([0.49626628, 0.49629544, 0.49629544, 0.49617879, 0.49629544,
       0.50407611, 0.49626628, 0.49626628, 0.50381244, 0.49620795])

In [19]:
# Baseline figure to beat: the mean of the per-fold macro-F1 scores.
avg_f1_score = np.mean(scores)
avg_f1_score

0.49779604467037164

### Our aim is to improve this macro-averaged F1 score (baseline ≈ 0.498)

### Optuna hyperparameter tuning begins

#### Step 1: Declare the hyperparameters to tune, and the range to search for each, inside the objective function

In [48]:
def objective(trial):
    """Optuna objective: mean cross-validated macro-F1 of an XGBoost classifier.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``xgb.XGBClassifier`` from it, and scores that model on the notebook-level
    ``X`` / ``Y`` with shuffled, stratified 10-fold cross-validation.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; supplied by the
        Optuna study when it calls this function.

    Returns
    -------
    The mean of the ten per-fold ``f1_macro`` scores.
    """
    # Search space. Dict entries are evaluated top to bottom, so the
    # hyperparameters are requested from the trial in this order.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 250, step=10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-7, 0.3, log=True),
        "scale_pos_weight": trial.suggest_int("scale_pos_weight", 1, 80),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 15),
        "gamma": trial.suggest_float("gamma", 1e-5, 0.5, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1, log=True),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 50),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 50),
    }
    model = xgb.XGBClassifier(**params)

    # Same fold definition for every trial (fixed seed), so trials are
    # compared on identical splits.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        model, X, Y, scoring="f1_macro", cv=folds, n_jobs=-1
    )
    return fold_scores.mean()

In [49]:
# In-memory study that maximizes the value returned by the objective.
# The sampler is given an explicit seed so that a fresh kernel re-run proposes
# the same sequence of hyperparameters; TPE is the sampler Optuna uses by
# default for single-objective studies, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    study_name="f1-score-max",
    sampler=optuna.samplers.TPESampler(seed=1),
)

[32m[I 2022-08-06 03:13:20,575][0m A new study created in memory with name: f1-score-max[0m


In [None]:
# Run the search: 100 trials, each one a full 10-fold cross-validation
# inside `objective`, so this cell is the expensive one.
study.optimize(func=objective, n_trials=100)