# Coding Block 2 - Hyperparameter Optimization

### Load the packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time

### Read the dataset

In [2]:
# Load the diabetes dataset; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer="../data/diabetes.csv")

### Copy the code from your last successful classifiers (RF, XGBoost, ...)
Or use the function below, which supports XGBoost and Random Forest.

In [20]:
def create_model(data, model_type="xgboost"):
    """
    Create, train and evaluate a classifier on the given dataset.

    The data is split 80/20 into train and test sets (fixed random_state),
    the requested model is fitted on the training part, and its accuracy and
    classification report on the test part are printed.

    Parameters:
    -----------
    data : DataFrame
        The dataset containing the features and the target column 'Outcome'
    model_type : str
        The type of model to create: "random_forest" or "xgboost"
        (default: "xgboost")

    Returns:
    --------
    tuple
        (model, X, y, X_train, X_test, y_train, y_test): the fitted model,
        the full feature/target data, and the train/test splits

    Raises:
    -------
    ValueError
        If model_type is not one of the supported types
    """
    # Fail fast: reject an unsupported model type before doing any work
    if model_type not in ("random_forest", "xgboost"):
        raise ValueError(f"Unsupported model type: {model_type}")

    # Separate features and target
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the model based on type
    if model_type == "random_forest":
        model = RandomForestClassifier(random_state=42)
    else:
        # Imported lazily so that xgboost is only required when it is requested
        import xgboost as xgb
        model = xgb.XGBClassifier(random_state=42)

    # Train the model. This must happen for every model type; otherwise
    # predict() below is called on an unfitted estimator and raises.
    model.fit(X_train, y_train)

    # Evaluate the model on the held-out test split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Return model and data
    return model, X, y, X_train, X_test, y_train, y_test

### Define the parameter grid for GridSearchCV or use RandomizedSearchCV

In [21]:
# Search space for the random forest: 3 * 3 * 3 * 3 * 2 = 162 combinations
param_grid = dict(
    n_estimators=[100, 200, 300],      # number of trees in the forest
    max_depth=[10, 20, 30],            # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],        # minimum samples required in a leaf
    criterion=['gini', 'entropy'],     # split quality measure
)

# Train and evaluate a baseline random forest; keep the data and the splits
(
    rf_model,
    X, y,
    X_train, X_test,
    y_train, y_test,
) = create_model(df, model_type="random_forest")

# Configure the exhaustive 5-fold cross-validated search (not run yet)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

Random_Forest Model Accuracy: 0.7208
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



### Perform GridSearchCV or RandomizedSearchCV and tune the hyperparameters of the model
The hyperparameter tuning may not finish in the time available; that is not a problem.

In [22]:
# Run the exhaustive search on the training split only
grid_search.fit(X_train, y_train)

# Display the best combination of hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Display the best mean cross-validated score
print("Best accuracy score:", grid_search.best_score_)

# Retrieve the best model (available because GridSearchCV refits by default)
best_model = grid_search.best_estimator_
print("Best model:", best_model)

# Collect every candidate's cross-validation results in a DataFrame.
# pandas is already imported as pd in the first cell, so it is not re-imported here.
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['param_n_estimators', 'param_max_depth', 'mean_test_score']])


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best accuracy score: 0.7899373583899775
Best model: RandomForestClassifier(criterion='entropy', max_depth=20, min_samples_leaf=4,
                       min_samples_split=10, n_estimators=300, random_state=42)
     param_n_estimators  param_max_depth  mean_test_score
0                   100               10         0.770399
1                   200               10         0.767133
2                   300               10         0.773651
3                   100               10         0.773651
4                   200               10         0.771985
..                  ...              ...              ...
157                 200               30         0.785019
158                 300               30         0.785046
159                 100               30         0.773624
160           

In [23]:
# Show the cross-validation results table (one row per hyperparameter combination)
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.319695,0.005195,0.021472,0.001540,gini,10,1,2,100,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.764228,0.821138,0.739837,0.731707,0.795082,0.770399,0.033624,127
1,0.647790,0.008324,0.037075,0.002289,gini,10,1,2,200,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.772358,0.813008,0.731707,0.731707,0.786885,0.767133,0.031724,156
2,0.950729,0.016121,0.051897,0.002243,gini,10,1,2,300,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.764228,0.821138,0.739837,0.747967,0.795082,0.773651,0.030352,97
3,0.318212,0.011160,0.018758,0.001779,gini,10,1,5,100,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.756098,0.829268,0.747967,0.739837,0.795082,0.773651,0.033656,97
4,0.615755,0.010507,0.034931,0.001277,gini,10,1,5,200,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.764228,0.829268,0.747967,0.747967,0.770492,0.771985,0.029992,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,0.779385,0.014105,0.042587,0.001194,entropy,30,4,5,200,"{'criterion': 'entropy', 'max_depth': 30, 'min...",0.780488,0.837398,0.764228,0.756098,0.786885,0.785019,0.028412,15
158,1.161829,0.008888,0.062664,0.002590,entropy,30,4,5,300,"{'criterion': 'entropy', 'max_depth': 30, 'min...",0.780488,0.829268,0.756098,0.756098,0.803279,0.785046,0.028231,7
159,0.389372,0.012966,0.023369,0.001200,entropy,30,4,10,100,"{'criterion': 'entropy', 'max_depth': 30, 'min...",0.764228,0.813008,0.772358,0.739837,0.778689,0.773624,0.023699,109
160,0.770138,0.017073,0.035036,0.009937,entropy,30,4,10,200,"{'criterion': 'entropy', 'max_depth': 30, 'min...",0.780488,0.829268,0.756098,0.747967,0.786885,0.780141,0.028539,47
