In [70]:
##Let's import the iris dataset
import polars as pl
from sklearn.datasets import load_iris

# Fetch the bundled iris dataset and keep the raw arrays around by name.
iris = load_iris()
data, target = iris.data, iris.target

# Build the frame in one expression: each row of `data` is one sample, the
# columns are named after iris.feature_names, and the class labels are
# appended as a "target" column.
df = pl.DataFrame(data, schema=iris.feature_names, orient="row").with_columns(
    pl.Series("target", target)
)

In [71]:
import plotly.express as px

# Draw one histogram per feature column, coloured by the "target" column.
for col in df.columns:
    # The label column is used for colouring only; it gets no plot of its own.
    if col == "target":
        continue
    fig = px.histogram(
        df,
        x=col,
        color="target",
        title=f"Histogram of {col} by target",
    )
    fig.show()

In [72]:
## As we can see, we have very little data to work with, so we need to use the whole dataset. How can we do that?
## By leveraging the power of cross-validation.
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Baseline: an XGBoost classifier with default settings, scored with 5-fold
# cross-validation over every row of the frame.
xgb = XGBClassifier()
scores = cross_val_score(
    estimator=xgb,
    X=df.drop("target"),
    y=df["target"],
    cv=5,
    verbose=1,
)
# Last expression of the cell: the mean score across the folds is displayed.
scores.mean()

np.float64(0.9533333333333334)

In [73]:
# Show the frame's (rows, columns); the column count includes the "target" column.
df.shape

(150, 5)

In [74]:
## Now we can use optuna to find the best hyperparameters
import optuna

def objective(trial):
    """Evaluate one Optuna trial and return its mean cross-validation score.

    Five XGBoost hyperparameters are sampled from the trial, a classifier is
    built from them, and it is scored with 5-fold cross-validation on the
    notebook-level ``df`` (all columns except "target" as features, "target"
    as the label).

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # Sample in a fixed order so every trial draws the same set of parameters.
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )

    fold_scores = cross_val_score(
        model, df.drop("target"), df["target"], cv=5, verbose=1
    )
    return fold_scores.mean()

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run. Without a seed, each "Restart & Run All" explores different trials
# and can end with a different "best" configuration. TPESampler is the sampler
# create_study uses by default, so only the seed changes, not the algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)


[I 2025-03-31 10:56:33,619] A new study created in memory with name: no-name-038f7071-9aed-4e28-acbc-b6c37295c43b
[I 2025-03-31 10:56:41,632] Trial 0 finished with value: 0.9533333333333334 and parameters: {'n_estimators': 909, 'max_depth': 3, 'learning_rate': 0.14504900211141217, 'subsample': 0.8925217552967388, 'colsample_bytree': 0.8640131729705625}. Best is trial 0 with value: 0.9533333333333334.
[I 2025-03-31 10:56:46,187] Trial 1 finished with value: 0.9533333333333334 and parameters: {'n_estimators': 484, 'max_depth': 5, 'learning_rate': 0.17035074678207748, 'subsample': 0.8803066032683518, 'colsample_bytree': 0.5480774701105878}. Best is trial 0 with value: 0.9533333333333334.
[I 2025-03-31 10:56:50,424] Trial 2 finished with value: 0.96 and parameters: {'n_estimators': 479, 'max_depth': 9, 'learning_rate': 0.283762344134641, 'subsample': 0.8971712261351806, 'colsample_bytree': 0.689997250471611}. Best is trial 2 with value: 0.96.
[I 2025-03-31 10:56:53,314] Trial 3 finished wi

In [75]:
from optuna.importance import get_param_importances
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def plot_param_importances(study):
    """Plot and tabulate the hyperparameter importances of an Optuna study.

    Args:
        study: the Optuna study handed to ``get_param_importances``.

    Returns:
        A polars DataFrame with one row per hyperparameter and two columns:
        ``param`` (the hyperparameter name) and ``importance`` (its score),
        in the order ``get_param_importances`` returned them.
    """
    importances = get_param_importances(study)

    # Build the frame from named columns so the layout does not depend on
    # polars guessing the orientation of a nested list.
    importance_table = pl.DataFrame(
        {
            "param": list(importances.keys()),
            "importance": list(importances.values()),
        }
    )

    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Bar(
            # Plain Python lists keep the trace independent of how plotly
            # handles polars Series.
            y=importance_table["param"].to_list(),
            x=importance_table["importance"].to_list(),
            orientation="h",
        )
    )
    # Label the chart so it can be read on its own when skimming the notebook.
    fig.update_layout(
        title="Hyperparameter importances",
        xaxis_title="Importance",
        yaxis_title="Hyperparameter",
    )
    fig.show()
    return importance_table


In [76]:
# Show the importance bar chart for the finished study; the returned table is
# displayed as the cell's output.
plot_param_importances(study)

param,importance
str,f64
"""learning_rate""",0.573499
"""subsample""",0.145671
"""max_depth""",0.134821
"""colsample_bytree""",0.099335
"""n_estimators""",0.046674


In [77]:
## now we train the model with the best hyperparameters
xgb = XGBClassifier(**study.best_trial.params)

# Clean the feature names before fitting: drop the " (cm)" unit suffix and
# replace spaces with underscores. Only columns that carry the suffix are
# renamed, so re-running this cell is a no-op for the rename.
df = df.rename(
    {
        col: col.replace(" (cm)", "").replace(" ", "_")
        for col in df.columns
        if " (cm)" in col
    }
)
xgb.fit(df.drop("target"), df["target"])

## and we save the model
import pickle

# Use a context manager so the file is flushed and closed even if pickling
# fails; passing a bare open() to pickle.dump leaves the handle open.
# NOTE: pickle files must only be loaded from trusted sources.
with open("xgb_best_model.pkl", "wb") as model_file:
    pickle.dump(xgb, model_file)


In [78]:
# Display the frame to check the column names after the rename in the training cell.
df

sepal_length,sepal_width,petal_length,petal_width,target
f64,f64,f64,f64,i64
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
…,…,…,…,…
6.7,3.0,5.2,2.3,2
6.3,2.5,5.0,1.9,2
6.5,3.0,5.2,2.0,2
6.2,3.4,5.4,2.3,2
