In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import plotly

In [2]:
healthexp = sns.load_dataset("healthexp")
healthexp.head(25)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
5,1971,Canada,313.391,72.8
6,1971,Germany,298.251,70.8
7,1971,Great Britain,134.172,71.9
8,1971,Japan,163.854,72.9
9,1971,USA,357.988,71.2


In [3]:
healthexp = pd.get_dummies(healthexp)

In [4]:
X = healthexp.drop(["Life_Expectancy"], axis=1)
y = healthexp["Life_Expectancy"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19
)

In [6]:
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [7]:
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

0.25916363636361917
0.10221141818181628
0.9910457602615238


In [8]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 10, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 32)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 32)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )

    score = cross_val_score(
        model, X, y, n_jobs=-1, cv=5, scoring="neg_mean_squared_error"
    ).mean()
    return score

In [9]:
study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.RandomSampler(seed=42)
)

[I 2024-09-17 13:59:15,639] A new study created in memory with name: no-name-d9c65cc6-65c2-472f-aa72-5368ab79c4f4


In [10]:
study.optimize(objective, n_trials=100)

[I 2024-09-17 13:59:26,581] Trial 0 finished with value: -4.492232889822825 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -4.492232889822825.
[I 2024-09-17 13:59:28,613] Trial 1 finished with value: -5.135907141854878 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -4.492232889822825.
[I 2024-09-17 13:59:34,145] Trial 2 finished with value: -5.547653344597071 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -4.492232889822825.
[I 2024-09-17 13:59:40,739] Trial 3 finished with value: -3.013679850017632 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: -3.013679850017632.
[I 2024-09-17 13:59:42,558] Trial 4 finished with value: -3.828406881097854 and paramete

In [11]:
best_params = study.best_params
best_score = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_score:.3f}")

Best Hyperparameters: {'n_estimators': 358, 'max_depth': 34, 'min_samples_split': 2, 'min_samples_leaf': 2}
Best Accuracy: -1.864


In [12]:
optuna.visualization.plot_optimization_history(study)

In [13]:
optuna.visualization.plot_parallel_coordinate(study)

In [14]:
optuna.visualization.plot_optimization_history(study)

In [15]:
optuna.visualization.plot_slice(
    study, params=["n_estimators", "max_depth", "min_samples_leaf", "min_samples_split"]
)

In [16]:
optuna.visualization.plot_param_importances(study)

In [17]:
best_n_estimators = best_params["n_estimators"]
best_max_depth = best_params["max_depth"]
best_min_samples_split = best_params["min_samples_split"]
best_min_samples_leaf = best_params["min_samples_leaf"]

In [18]:
best_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

In [19]:
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

0.319734253036058
0.14611125516317616
0.9871999114140704
