In [1]:
import seaborn as sns

In [2]:
import pandas as pd

In [27]:
import numpy as np

In [3]:
healthexp = sns.load_dataset('healthexp')

In [4]:
healthexp.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [5]:
healthexp = pd.get_dummies(healthexp) #builds new column for each country

In [6]:
healthexp.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [7]:
X = healthexp.drop(['Life_Expectancy'], axis = 1)

In [8]:
y = healthexp['Life_Expectancy']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 54)

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
rfr = RandomForestRegressor(random_state = 34)

In [13]:
rfr.fit(X_train, y_train)

In [15]:
y_pred = rfr.predict(X_test)

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score #r2 score - how well model fits data

In [17]:
mean_absolute_error(y_test, y_pred)

0.31138181818180044

In [18]:
r2_score(y_test, y_pred)

0.9836234548107303

In [19]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [20]:
import optuna

In [21]:
from sklearn.model_selection import cross_val_score

In [39]:

def objective(trial):
    # Define hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    
    # Instantiate the model with hyperparameters
    model = RandomForestRegressor(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)
    
    # Perform cross-validation and calculate mean squared error
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    mean_mse = -np.mean(scores)
    
    return mean_mse

    study = optuna.create_study(direction='minimize')  # Assuming you want to minimize mean squared error
    study.optimize(objective, n_trials=200)


In [40]:
# Set the seed value for reproducibility
#seed_value = 42

# Create a study with RandomSampler and specify the seed
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=42))


[I 2024-04-14 16:57:45,449] A new study created in memory with name: no-name-ab201953-097b-4bc3-b0f7-fb2e091a8d04


In [None]:
study.optimize(objective, n_trials = 200)

[I 2024-04-14 16:57:46,039] Trial 0 finished with value: 1.0509848478478494 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 1.0509848478478494.
[I 2024-04-14 16:57:46,301] Trial 1 finished with value: 1.4612112450134922 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 1.4612112450134922.
[I 2024-04-14 16:57:46,958] Trial 2 finished with value: 1.5646688178748172 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 2 with value: 1.5646688178748172.
[I 2024-04-14 16:57:47,890] Trial 3 finished with value: 0.2888075323961413 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 2 with value: 1.5646688178748172.
[I 2024-04-14 16:57:48,255] Trial 4 finished with value: 0.480712917145695 and parameters:

In [None]:
study.best_params

In [None]:
best_params = study.best_params

In [None]:
import matplotlib.pyplot as plt

In [None]:
optuna.visualization.plot_optimization_history(study)