In [1]:
import os
import warnings

import numpy as np
import optuna
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Silence every warning category for the rest of this kernel session.
warnings.simplefilter("ignore")
# Also export the setting through the environment.
# NOTE(review): presumably so that child processes inherit it - confirm.
os.environ["PYTHONWARNINGS"] = "ignore"

In [2]:
# Build a synthetic regression problem: 1000 rows, 10 features, light noise.
# The generator is seeded so that Restart Kernel -> Run All reproduces the
# same dataset (and therefore the same scores) on every run.
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)

# Wrap the arrays in a DataFrame with named feature columns plus the target.
data = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
data["target"] = y

In [3]:
# Separate the feature columns from the target column, then hold out 20% of
# the rows as a test set using a fixed seed.
X = data.drop(columns="target")
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def get_family(family_link):
    """Return a Gaussian GLM family that uses the requested link function.

    Parameters
    ----------
    family_link : str
        Name of the link function; either ``"log"`` or ``"identity"``.

    Returns
    -------
    A ``sm.families.Gaussian`` instance built with the selected link.

    Raises
    ------
    ValueError
        If ``family_link`` is not one of the supported names.
    """
    # NOTE(review): newer statsmodels releases prefer the capitalised
    # ``Log`` / ``Identity`` link classes - confirm against the installed
    # version before switching.
    if family_link == "log":
        link = sm.families.links.log()
    elif family_link == "identity":
        link = sm.families.links.identity()
    else:
        # Fail loudly with the offending value rather than falling through to
        # an unbound local variable.
        raise ValueError(
            f"Unsupported family_link {family_link!r}; expected 'log' or 'identity'."
        )
    return sm.families.Gaussian(link)


def objective(trial):
    """Optuna objective: fit a regularised Gaussian GLM and score it by MSE.

    The trial samples ``alpha`` log-uniformly from [1e-4, 10] and picks the
    link function (``"log"`` or ``"identity"``). The model is fitted on
    ``X_train`` / ``y_train`` with an added constant column and evaluated on
    ``X_test`` / ``y_test``.

    NOTE(review): trials are scored on the test split, which is the same data
    the final model is later evaluated on, so the reported final MSE is not an
    independent estimate.

    Returns
    -------
    The mean squared error of the fitted model's predictions on the test split.
    """
    penalty = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
    link_name = trial.suggest_categorical("family_link", ["log", "identity"])
    glm_family = get_family(link_name)

    # Fit the penalised GLM on the training design matrix (with intercept).
    train_design = sm.add_constant(X_train)
    fitted = sm.GLM(y_train, train_design, family=glm_family).fit_regularized(
        alpha=penalty
    )

    # Score on the held-out rows using the same intercept augmentation.
    test_design = sm.add_constant(X_test)
    return mean_squared_error(y_test, fitted.predict(test_design))

In [5]:
# Minimise the objective over 10 trials. The sampler is created explicitly
# with a fixed seed so the sequence of suggested hyperparameters (and hence
# the selected best trial) is reproducible across kernel restarts. TPE is
# the sampler Optuna uses by default for single-objective studies, so only
# the seeding changes.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

[I 2023-11-14 00:36:22,723] A new study created in memory with name: no-name-0f95908f-847d-4a65-b361-ed64d4a6d7b8


[I 2023-11-14 00:36:23,070] Trial 0 finished with value: 26085.999190747625 and parameters: {'alpha': 0.05809114791582246, 'family_link': 'log'}. Best is trial 0 with value: 26085.999190747625.
[I 2023-11-14 00:36:23,306] Trial 1 finished with value: 26086.38796938616 and parameters: {'alpha': 0.0037996853534324982, 'family_link': 'log'}. Best is trial 0 with value: 26085.999190747625.
[I 2023-11-14 00:36:23,340] Trial 2 finished with value: 14.584409037505102 and parameters: {'alpha': 1.0498850987586763, 'family_link': 'identity'}. Best is trial 2 with value: 14.584409037505102.
[I 2023-11-14 00:36:23,658] Trial 3 finished with value: 26086.372061130278 and parameters: {'alpha': 0.006021140347585357, 'family_link': 'log'}. Best is trial 2 with value: 14.584409037505102.
[I 2023-11-14 00:36:23,880] Trial 4 finished with value: 26086.2620489516 and parameters: {'alpha': 0.021383591344285465, 'family_link': 'log'}. Best is trial 2 with value: 14.584409037505102.
[I 2023-11-14 00:36:24,10

In [6]:
# Pull the hyperparameters of the best (lowest-MSE) trial out of the finished
# study and display them.
best_params = study.best_params
print(f"Best parameters: {best_params}")

Best parameters: {'alpha': 1.0498850987586763, 'family_link': 'identity'}


In [7]:
# Rebuild the GLM family chosen by the study.
family_link = best_params["family_link"]
family = get_family(family_link)

# Refit on the training split with the best penalty weight.
final_glm = sm.GLM(y_train, sm.add_constant(X_train), family=family)
final_model = final_glm.fit_regularized(alpha=best_params["alpha"])

# Evaluate the refitted model on the held-out split.
final_predictions = final_model.predict(sm.add_constant(X_test))
final_mse = mean_squared_error(y_test, final_predictions)

print(f"Final model MSE: {final_mse}")

Final model MSE: 14.584409037505102
