In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error as mse

from models import generate_model

In [None]:
# Base polynomial specifications. Every spec is instantiated twice: once as-is
# and once with take_log=True, yielding the keys (in this order)
#   quadratic, log_quadratic, cubic, log_cubic,
#   quartic, log_quartic, split_quartic, log_split_quartic
_base_specs = {
    "quadratic": {"degree": 2},
    "cubic": {"degree": 3},
    "quartic": {"degree": 4},
    "split_quartic": {"degree": 4, "split_quartic": True},
}

model = {
    f"{prefix}{spec_name}": generate_model(**spec_kwargs, **log_kwargs)
    for spec_name, spec_kwargs in _base_specs.items()
    for prefix, log_kwargs in (("", {}), ("log_", {"take_log": True}))
}

# Ten training sets (files numbered 1..10) and five test sets (numbered 1..5),
# each loaded as a separate DataFrame.
train = [
    pd.read_csv(f"output/train/df_train_{split_no}.csv")
    for split_no in range(1, 11)
]
test = [
    pd.read_csv(f"output/test/df_test_{split_no}.csv")
    for split_no in range(1, 6)
]

In [3]:
# Columns used as model inputs and as the regression target.
feature_cols = ["leverage", "sigma"]
target_col = "std"

# One record per (model, training set, test set) combination.
results_list = []

for model_name, estimator in model.items():
    for train_idx, train_df in enumerate(train):
        # Refit the same estimator object on each training set in turn.
        estimator.fit(train_df[feature_cols], train_df[target_col])

        for test_idx, test_df in enumerate(test):
            X_test = test_df[feature_cols]
            y_test = test_df[target_col]
            y_pred = estimator.predict(X_test)

            record = {
                "model": model_name,
                "train": train_idx,  # 0-based position in `train`
                "test": test_idx,  # 0-based position in `test`
                "mse": mse(y_test, y_pred),
                # Stored under "R2"; value is whatever estimator.score returns.
                "R2": estimator.score(X_test, y_test),
            }
            results_list.append(record)

results = pd.DataFrame(results_list)

In [4]:
# Show the evaluation table: one row per (model, train set, test set) combination.
results

Unnamed: 0,model,train,test,mse,R2
0,quadratic,0,0,9.898369,0.710035
1,quadratic,0,1,12.188740,0.667731
2,quadratic,0,2,8.538313,0.735262
3,quadratic,0,3,8.555832,0.734449
4,quadratic,0,4,10.820537,0.688619
...,...,...,...,...,...
395,log_split_quartic,9,0,14.386098,0.578571
396,log_split_quartic,9,1,14.790935,0.596794
397,log_split_quartic,9,2,13.230965,0.589762
398,log_split_quartic,9,3,12.407227,0.614912


In [7]:
# Group the per-run metrics by model once, then summarise each model's
# mse and R2 over all of its train/test combinations.
metrics_by_model = results.groupby("model")[["mse", "R2"]]

mean_result = metrics_by_model.mean()
std_result = metrics_by_model.std()

In [8]:
# Per-model mean of mse and R2 over all train/test combinations.
# (Previously displayed `aggr_result`, a name no cell in this notebook defines,
# so the cell raised NameError on a fresh kernel.)
mean_result

Unnamed: 0_level_0,mse,R2
model,Unnamed: 1_level_1,Unnamed: 2_level_1
cubic,8.308695,0.756929
log_cubic,8.335333,0.75588
log_quadratic,7.65654,0.775904
log_quartic,7.692902,0.774996
log_split_quartic,9.625089,0.717689
quadratic,11.2231,0.671219
quartic,7.638996,0.776561
split_quartic,9.082007,0.734188


In [9]:
# Per-model standard deviation of mse and R2 over all train/test combinations.
std_result

Unnamed: 0_level_0,mse,R2
model,Unnamed: 1_level_1,Unnamed: 2_level_1
cubic,1.614524,0.038223
log_cubic,1.923178,0.051008
log_quadratic,1.120801,0.021845
log_quartic,1.256808,0.025862
log_split_quartic,2.250678,0.061934
quadratic,2.853675,0.076984
quartic,1.238446,0.025363
split_quartic,2.271519,0.059877


The `log_quadratic` model has the second-best mean `mse` and `R2` (narrowly behind `quartic`), and the lowest standard deviation of both `mse` and `R2` across train–test pairs.