In [15]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

In [16]:
def load_data(file_list, df_activities, df_links_network):
    """Build one link-level feature table from scenario JSON files.

    For every file in ``file_list`` the link attributes are read from the JSON,
    enriched with node coordinates, origin/destination counts and per-link
    activity aggregates, and extended with a handful of ratio/product features.
    The per-file tables are concatenated into a single frame.

    Parameters
    ----------
    file_list : iterable of str
        Paths of JSON files. Each file must provide the keys read below
        (``links_id``, ``link_from``, ``link_to``, ``link_length``,
        ``link_freespeed``, ``link_capacity``, ``link_permlanes``,
        ``link_counts``, ``nodes_id``, ``nodes_x``, ``nodes_y``, ``o_d_pairs``,
        ``work_x``, ``work_y``, ``go_to_work``, ``home_x``, ``home_y``,
        ``go_to_home``).
    df_activities : pandas.DataFrame
        Activity table with at least the columns ``activity_type_main``,
        ``end_time``, ``x``, ``y``, ``link``, ``max_dur`` and
        ``cemdapStopDuration_s``. The value ``-1`` is treated as "missing" in
        ``end_time``, ``max_dur`` and ``cemdapStopDuration_s``.
        This frame is NOT modified.
    df_links_network : pandas.DataFrame
        Network link table with ``link_id``, ``from``, ``to``, ``type`` and the
        four coordinate columns ``start_node_x``, ``start_node_y``,
        ``end_node_x``, ``end_node_y`` used as the join key.

    Returns
    -------
    pandas.DataFrame
        One row per link per file (more if a join key matches several rows),
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_list`` is empty (raised by ``pd.concat``).
    """
    # ---- Aggregates that depend only on df_activities: computed once. ----

    # Number of activities per link that end inside a rush-hour window.
    has_end_time = df_activities['end_time'] != -1
    rush = df_activities.loc[has_end_time, ['link', 'end_time']].copy()
    in_morning_peak = rush['end_time'].between(
        pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both')
    in_evening_peak = rush['end_time'].between(
        pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both')
    rush['rush_hour'] = (in_morning_peak | in_evening_peak).astype(int)
    df_rushhragg = rush.groupby('link')['rush_hour'].sum().reset_index()

    # Total max_dur per link, ignoring the -1 placeholder.
    df_maxduragg = (df_activities.loc[df_activities['max_dur'] != -1]
                    .groupby('link')['max_dur'].sum().reset_index())

    # Total stop duration per link. The cast to float is done on a two-column
    # copy so the caller's frame keeps its original dtype.
    stop_dur = (df_activities[['link', 'cemdapStopDuration_s']]
                .astype({'cemdapStopDuration_s': float}))
    df_cemagg = (stop_dur.loc[stop_dur['cemdapStopDuration_s'] != -1]
                 .groupby('link')['cemdapStopDuration_s'].sum().reset_index())

    coordinate_key = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y']

    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)

        # link_counts may be stored as a mapping; only its values are used.
        link_counts = data['link_counts']
        if isinstance(link_counts, dict):
            link_counts = list(link_counts.values())

        df_links = pd.DataFrame({
            'link_id': data['links_id'],
            'link_from': data['link_from'],
            'link_to': data['link_to'],
            'link_length': data['link_length'],
            'link_freespeed': data['link_freespeed'],
            'link_capacity': data['link_capacity'],
            'link_permlanes': data['link_permlanes'],
            'link_counts': link_counts,
        })
        df_nodes = pd.DataFrame({
            'node_id': data['nodes_id'],
            'node_x': data['nodes_x'],
            'node_y': data['nodes_y'],
        })
        df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
        df_work = pd.DataFrame({
            'work_x': data['work_x'],
            'work_y': data['work_y'],
            'go_to_work': data['go_to_work'],
        })
        df_home = pd.DataFrame({
            'home_x': data['home_x'],
            'home_y': data['home_y'],
            'go_to_home': data['go_to_home'],
        })

        # Attach the coordinates of the start node, then of the end node.
        df_links = (
            df_links
            .merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            .rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            .drop(columns='node_id')
            .merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            .rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            .drop(columns='node_id')
        )

        # How often the link's start node is an origin / its end node a destination.
        df_origin_counts = df_od_pairs['origin'].value_counts().reset_index()
        df_origin_counts.columns = ['origin', 'start_count']
        df_destination_counts = df_od_pairs['destination'].value_counts().reset_index()
        df_destination_counts.columns = ['destination', 'end_count']
        df_links = (
            df_links
            .merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            .drop(columns='origin')
            .merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            .drop(columns='destination')
        )
        # -1 marks nodes that never appear in the O/D pairs.
        df_links[['start_count', 'end_count']] = df_links[['start_count', 'end_count']].fillna(-1)

        # Per-link sums of go_to_home / go_to_work and their total.
        df_act_home_agg = _sum_trips_per_link(df_activities, 'home', df_home, 'home')
        df_act_work_agg = _sum_trips_per_link(df_activities, 'work', df_work, 'work')
        df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link').fillna(0)
        df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

        # Map JSON links to network links via identical end-point coordinates;
        # both frames carry `link_id`, hence the _x (JSON) / _y (network) suffixes.
        df_temp = df_links.merge(df_links_network, how='left', on=coordinate_key)
        df_temp = df_temp[['link_id_x', 'link_from', 'link_to', 'link_id_y', 'from', 'to', 'type']]
        for aggregate in (df_act_agg, df_rushhragg, df_maxduragg, df_cemagg):
            df_temp = (df_temp
                       .merge(aggregate, how='left', left_on='link_id_y', right_on='link')
                       .drop(columns='link'))
        # -1 marks links without any matching activity record.
        df_temp = df_temp.fillna(
            {'cemdapStopDuration_s': -1, 'max_dur': -1, 'rush_hour': -1, 'go_to_sum': -1})
        df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur',
                           'cemdapStopDuration_s', 'type']]

        df_links = (df_links
                    .merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
                    .drop(columns='link_id_x'))

        # Derived features. NOTE(review): a zero capacity, free speed or lane
        # count yields inf/NaN here - confirm the inputs exclude zeros.
        df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
        df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
        df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
        df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
        df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
        df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']

        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


def _sum_trips_per_link(df_activities, activity_type, df_places, prefix):
    """Sum the ``go_to_<prefix>`` values matched to each activity link.

    Activities whose ``activity_type_main`` equals ``activity_type`` are joined
    to ``df_places`` on exact coordinate equality (``x``/``y`` against
    ``<prefix>_x``/``<prefix>_y``); the matched ``go_to_<prefix>`` values are
    then summed per ``link``. Returns a frame with columns ``link`` and
    ``go_to_<prefix>``.
    """
    selected = df_activities.loc[
        df_activities['activity_type_main'] == activity_type, ['link', 'x', 'y']]
    matched = selected.merge(df_places, how='left', left_on=['x', 'y'],
                             right_on=[f'{prefix}_x', f'{prefix}_y'])
    return matched.groupby('link')[f'go_to_{prefix}'].sum().reset_index()


In [29]:
# ---------------------------------------------------------------------------
# Feature lists and preprocessing
# ---------------------------------------------------------------------------
# Columns that are standardised; every other column except `type` is passed
# through unchanged by the ColumnTransformer below.
numerical_features = ['link_id', 'link_from', 'link_to', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes'
                     ]
category_feature = ['type']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
# Output columns are prefixed with the transformer name, e.g.
# `num_preprocess__link_length`, `text_preprocess__type_<value>`, `remainder__<col>`.
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")

# ---------------------------------------------------------------------------
# Candidate estimators (commented entries are toggled on/off between runs)
# ---------------------------------------------------------------------------
# Classifiers, keyed by the same names as `param_space`.
clf = {
    'KNN': KNeighborsClassifier(),
    # 'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
    'GB': GradientBoostingClassifier(random_state=101),
    'ANN': MLPClassifier(random_state=101),
    # 'SVR': SVC(),
}

# Regressors, keyed by the same names as `param_space`.
model_space = {
    # 'KNN': KNeighborsRegressor(),
    # 'XGB': xgb.XGBRegressor(random_state=101),
    # 'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    # 'RF': RandomForestRegressor(random_state=101),
    # 'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
    # 'Linear': LinearRegression(),
    # 'Ridge': Ridge(),
    # 'Lasso': Lasso()
}
# NOTE(review): presumably the estimator used for feature selection on behalf of
# each model (SVR/KNN/ANN fall back to a random forest) - confirm with the
# cells that consume this mapping.
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}

# ---------------------------------------------------------------------------
# Optuna search spaces, keyed by model name
# ---------------------------------------------------------------------------
_FloatDist = optuna.distributions.FloatDistribution
_IntDist = optuna.distributions.IntDistribution
_CatDist = optuna.distributions.CategoricalDistribution

# IntDistribution's positional order is (low, high, log, step). The step must be
# given by keyword: a third positional argument would be taken as `log` and
# switch the parameter to log-uniform sampling with step 1. The upper bound
# 3000 is the largest value reachable from 50 in steps of 50.
param_space = {
'SVR': {
    "C": _FloatDist(1e-5, 1e5, log=True),
    'gamma': _CatDist(['scale', 'auto']),
    'kernel': _CatDist(['linear', 'poly', 'rbf', 'sigmoid']),
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),
},
'RF':  {
    'max_features': _CatDist(['sqrt', 'log2']),
    'n_estimators': _IntDist(50, 3000, step=50),
    'max_depth': _IntDist(1, 200),
    'min_samples_leaf': _IntDist(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB': {
    'learning_rate': _FloatDist(0.01, 1.0),
    'n_estimators': _IntDist(50, 3000, step=50),
    'max_depth': _IntDist(1, 200),
    'min_samples_split': _IntDist(2, 11),
    'min_samples_leaf': _IntDist(1, 10),
    'subsample': _FloatDist(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': _CatDist([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': _CatDist(['tanh', 'relu', 'identity', 'logistic']),
    'solver': _CatDist(['sgd', 'adam']),
    'alpha': _FloatDist(1e-5, 1e5, log=True),
},
'KNN': {
    'n_neighbors': _IntDist(1, 50),
    'weights': _CatDist(['uniform', 'distance']),
    'algorithm': _CatDist(['auto', 'ball_tree', 'kd_tree', 'brute'])
},
'LGBM': {
    'learning_rate': _FloatDist(0.01, 1.0),
    'n_estimators': _IntDist(50, 3000, step=50),
    'max_depth': _IntDist(1, 50),
    'num_leaves': _IntDist(2, 50),
    'min_child_samples': _IntDist(1, 20),
    'subsample': _FloatDist(0.1, 1.0),
    'colsample_bytree': _FloatDist(0.1, 1.0),
},
'XGB': {
    'learning_rate': _FloatDist(0.01, 1.0),
    'n_estimators': _IntDist(50, 3000, step=50),
    'max_depth': _IntDist(1, 20),
    'max_leaves': _IntDist(2, 50),
    'max_bin': _IntDist(2, 50),
    'gamma': _IntDist(1, 20),
},
'Linear': {
    'fit_intercept': _CatDist([True, False])
},
'Lasso': {
    'alpha': _FloatDist(1e-5, 1e5, log=True)
},
'Ridge': {
    'alpha': _FloatDist(1e-5, 1e5, log=True)
}
}

In [18]:
def load_split(split_dir, scenario_ids, label):
    """Load and stack all scenarios of one data split.

    For each scenario id ``i`` the files ``s-{i}.json``,
    ``df_activities_{i}.pkl`` and ``df_links_network_{i}.pkl`` are read from
    ``Data/sparseWorlds/<split_dir>/po-1`` and turned into a feature table by
    ``load_data``. The tables are concatenated and tagged with ``label`` in a
    new ``dataset`` column.
    """
    folder = f'Data/sparseWorlds/{split_dir}/po-1'
    frames = []
    for i in scenario_ids:
        # NOTE: unpickling executes code stored in the file - only read
        # pickles that come from a trusted source.
        df_activities = pd.read_pickle(f'{folder}/df_activities_{i}.pkl')
        df_links_network = pd.read_pickle(f'{folder}/df_links_network_{i}.pkl')
        frames.append(load_data([f'{folder}/s-{i}.json'], df_activities, df_links_network))
    split_data = pd.concat(frames, ignore_index=True)
    split_data['dataset'] = label
    return split_data


# Scenarios 0-9 train, 10-14 validate, 15-19 test.
train_data = load_split('Train', range(0, 10), 'train')
validate_data = load_split('Validate', range(10, 15), 'validate')
test_data = load_split('Test', range(15, 20), 'test')

# Rows are stacked in train -> validate -> test order with a fresh 0..n-1 index.
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

# NOTE(review): the scaler and one-hot encoder are fitted on all three splits
# together, so validation/test statistics leak into the preprocessing. Fitting
# on the training rows only would be cleaner, but requires the encoder to
# tolerate categories unseen during fit - confirm before changing.
Big_data_tr = ct.fit_transform(Big_data)

In [19]:
# Cluster-derived categorical features, each fitted on the scaled columns of
# the combined train/validate/test table.
# NOTE(review): because the clusterers see every split, the resulting labels
# carry information from the validation and test rows - confirm this is intended.

# Coarse spatial region of the link (5 clusters over its end-point coordinates).
cluster = MiniBatchKMeans(n_clusters=5, random_state=101)
Big_data_tr['x_y_coor'] = cluster.fit_predict(
    Big_data_tr[['num_preprocess__start_node_x', 'num_preprocess__start_node_y',
                 'num_preprocess__end_node_x', 'num_preprocess__end_node_y']])

# Links with similar physical attributes (100 clusters).
cluster1 = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_tr['similar_link'] = cluster1.fit_predict(
    Big_data_tr[['num_preprocess__link_length', 'num_preprocess__link_freespeed',
                 'num_preprocess__link_capacity', 'num_preprocess__link_permlanes']])

# Links with similar activity-derived aggregates (100 clusters).
cluster2 = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_tr['planxml'] = cluster2.fit_predict(
    Big_data_tr[['num_preprocess__rush_hour', 'num_preprocess__max_dur',
                 'num_preprocess__cemdapStopDuration_s']])

# Binary target for the classifier stage: 0 when the observed count is exactly
# zero, 1 otherwise. Written as a single assignment because chained indexing
# (`df[col][mask] = value`) may write to a temporary copy and be lost.
Big_data_tr['used_link'] = (Big_data_tr['remainder__link_counts'] != 0).astype('int64')

In [20]:
# Split the transformed table back into its three parts.
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset'] == 'train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset'] == 'validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset'] == 'test']

# Training and validation rows are stacked into one frame; the search later
# receives explicit (train, validate) row positions instead of random folds.
temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

# Row positions inside `temp`. They are derived from `temp` itself rather than
# from the index labels of `Big_data_tr`, which only coincide with positions
# while that frame keeps a 0..n-1 index in train -> validate order.
train_index = np.flatnonzero((temp['remainder__dataset'] == 'train').to_numpy()).tolist()
validate_index = np.flatnonzero((temp['remainder__dataset'] == 'validate').to_numpy()).tolist()

# Columns that must not be used as predictors: the split tag, the raw count
# (regression target) and the binary label derived from it.
non_feature_cols = ['remainder__dataset', 'remainder__link_counts', 'used_link']

X_t_clf = temp.drop(columns=non_feature_cols)
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=non_feature_cols)
y_te_clf = test_data_tr['used_link']

In [None]:
# Stage 1: tune one classifier per entry of `clf` to predict `used_link`.
# Result per model: [fitted search object, best validation score, test predictions].
best_model_clf = {}

# SelectKBest cannot keep more columns than the input has, so the upper bound
# of the search range is capped by the actual feature count.
max_k_clf = min(36, X_t_clf.shape[1])

# NOTE(review): both CV entries are the same train/validate split, so every
# trial is fitted twice on identical data - confirm whether one split suffices.
fixed_split_clf = [(train_index, validate_index), (train_index, validate_index)]

for model_name, model in clf.items():
    pipeline = Pipeline([('selector', SelectKBest(f_regression)),
                         ('model', model)])
    param_grid = {'selector__k': optuna.distributions.IntDistribution(2, max_k_clf)}
    for key, distribution in param_space[model_name].items():
        param_grid[f'model__{key}'] = distribution

    # Optuna-driven hyper-parameter search; seeded so a re-run samples the same trials.
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=fixed_split_clf,
        random_state=101
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

In [None]:
# Rank the tuned classifiers by validation score (ascending); the last entry
# is the best one, as a (model_name, [search, score, test predictions]) pair.
clf_ranking = sorted(best_model_clf.items(), key=lambda entry: entry[1][1])
best_md_from_clf = clf_ranking[-1]

# Attach the winner's test-set predictions to an independent copy of the test rows.
best_clf_test_pred = best_md_from_clf[1][2]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_clf_test_pred

In [None]:
# ---- Stage 2 training data: only links whose observed count is non-zero ----
stage2_drop_cols = ['remainder__dataset', 'remainder__link_counts', 'used_link']

used_link_1 = temp[temp['used_link'] == 1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset'] == 'train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset'] == 'validate']

# Re-stack train then validate with a fresh index so labels equal row positions.
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=stage2_drop_cols)
y_t = temp_2['remainder__link_counts']

# Fixed train/validate row positions for the hyper-parameter search.
is_train_row = (temp_2['remainder__dataset'] == 'train').to_numpy()
is_validate_row = (temp_2['remainder__dataset'] == 'validate').to_numpy()
train_index = list(temp_2.index[is_train_row])
validate_index = list(temp_2.index[is_validate_row])

# ---- Stage 2 test data: routed by the classifier's prediction ----
test_drop_cols = stage2_drop_cols + ['y_pred_clf']
predicted_used = temp_tr['y_pred_clf'] == 1
predicted_unused = temp_tr['y_pred_clf'] == 0

# Links predicted as used go to the regressor ...
X_te = temp_tr.loc[predicted_used].drop(columns=test_drop_cols)
y_te = temp_tr.loc[predicted_used, 'remainder__link_counts']

# ... links predicted as unused get a constant prediction of 0.
X_te_0 = temp_tr.loc[predicted_unused].drop(columns=test_drop_cols)
X_te_0['y_pred'] = 0
y_te_0 = temp_tr.loc[predicted_unused, 'remainder__link_counts']

# Ground truth in the same order the predictions are concatenated later:
# regressor rows first, zero-prediction rows second.
y_te_all = pd.concat([y_te, y_te_0])

In [None]:
# Stage 2: tune one regressor per entry of `model_space` on the used links and
# score the combined (regressor + constant-zero) predictions on the test set.
# Result per model: (fitted search object, MAE, MSE, max error).
best_model_reg = {}

# SelectKBest cannot keep more columns than the input has, so the upper bound
# of the search range is capped by the actual feature count.
max_k_reg = min(38, X_t.shape[1])

# NOTE(review): both CV entries are the same train/validate split, so every
# trial is fitted twice on identical data - confirm whether one split suffices.
fixed_split_reg = [(train_index, validate_index), (train_index, validate_index)]

for model_name, model in model_space.items():
    pipeline = Pipeline([('selector', SelectKBest(f_regression)),
                         ('model', model)])
    param_grid = {'selector__k': optuna.distributions.IntDistribution(2, max_k_reg)}
    for key, distribution in param_space[model_name].items():
        param_grid[f'model__{key}'] = distribution

    # Optuna-driven hyper-parameter search; seeded so a re-run samples the same trials.
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=fixed_split_reg,
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    # Same row order as `y_te_all`: regressor rows first, zero rows second.
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

In [30]:
# Baseline: regression on ALL links (no classifier stage), with feature selection.
onlyreg_drop_cols = ['remainder__dataset', 'remainder__link_counts', 'used_link']

X_t_onlyreg = temp.drop(columns=onlyreg_drop_cols)
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=onlyreg_drop_cols)
y_te_onlyreg = test_data_tr['remainder__link_counts']

# Row positions inside `temp` (the frame that is actually fitted). Taking them
# from `temp` avoids relying on the index labels of the pre-concat frames.
train_index_onlyreg = np.flatnonzero((temp['remainder__dataset'] == 'train').to_numpy()).tolist()
validate_index_onlyreg = np.flatnonzero((temp['remainder__dataset'] == 'validate').to_numpy()).tolist()

# SelectKBest cannot keep more columns than the input has.
max_k_boreg = min(36, X_t_onlyreg.shape[1])

# NOTE(review): both CV entries are the same train/validate split, so every
# trial is fitted twice on identical data - confirm whether one split suffices.
fixed_split_boreg = [(train_index_onlyreg, validate_index_onlyreg),
                     (train_index_onlyreg, validate_index_onlyreg)]

# Result per model: [fitted search object, MAE, MSE, max error] on the test set.
best_model_boreg = {}
for model_name, model in model_space.items():
    pipeline = Pipeline([('selector', SelectKBest(f_regression)),
                         ('model', model)])
    param_grid = {'selector__k': optuna.distributions.IntDistribution(2, max_k_boreg)}
    for key, distribution in param_space[model_name].items():
        param_grid[f'model__{key}'] = distribution

    # Optuna-driven hyper-parameter search; seeded so a re-run samples the same trials.
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=fixed_split_boreg,
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_boreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-06 08:47:28,811] A new study created in memory with name: no-name-121d7a45-ba2c-4d24-837f-daa779b24ae2
[I 2024-02-06 08:48:16,858] Trial 0 finished with value: -4.7127948184373825 and parameters: {'selector__k': 3, 'model__hidden_layer_sizes': (100,), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 392.89645997762693}. Best is trial 0 with value: -4.7127948184373825.
[I 2024-02-06 08:48:25,913] Trial 1 finished with value: -4.774513796476237 and parameters: {'selector__k': 11, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'logistic', 'model__solver': 'sgd', 'model__alpha': 19727.508922067205}. Best is trial 0 with value: -4.7127948184373825.
[I 2024-02-06 08:48:34,195] Trial 2 finished with value: -4.751671993067225 and parameters: {'selector__k': 9, 'model__hidden_layer_sizes': (50,), 'model__activation': 'relu', 'model__solver': 'adam', 'model__alpha': 282.7733958227019}. Best is trial 0 with value: -4.7127948184373825.
[I 2024-02-06

ANN -4.045839155355693 3.9936506156416587 23.600375261685077 18.754578111686683


In [28]:
# Baseline: regression on ALL links (no classifier stage), without feature selection.
plainreg_drop_cols = ['remainder__dataset', 'remainder__link_counts', 'used_link']

X_t_onlyreg = temp.drop(columns=plainreg_drop_cols)
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=plainreg_drop_cols)
y_te_onlyreg = test_data_tr['remainder__link_counts']

# Row positions inside `temp` (the frame that is actually fitted). Taking them
# from `temp` avoids relying on the index labels of the pre-concat frames.
train_index_onlyreg = np.flatnonzero((temp['remainder__dataset'] == 'train').to_numpy()).tolist()
validate_index_onlyreg = np.flatnonzero((temp['remainder__dataset'] == 'validate').to_numpy()).tolist()

# NOTE(review): both CV entries are the same train/validate split, so every
# trial is fitted twice on identical data - confirm whether one split suffices.
fixed_split_onlyreg = [(train_index_onlyreg, validate_index_onlyreg),
                       (train_index_onlyreg, validate_index_onlyreg)]

# Result per model: [fitted search object, MAE, MSE, max error] on the test set.
best_model_onlyreg = {}
for model_name, model in model_space.items():
    # The estimator is tuned directly, so the search space needs no step prefix.
    param_grid = param_space[model_name]

    # Optuna-driven hyper-parameter search; seeded so a re-run samples the same trials.
    opt = OptunaSearchCV(
        model,
        param_grid,
        n_trials=50,
        cv=fixed_split_onlyreg,
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-06 08:43:15,130] A new study created in memory with name: no-name-408382a1-5aa2-4969-a48e-cd1167303dd1
[I 2024-02-06 08:43:15,304] Trial 0 finished with value: -4.57221848398243 and parameters: {'fit_intercept': True}. Best is trial 0 with value: -4.57221848398243.
[I 2024-02-06 08:43:15,500] Trial 1 finished with value: -4.57221848398243 and parameters: {'fit_intercept': True}. Best is trial 0 with value: -4.57221848398243.
[I 2024-02-06 08:43:15,678] Trial 2 finished with value: -4.572218483982428 and parameters: {'fit_intercept': False}. Best is trial 2 with value: -4.572218483982428.
[I 2024-02-06 08:43:15,859] Trial 3 finished with value: -4.57221848398243 and parameters: {'fit_intercept': True}. Best is trial 2 with value: -4.572218483982428.
[I 2024-02-06 08:43:16,029] Trial 4 finished with value: -4.57221848398243 and parameters: {'fit_intercept': True}. Best is trial 2 with value: -4.572218483982428.
[I 2024-02-06 08:43:16,198] Trial 5 finished with value: -4.572218

Linear -4.572218483982428 3.8446809819038683 22.80350100154905 19.695046192455912


[I 2024-02-06 08:43:24,296] Trial 1 finished with value: -4.57221844927378 and parameters: {'alpha': 2.48114988705338e-05}. Best is trial 1 with value: -4.57221844927378.
[I 2024-02-06 08:43:24,442] Trial 2 finished with value: -4.572217145309752 and parameters: {'alpha': 0.0009570217177771799}. Best is trial 2 with value: -4.572217145309752.
[I 2024-02-06 08:43:24,595] Trial 3 finished with value: -4.571866081099144 and parameters: {'alpha': 0.2576974921295679}. Best is trial 3 with value: -4.571866081099144.
[I 2024-02-06 08:43:24,742] Trial 4 finished with value: -4.571839788483163 and parameters: {'alpha': 0.27751895566144263}. Best is trial 4 with value: -4.571839788483163.
[I 2024-02-06 08:43:24,890] Trial 5 finished with value: -4.686775030783776 and parameters: {'alpha': 90975.6511154779}. Best is trial 4 with value: -4.571839788483163.
[I 2024-02-06 08:43:25,052] Trial 6 finished with value: -4.571347565181516 and parameters: {'alpha': 0.6605823598510955}. Best is trial 6 with

Ridge -4.5481938188515825 3.842558496993737 22.806667832780565 19.645679535758088


[I 2024-02-06 08:43:31,974] Trial 1 finished with value: -4.771858749908705 and parameters: {'alpha': 13.133007304452514}. Best is trial 0 with value: -4.771858749908705.
[I 2024-02-06 08:43:39,586] Trial 2 finished with value: -4.571425319529491 and parameters: {'alpha': 1.1989188751560727e-05}. Best is trial 2 with value: -4.571425319529491.
[I 2024-02-06 08:43:39,767] Trial 3 finished with value: -4.771858749908705 and parameters: {'alpha': 20729.12056316106}. Best is trial 2 with value: -4.571425319529491.
[I 2024-02-06 08:43:39,942] Trial 4 finished with value: -4.771858749908705 and parameters: {'alpha': 4501.174192096905}. Best is trial 2 with value: -4.571425319529491.
[I 2024-02-06 08:43:47,265] Trial 5 finished with value: -4.564846860737942 and parameters: {'alpha': 0.0013242598014368836}. Best is trial 5 with value: -4.564846860737942.
[I 2024-02-06 08:43:47,448] Trial 6 finished with value: -4.771858749908705 and parameters: {'alpha': 516.2587640653221}. Best is trial 5 wi

Lasso -4.549683962899841 3.845681911363211 22.801810307158807 19.59478649032316
