In [6]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

In [7]:
def load_data(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)
            
            # Calculate time of go_to_work and go_to_sum
            df_act_work = df_activities[df_activities['activity_type_main']=='work'].drop(['end_time'], axis=1)
            df_act_work = df_act_work.merge(df_work, how='left', left_on=['x','y'], right_on=['work_x','work_y'])
            df_act_work.drop(['x','y'], axis=1, inplace=True)
            df_act_work_agg = df_act_work.groupby(by="link")['go_to_work'].sum().reset_index(drop=False)
            df_act_home = df_activities[df_activities['activity_type_main']=='home'].drop(['end_time'], axis=1)
            df_act_home = df_act_home.merge(df_home, how='left', left_on=['x','y'], right_on=['home_x','home_y'])
            df_act_home.drop(['x','y'], axis=1, inplace=True)
            df_act_home_agg = df_act_home.groupby(by="link")['go_to_home'].sum().reset_index(drop=False)
            df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link')
            df_act_agg.fillna(0, inplace=True)
            df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

            df_rushhr = df_activities[df_activities['end_time']!=-1]
            df_rushhr.loc[:, 'rush_hour'] = 0
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
            df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
            
            df_maxduragg = df_activities[df_activities['max_dur']!=-1].groupby(by='link')['max_dur'].sum().reset_index(drop=False)
            
            df_activities['cemdapStopDuration_s'] = df_activities['cemdapStopDuration_s'].astype(float)
            df_cemagg = df_activities[df_activities['cemdapStopDuration_s']!=-1].groupby(by='link')['cemdapStopDuration_s'].sum().reset_index(drop=False)
            
            df_temp = df_links.merge(df_links_network, how='left', on=['start_node_x','start_node_y','end_node_x','end_node_y'])
            df_temp = df_temp[['link_id_x','link_from','link_to','link_id_y','from', 'to', 'type']]
            df_temp = df_temp.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_rushhragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_maxduragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_cemagg, how='left', left_on='link_id_y', right_on='link')
            df_temp.fillna({'cemdapStopDuration_s':-1, 'max_dur':-1, 'rush_hour': -1, 'go_to_sum': -1}, inplace=True)
            df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'type']]
            
            df_links = df_links.merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
            df_links.drop('link_id_x', axis=1, inplace=True)
            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [24]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes'
                     ]
category_feature = ['type']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
    # 'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101),
    # 'SVR': SVC(),
}

model_space = {
    'KNN': KNeighborsRegressor(),
#     'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
    'Linear': LinearRegression(),
    'Lasso': LassoCV(random_state=42, max_iter=100000),
    'Ridge': RidgeCV(),
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'Linear': {  
},
'Lasso': {
},
'Ridge': {  
},
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'identity', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
'GPR':{
    'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
                                    WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
                                    0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
                                    50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
                                    1.0 * Matern(length_scale=1.0, nu=1.5),
                                    RBF() + ConstantKernel(constant_value=2)
                                                           ]),
    'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
}
}

In [9]:
df_train = []
list_od = []
list_nodes = []
for i in range(0, 10):
    train_files = f'Data/sparseWorlds/Train/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_links_network_{i}.pkl')
    train_data = load_data([train_files], df_activities, df_links_network)
    df_train.append(train_data)
    with open(train_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
train_data = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    validate_files = f'Data/sparseWorlds/Validate/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_links_network_{i}.pkl')
    validate_data = load_data([validate_files], df_activities, df_links_network)
    df_validate.append(validate_data)
    with open(validate_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
validate_data = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    test_files = f'Data/sparseWorlds/Test/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_links_network_{i}.pkl')
    test_data = load_data([test_files], df_activities, df_links_network)
    df_test.append(test_data)
    with open(test_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
test_data = pd.concat(df_test, ignore_index=True)

train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

indices = Big_data.index[Big_data['link_id'] == 0].tolist()
indices.append(len(Big_data))
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

In [11]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['planxml'] = cluster2.fit_predict(Big_data_new[['rush_hour', 'max_dur', 'cemdapStopDuration_s']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [12]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

In [13]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [16]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 37)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-03-14 21:20:06,002] A new study created in memory with name: no-name-628a33e4-5655-464f-9e76-639f9930f063
[I 2024-03-14 21:20:08,793] Trial 0 finished with value: 0.5612524065254838 and parameters: {'selector__k': 32, 'model__n_neighbors': 4, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 0 with value: 0.5612524065254838.
[I 2024-03-14 21:20:13,945] Trial 1 finished with value: 0.5758435505117033 and parameters: {'selector__k': 33, 'model__n_neighbors': 18, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 1 with value: 0.5758435505117033.
[I 2024-03-14 21:20:20,058] Trial 2 finished with value: 0.6831492552436924 and parameters: {'selector__k': 17, 'model__n_neighbors': 50, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 2 with value: 0.6831492552436924.
[I 2024-03-14 21:20:21,158] Trial 3 finished with value: 0.6832505826324856 and parameters: {'selector__k': 4, 'model__n_neighbors': 21, 'model_

[I 2024-03-14 21:22:17,513] Trial 33 finished with value: 0.7029080960583646 and parameters: {'selector__k': 2, 'model__n_neighbors': 4, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 11 with value: 0.7117235788833721.
[I 2024-03-14 21:22:19,503] Trial 34 finished with value: 0.6854797851859358 and parameters: {'selector__k': 5, 'model__n_neighbors': 7, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 11 with value: 0.7117235788833721.
[I 2024-03-14 21:22:22,087] Trial 35 finished with value: 0.6901408450704225 and parameters: {'selector__k': 10, 'model__n_neighbors': 14, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 11 with value: 0.7117235788833721.
[I 2024-03-14 21:22:25,045] Trial 36 finished with value: 0.6483939608876279 and parameters: {'selector__k': 4, 'model__n_neighbors': 1, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 11 with value: 0.7117235788833721.
[I 2024-03-14

KNN 0.7117235788833721 0.6845651286239282


[I 2024-03-14 21:23:17,022] Trial 0 finished with value: 0.6799067788023103 and parameters: {'selector__k': 8, 'model__learning_rate': 0.8803849917726317, 'model__n_estimators': 227, 'model__max_depth': 33, 'model__num_leaves': 29, 'model__min_child_samples': 3, 'model__subsample': 0.14163983566680666, 'model__colsample_bytree': 0.5792524624676381}. Best is trial 0 with value: 0.6799067788023103.
[I 2024-03-14 21:23:20,791] Trial 1 finished with value: 0.7060492451109535 and parameters: {'selector__k': 18, 'model__learning_rate': 0.1816989480906841, 'model__n_estimators': 955, 'model__max_depth': 29, 'model__num_leaves': 6, 'model__min_child_samples': 15, 'model__subsample': 0.2534723060198575, 'model__colsample_bytree': 0.7580707305168117}. Best is trial 1 with value: 0.7060492451109535.
[I 2024-03-14 21:23:25,416] Trial 2 finished with value: 0.6868983686290404 and parameters: {'selector__k': 20, 'model__learning_rate': 0.4827388984838141, 'model__n_estimators': 435, 'model__max_dept

[I 2024-03-14 21:25:30,511] Trial 21 finished with value: 0.6906474820143885 and parameters: {'selector__k': 22, 'model__learning_rate': 0.3864022191336065, 'model__n_estimators': 245, 'model__max_depth': 10, 'model__num_leaves': 37, 'model__min_child_samples': 7, 'model__subsample': 0.9997659924954753, 'model__colsample_bytree': 0.4499105813870309}. Best is trial 12 with value: 0.710001013273888.
[I 2024-03-14 21:25:32,358] Trial 22 finished with value: 0.7035160603911237 and parameters: {'selector__k': 22, 'model__learning_rate': 0.31069736459051833, 'model__n_estimators': 137, 'model__max_depth': 5, 'model__num_leaves': 50, 'model__min_child_samples': 6, 'model__subsample': 0.8786810271093474, 'model__colsample_bytree': 0.6592306146324225}. Best is trial 12 with value: 0.710001013273888.
[I 2024-03-14 21:25:56,560] Trial 23 finished with value: 0.6788935049143784 and parameters: {'selector__k': 13, 'model__learning_rate': 0.5388881441703545, 'model__n_estimators': 2023, 'model__max_

[I 2024-03-14 21:26:30,221] Trial 42 finished with value: 0.7008815482825007 and parameters: {'selector__k': 3, 'model__learning_rate': 0.15472692911617983, 'model__n_estimators': 186, 'model__max_depth': 32, 'model__num_leaves': 26, 'model__min_child_samples': 2, 'model__subsample': 0.7426073128024966, 'model__colsample_bytree': 0.9180622265957755}. Best is trial 30 with value: 0.7115209241057858.
[I 2024-03-14 21:26:31,209] Trial 43 finished with value: 0.7039213699462965 and parameters: {'selector__k': 5, 'model__learning_rate': 0.08655204429725116, 'model__n_estimators': 265, 'model__max_depth': 23, 'model__num_leaves': 20, 'model__min_child_samples': 4, 'model__subsample': 0.5651832738077481, 'model__colsample_bytree': 0.8578200914415658}. Best is trial 30 with value: 0.7115209241057858.
[I 2024-03-14 21:26:31,861] Trial 44 finished with value: 0.7118249062721653 and parameters: {'selector__k': 2, 'model__learning_rate': 0.11703445382350844, 'model__n_estimators': 169, 'model__max

LGBM 0.7118249062721653 0.6855859534503879


[I 2024-03-14 21:26:52,072] Trial 0 finished with value: 0.6957138514540481 and parameters: {'selector__k': 24, 'model__max_features': 'log2', 'model__n_estimators': 120, 'model__max_depth': 179, 'model__min_samples_leaf': 8}. Best is trial 0 with value: 0.6957138514540481.
[I 2024-03-14 21:27:02,034] Trial 1 finished with value: 0.7109129597730266 and parameters: {'selector__k': 10, 'model__max_features': 'sqrt', 'model__n_estimators': 129, 'model__max_depth': 13, 'model__min_samples_leaf': 12}. Best is trial 1 with value: 0.7109129597730266.
[I 2024-03-14 21:27:31,076] Trial 2 finished with value: 0.7044280068902624 and parameters: {'selector__k': 20, 'model__max_features': 'log2', 'model__n_estimators': 365, 'model__max_depth': 37, 'model__min_samples_leaf': 17}. Best is trial 1 with value: 0.7109129597730266.
[I 2024-03-14 21:28:03,525] Trial 3 finished with value: 0.7080757928868173 and parameters: {'selector__k': 9, 'model__max_features': 'sqrt', 'model__n_estimators': 414, 'mode

[I 2024-03-14 22:17:54,353] Trial 30 finished with value: 0.698855000506637 and parameters: {'selector__k': 5, 'model__max_features': 'sqrt', 'model__n_estimators': 2240, 'model__max_depth': 85, 'model__min_samples_leaf': 4}. Best is trial 13 with value: 0.712736852771304.
[I 2024-03-14 22:18:41,693] Trial 31 finished with value: 0.7117235788833721 and parameters: {'selector__k': 2, 'model__max_features': 'sqrt', 'model__n_estimators': 2787, 'model__max_depth': 49, 'model__min_samples_leaf': 6}. Best is trial 13 with value: 0.712736852771304.
[I 2024-03-14 22:19:32,875] Trial 32 finished with value: 0.7117235788833721 and parameters: {'selector__k': 2, 'model__max_features': 'sqrt', 'model__n_estimators': 2967, 'model__max_depth': 17, 'model__min_samples_leaf': 7}. Best is trial 13 with value: 0.712736852771304.
[I 2024-03-14 22:22:06,948] Trial 33 finished with value: 0.7032120782247442 and parameters: {'selector__k': 5, 'model__max_features': 'sqrt', 'model__n_estimators': 2250, 'mod

RF 0.712736852771304 0.6893630053082891


In [17]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [18]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [23]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 37)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-15 00:16:17,658] A new study created in memory with name: no-name-d0f79910-4ff5-4393-b441-95e50c972486
[I 2024-03-15 00:16:26,092] Trial 0 finished with value: -5.260215105382335 and parameters: {'selector__k': 12, 'model__hidden_layer_sizes': (50,), 'model__activation': 'tanh', 'model__solver': 'adam', 'model__alpha': 6.13462963500084}. Best is trial 0 with value: -5.260215105382335.
[I 2024-03-15 00:17:01,674] Trial 1 finished with value: -6.810613995081397 and parameters: {'selector__k': 30, 'model__hidden_layer_sizes': (50,), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 96768.01236500457}. Best is trial 0 with value: -5.260215105382335.
[I 2024-03-15 00:18:30,223] Trial 2 finished with value: -5.76107624665674 and parameters: {'selector__k': 7, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 2.1144697235431072e-05}. Best is trial 0 with value: -5.260215105382335.
[W 2024-03-15

[I 2024-03-15 00:23:50,243] Trial 28 finished with value: -4.717047870833634 and parameters: {'selector__k': 15, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'identity', 'model__solver': 'adam', 'model__alpha': 0.10743472223655151}. Best is trial 25 with value: -4.420464848758526.
[W 2024-03-15 00:24:28,067] Trial 29 failed with parameters: {'selector__k': 21, 'model__hidden_layer_sizes': (100,), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 0.011893100846398587} because of the following error: The value nan is not acceptable.
[W 2024-03-15 00:24:28,069] Trial 29 failed with value nan.
[I 2024-03-15 00:24:32,801] Trial 30 finished with value: -5.588588843688757 and parameters: {'selector__k': 9, 'model__hidden_layer_sizes': (100,), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 0.1958233981265065}. Best is trial 25 with value: -4.420464848758526.
[I 2024-03-15 00:25:28,672] Trial 31 finished with value: -6.2897247200

ANN -4.40190924000081 3.245077841232823 25.991858087575924 21.0


[I 2024-03-15 00:29:55,419] Trial 1 finished with value: -5.4877521446532835 and parameters: {'selector__k': 37}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:55,582] Trial 2 finished with value: -5.47971278040683 and parameters: {'selector__k': 13}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:55,725] Trial 3 finished with value: -5.474851905479228 and parameters: {'selector__k': 19}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:55,842] Trial 4 finished with value: -5.447051155670018 and parameters: {'selector__k': 5}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:55,980] Trial 5 finished with value: -5.4774481816548 and parameters: {'selector__k': 26}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:56,330] Trial 6 finished with value: -5.487927329323021 and parameters: {'selector__k': 34}. Best is trial 0 with value: -5.362830166174571.
[I 2024-03-15 00:29:56,539] Trial 7 f

Linear -5.362830166174571 3.4002864102743855 27.69021782680158 21.0


[I 2024-03-15 00:30:03,296] Trial 0 finished with value: -5.440666512070388 and parameters: {'selector__k': 4}. Best is trial 0 with value: -5.440666512070388.
[I 2024-03-15 00:30:04,010] Trial 1 finished with value: -5.425704570522889 and parameters: {'selector__k': 13}. Best is trial 1 with value: -5.425704570522889.
[I 2024-03-15 00:30:05,181] Trial 2 finished with value: -5.428613697834659 and parameters: {'selector__k': 36}. Best is trial 1 with value: -5.425704570522889.
[I 2024-03-15 00:30:06,121] Trial 3 finished with value: -5.4243111991249044 and parameters: {'selector__k': 24}. Best is trial 3 with value: -5.4243111991249044.
[I 2024-03-15 00:30:06,777] Trial 4 finished with value: -5.425736123843219 and parameters: {'selector__k': 12}. Best is trial 3 with value: -5.4243111991249044.
[I 2024-03-15 00:30:07,849] Trial 5 finished with value: -5.4249375683392485 and parameters: {'selector__k': 30}. Best is trial 3 with value: -5.4243111991249044.
[I 2024-03-15 00:30:08,244] Tr

Lasso -5.361830270269343 3.398452757692297 27.65596645952652 21.0


[I 2024-03-15 00:30:36,141] Trial 0 finished with value: -5.484590321705437 and parameters: {'selector__k': 33}. Best is trial 0 with value: -5.484590321705437.
[I 2024-03-15 00:30:36,363] Trial 1 finished with value: -5.476536383656789 and parameters: {'selector__k': 27}. Best is trial 1 with value: -5.476536383656789.
[I 2024-03-15 00:30:36,490] Trial 2 finished with value: -5.362719690555998 and parameters: {'selector__k': 2}. Best is trial 2 with value: -5.362719690555998.
[I 2024-03-15 00:30:36,632] Trial 3 finished with value: -5.476632772685796 and parameters: {'selector__k': 15}. Best is trial 2 with value: -5.362719690555998.
[I 2024-03-15 00:30:36,811] Trial 4 finished with value: -5.476536383656789 and parameters: {'selector__k': 27}. Best is trial 2 with value: -5.362719690555998.
[I 2024-03-15 00:30:37,278] Trial 5 finished with value: -5.477058257223575 and parameters: {'selector__k': 30}. Best is trial 2 with value: -5.362719690555998.
[I 2024-03-15 00:30:37,466] Trial 6

Ridge -5.362719690555998 3.4002155303041373 27.688899738662137 21.0


In [None]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_boreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 36)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_boreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-15 00:39:07,307] A new study created in memory with name: no-name-c8aa08e8-64fc-4096-9c6e-099ff7822e38
[I 2024-03-15 00:39:08,915] Trial 0 finished with value: -4.5452362068285685 and parameters: {'selector__k': 30, 'model__n_neighbors': 49, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: -4.5452362068285685.
[I 2024-03-15 00:39:09,442] Trial 1 finished with value: -3.9327053779036016 and parameters: {'selector__k': 11, 'model__n_neighbors': 4, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 1 with value: -3.9327053779036016.
[I 2024-03-15 00:39:10,838] Trial 2 finished with value: -4.546725608441768 and parameters: {'selector__k': 31, 'model__n_neighbors': 40, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -3.9327053779036016.
[I 2024-03-15 00:39:12,956] Trial 3 finished with value: -4.564802285098524 and parameters: {'selector__k': 34, 'model__n_neighbors': 16, 'mo

[I 2024-03-15 00:39:51,863] Trial 33 finished with value: -3.472539199686101 and parameters: {'selector__k': 11, 'model__n_neighbors': 47, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 30 with value: -3.435955700527981.
[I 2024-03-15 00:39:52,872] Trial 34 finished with value: -3.6238371843459145 and parameters: {'selector__k': 9, 'model__n_neighbors': 40, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 30 with value: -3.435955700527981.
[I 2024-03-15 00:39:53,620] Trial 35 finished with value: -4.339750734623569 and parameters: {'selector__k': 13, 'model__n_neighbors': 1, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 30 with value: -3.435955700527981.
[I 2024-03-15 00:39:54,240] Trial 36 finished with value: -3.4609167326013326 and parameters: {'selector__k': 5, 'model__n_neighbors': 47, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 30 with value: -3.435955700527981.
[I 2024-03

KNN -3.429480359374472 3.316206870151082 24.977551734197064 30.625


[I 2024-03-15 00:40:14,145] Trial 0 finished with value: -3.4122327651460185 and parameters: {'selector__k': 30, 'model__learning_rate': 0.1512784480273178, 'model__n_estimators': 166, 'model__max_depth': 5, 'model__num_leaves': 24, 'model__min_child_samples': 15, 'model__subsample': 0.28311552883477675, 'model__colsample_bytree': 0.2568840001886151}. Best is trial 0 with value: -3.4122327651460185.
[I 2024-03-15 00:40:14,834] Trial 1 finished with value: -3.5972237139052017 and parameters: {'selector__k': 24, 'model__learning_rate': 0.39730707163609286, 'model__n_estimators': 105, 'model__max_depth': 50, 'model__num_leaves': 27, 'model__min_child_samples': 19, 'model__subsample': 0.724562558815109, 'model__colsample_bytree': 0.4394180641033105}. Best is trial 0 with value: -3.4122327651460185.
[I 2024-03-15 00:40:15,307] Trial 2 finished with value: -3.8388879740965627 and parameters: {'selector__k': 12, 'model__learning_rate': 0.8597451690763099, 'model__n_estimators': 87, 'model__ma

[I 2024-03-15 00:40:48,344] Trial 21 finished with value: -3.4714264455708204 and parameters: {'selector__k': 25, 'model__learning_rate': 0.24914257613740665, 'model__n_estimators': 57, 'model__max_depth': 1, 'model__num_leaves': 22, 'model__min_child_samples': 13, 'model__subsample': 0.42660998842711684, 'model__colsample_bytree': 0.23992974971450798}. Best is trial 0 with value: -3.4122327651460185.
[I 2024-03-15 00:40:49,034] Trial 22 finished with value: -3.4114936837962175 and parameters: {'selector__k': 32, 'model__learning_rate': 0.15587832827089942, 'model__n_estimators': 76, 'model__max_depth': 10, 'model__num_leaves': 18, 'model__min_child_samples': 11, 'model__subsample': 0.35902348281609586, 'model__colsample_bytree': 0.3157582754034015}. Best is trial 22 with value: -3.4114936837962175.
[I 2024-03-15 00:40:49,868] Trial 23 finished with value: -3.4064131296862636 and parameters: {'selector__k': 34, 'model__learning_rate': 0.11286334487905025, 'model__n_estimators': 132, 'm

[I 2024-03-15 00:41:05,659] Trial 42 finished with value: -3.450400274549801 and parameters: {'selector__k': 31, 'model__learning_rate': 0.07541526847841692, 'model__n_estimators': 60, 'model__max_depth': 21, 'model__num_leaves': 16, 'model__min_child_samples': 10, 'model__subsample': 0.2398130804839967, 'model__colsample_bytree': 0.45429202057620016}. Best is trial 41 with value: -3.367254196899682.
[I 2024-03-15 00:41:06,381] Trial 43 finished with value: -3.9190428420253287 and parameters: {'selector__k': 27, 'model__learning_rate': 0.013193679247360798, 'model__n_estimators': 111, 'model__max_depth': 25, 'model__num_leaves': 7, 'model__min_child_samples': 12, 'model__subsample': 0.15395717757725663, 'model__colsample_bytree': 0.5610685303253584}. Best is trial 41 with value: -3.367254196899682.
[I 2024-03-15 00:41:07,305] Trial 44 finished with value: -3.3386341745689347 and parameters: {'selector__k': 34, 'model__learning_rate': 0.14563521950104907, 'model__n_estimators': 128, 'mo

LGBM -3.3386341745689347 3.1333943414874694 22.44414427957755 25.391741527079848


[I 2024-03-15 00:42:25,241] Trial 0 finished with value: -3.44462604705219 and parameters: {'selector__k': 9, 'model__max_features': 'log2', 'model__n_estimators': 1021, 'model__max_depth': 169, 'model__min_samples_leaf': 20}. Best is trial 0 with value: -3.44462604705219.
[I 2024-03-15 00:42:33,092] Trial 1 finished with value: -3.439324014915234 and parameters: {'selector__k': 7, 'model__max_features': 'sqrt', 'model__n_estimators': 129, 'model__max_depth': 124, 'model__min_samples_leaf': 12}. Best is trial 1 with value: -3.439324014915234.
[I 2024-03-15 00:44:09,146] Trial 2 finished with value: -3.413622856511995 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 1034, 'model__max_depth': 17, 'model__min_samples_leaf': 7}. Best is trial 2 with value: -3.413622856511995.
[I 2024-03-15 00:44:23,964] Trial 3 finished with value: -3.524597223242518 and parameters: {'selector__k': 32, 'model__max_features': 'log2', 'model__n_estimators': 105, 'mode

[I 2024-03-15 01:20:45,107] Trial 30 finished with value: -3.4071153990199723 and parameters: {'selector__k': 12, 'model__max_features': 'sqrt', 'model__n_estimators': 1378, 'model__max_depth': 50, 'model__min_samples_leaf': 6}. Best is trial 27 with value: -3.345604993577058.
[I 2024-03-15 01:21:04,544] Trial 31 finished with value: -3.3441508518594985 and parameters: {'selector__k': 20, 'model__max_features': 'sqrt', 'model__n_estimators': 134, 'model__max_depth': 24, 'model__min_samples_leaf': 5}. Best is trial 31 with value: -3.3441508518594985.
[I 2024-03-15 01:21:15,512] Trial 32 finished with value: -3.420229265495136 and parameters: {'selector__k': 14, 'model__max_features': 'sqrt', 'model__n_estimators': 91, 'model__max_depth': 78, 'model__min_samples_leaf': 7}. Best is trial 31 with value: -3.3441508518594985.
[I 2024-03-15 01:21:22,400] Trial 33 finished with value: -3.39389769383503 and parameters: {'selector__k': 19, 'model__max_features': 'sqrt', 'model__n_estimators': 67

In [None]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    param_grid = param_space[model_name]
    # BayesSearchCV
    opt = OptunaSearchCV(
        model,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)