In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

In [2]:
def load_data_small(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)

            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']
        
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [7]:
numerical_features = ['link_id', 'link_from', 'link_to', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'length_per_capacity_ratio', 'speed_capacity_ratio', 'length_times_lanes', 'speed_times_capacity', 
                      'length_times', 'capacity_divided_by_lanes'
                     ]
scaler = StandardScaler()
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
    # 'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
    'GB': GradientBoostingClassifier(random_state=101),
    'ANN': MLPClassifier(random_state=101),
    # 'SVR': SVC(),
}

model_space = {
    # 'KNN': KNeighborsRegressor(),
    # 'XGB': xgb.XGBRegressor(random_state=101),
    # 'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    # 'RF': RandomForestRegressor(random_state=101),
    # 'GB': GradientBoostingRegressor(random_state=101),
    # 'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso()
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'identity', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
'Linear':{
    'fit_intercept' :optuna.distributions.CategoricalDistribution([True, False])
},
'Lasso':{
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True)
},
'Ridge':{
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True)
}
}

In [4]:
df_train = []
for i in range(0, 10):
    small_train_files = f'Data/smallWorlds/Train/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Train/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Train/s/df_links_network_{i}.pkl")
    small_train_data = load_data_small([small_train_files], small_df_activities, small_df_links_network)
    df_train.append(small_train_data)
train_data = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    small_validate_files = f'Data/smallWorlds/Validate/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_links_network_{i}.pkl")
    small_validate_data = load_data_small([small_validate_files], small_df_activities, small_df_links_network)
    df_validate.append(small_validate_data)
validate_data = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    small_test_files = f'Data/smallWorlds/Test/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Test/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Test/s/df_links_network_{i}.pkl")
    small_test_data = load_data_small([small_test_files], small_df_activities, small_df_links_network)
    df_test.append(small_test_data)
test_data = pd.concat(df_test, ignore_index=True)

train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

Big_data_tr = ct.fit_transform(Big_data)


In [5]:
cluster = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_tr['x_y_coor'] = cluster.fit_predict(Big_data_tr[['num_preprocess__start_node_x', 'num_preprocess__start_node_y',
                                                           'num_preprocess__end_node_x', 'num_preprocess__end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_tr['similar_link'] = cluster1.fit_predict(Big_data_tr[['num_preprocess__link_length', 'num_preprocess__link_freespeed',
                                                           'num_preprocess__link_capacity', 'num_preprocess__link_permlanes']])
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0

In [6]:
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [53]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 21)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-02-06 01:01:12,925] A new study created in memory with name: no-name-cbba016c-8c90-4f54-aec9-25636c8ff06e
[I 2024-02-06 01:01:13,701] Trial 0 finished with value: 0.9218543046357616 and parameters: {'selector__k': 9, 'model__n_neighbors': 1, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 0.9218543046357616.
[I 2024-02-06 01:01:15,231] Trial 1 finished with value: 0.9556291390728476 and parameters: {'selector__k': 6, 'model__n_neighbors': 10, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 1 with value: 0.9556291390728476.
[I 2024-02-06 01:01:16,321] Trial 2 finished with value: 0.9589403973509933 and parameters: {'selector__k': 20, 'model__n_neighbors': 33, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 2 with value: 0.9589403973509933.
[I 2024-02-06 01:01:17,071] Trial 3 finished with value: 0.9589403973509933 and parameters: {'selector__k': 21, 'model__n_neighbors': 47, 'model__

KNN 0.9589403973509933 0.9588313413014609


[I 2024-02-06 01:01:54,312] Trial 0 finished with value: 0.8986754966887417 and parameters: {'selector__k': 9, 'model__learning_rate': 0.739838547821738, 'model__n_estimators': 2350, 'model__max_depth': 44, 'model__num_leaves': 14, 'model__min_child_samples': 10, 'model__subsample': 0.5597256401483633, 'model__colsample_bytree': 0.876586041329461}. Best is trial 0 with value: 0.8986754966887417.
[I 2024-02-06 01:01:55,827] Trial 1 finished with value: 0.9509933774834437 and parameters: {'selector__k': 8, 'model__learning_rate': 0.6232468546849578, 'model__n_estimators': 488, 'model__max_depth': 22, 'model__num_leaves': 47, 'model__min_child_samples': 15, 'model__subsample': 0.9892126169120675, 'model__colsample_bytree': 0.4550902434784765}. Best is trial 1 with value: 0.9509933774834437.
[I 2024-02-06 01:01:58,898] Trial 2 finished with value: 0.9483443708609272 and parameters: {'selector__k': 7, 'model__learning_rate': 0.6467362140725984, 'model__n_estimators': 2847, 'model__max_depth

LGBM 0.9589403973509933 0.9588313413014609


[I 2024-02-06 01:03:38,353] Trial 0 finished with value: 0.9589403973509933 and parameters: {'selector__k': 21, 'model__max_features': 'log2', 'model__n_estimators': 1514, 'model__max_depth': 4, 'model__min_samples_leaf': 15}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-02-06 01:03:42,327] Trial 1 finished with value: 0.9589403973509933 and parameters: {'selector__k': 15, 'model__max_features': 'sqrt', 'model__n_estimators': 162, 'model__max_depth': 196, 'model__min_samples_leaf': 12}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-02-06 01:03:44,867] Trial 2 finished with value: 0.9582781456953643 and parameters: {'selector__k': 5, 'model__max_features': 'sqrt', 'model__n_estimators': 123, 'model__max_depth': 18, 'model__min_samples_leaf': 5}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-02-06 01:04:23,974] Trial 3 finished with value: 0.9589403973509933 and parameters: {'selector__k': 9, 'model__max_features': 'sqrt', 'model__n_estimators': 1983, 'mode

RF 0.9589403973509933 0.9588313413014609


[I 2024-02-06 01:15:57,532] Trial 0 finished with value: 0.9556291390728476 and parameters: {'selector__k': 18, 'model__learning_rate': 0.3978928183829663, 'model__n_estimators': 92, 'model__max_depth': 83, 'model__min_samples_split': 3, 'model__min_samples_leaf': 6, 'model__subsample': 0.979340716822595}. Best is trial 0 with value: 0.9556291390728476.
[I 2024-02-06 01:16:24,381] Trial 1 finished with value: 0.9463576158940398 and parameters: {'selector__k': 15, 'model__learning_rate': 0.5765018078112646, 'model__n_estimators': 622, 'model__max_depth': 37, 'model__min_samples_split': 2, 'model__min_samples_leaf': 3, 'model__subsample': 0.8994782624458738}. Best is trial 0 with value: 0.9556291390728476.
[I 2024-02-06 01:16:43,311] Trial 2 finished with value: 0.9483443708609272 and parameters: {'selector__k': 11, 'model__learning_rate': 0.0196215568787918, 'model__n_estimators': 386, 'model__max_depth': 199, 'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__subsamp

KeyboardInterrupt: 

In [None]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [None]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [None]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 38)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

In [8]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_boreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 21)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_boreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-06 08:37:48,236] A new study created in memory with name: no-name-f60afbf8-3665-4aa7-bcb5-1df955dbee4d
[I 2024-02-06 08:37:48,327] Trial 0 finished with value: -2.360643721080169 and parameters: {'selector__k': 13, 'model__fit_intercept': False}. Best is trial 0 with value: -2.360643721080169.
[I 2024-02-06 08:37:48,413] Trial 1 finished with value: -1.7700368069815178 and parameters: {'selector__k': 10, 'model__fit_intercept': True}. Best is trial 1 with value: -1.7700368069815178.
[I 2024-02-06 08:37:48,489] Trial 2 finished with value: -1.770104960101363 and parameters: {'selector__k': 12, 'model__fit_intercept': True}. Best is trial 1 with value: -1.7700368069815178.
[I 2024-02-06 08:37:48,563] Trial 3 finished with value: -1.7690216725428412 and parameters: {'selector__k': 6, 'model__fit_intercept': True}. Best is trial 3 with value: -1.7690216725428412.
[I 2024-02-06 08:37:48,629] Trial 4 finished with value: -3.814892119564364 and parameters: {'selector__k': 2, 'model

Linear -1.7489385485942837 1.685164882657735 4.746705426679251 10.189859685546253


[I 2024-02-06 08:37:52,069] Trial 2 finished with value: -1.7691820794936124 and parameters: {'selector__k': 10, 'model__alpha': 197.83479357148977}. Best is trial 0 with value: -1.7685037391892102.
[I 2024-02-06 08:37:52,165] Trial 3 finished with value: -1.7700367647548996 and parameters: {'selector__k': 10, 'model__alpha': 0.00489314249165299}. Best is trial 0 with value: -1.7685037391892102.
[I 2024-02-06 08:37:52,248] Trial 4 finished with value: -1.7685985403502271 and parameters: {'selector__k': 21, 'model__alpha': 0.6013086070158336}. Best is trial 0 with value: -1.7685037391892102.
[I 2024-02-06 08:37:52,333] Trial 5 finished with value: -1.8554719619941145 and parameters: {'selector__k': 5, 'model__alpha': 28138.460973255485}. Best is trial 0 with value: -1.7685037391892102.
[I 2024-02-06 08:37:52,406] Trial 6 finished with value: -1.7683699131799802 and parameters: {'selector__k': 9, 'model__alpha': 0.5698808787245334}. Best is trial 6 with value: -1.7683699131799802.
[I 202

Ridge -1.7489227409235444 1.6851809790386025 4.746709832369533 10.18928605063909


[I 2024-02-06 08:37:55,798] Trial 4 finished with value: -1.7689911686815487 and parameters: {'selector__k': 6, 'model__alpha': 0.00022726959146899707}. Best is trial 2 with value: -1.758532587993927.
[I 2024-02-06 08:37:55,852] Trial 5 finished with value: -1.7648370168166903 and parameters: {'selector__k': 15, 'model__alpha': 0.021116119466095373}. Best is trial 2 with value: -1.758532587993927.
[I 2024-02-06 08:37:55,903] Trial 6 finished with value: -1.8829342180567925 and parameters: {'selector__k': 4, 'model__alpha': 3.209754361184745}. Best is trial 2 with value: -1.758532587993927.
[I 2024-02-06 08:37:55,956] Trial 7 finished with value: -1.749459707789037 and parameters: {'selector__k': 3, 'model__alpha': 0.001423927746881681}. Best is trial 7 with value: -1.749459707789037.
[I 2024-02-06 08:37:56,013] Trial 8 finished with value: -1.7686010975110544 and parameters: {'selector__k': 18, 'model__alpha': 1.4854470283665042e-05}. Best is trial 7 with value: -1.749459707789037.
[I 

Lasso -1.7479602704787343 1.6839650977953247 4.727313426082527 10.132449097888035


In [54]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    param_grid = param_space[model_name]
    # BayesSearchCV
    opt = OptunaSearchCV(
        model,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-06 01:55:58,848] A new study created in memory with name: no-name-838402af-29bd-451c-842b-e447ec36fa97
[I 2024-02-06 01:55:58,975] Trial 0 finished with value: -1.8872793118258315 and parameters: {'n_neighbors': 32, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 0 with value: -1.8872793118258315.
[I 2024-02-06 01:55:59,092] Trial 1 finished with value: -1.8831058251115014 and parameters: {'n_neighbors': 49, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 1 with value: -1.8831058251115014.
[I 2024-02-06 01:55:59,291] Trial 2 finished with value: -2.022384105960265 and parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 1 with value: -1.8831058251115014.
[I 2024-02-06 01:55:59,478] Trial 3 finished with value: -2.095080772548452 and parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 1 with value: -1.8831058251115014.
[I 2024-02-06 01:55:59,622] Trial 4 finished with value: -1.883

KNN -1.8774183752706524 1.7897360332756482 5.163995011429884 9.319485213687773


[I 2024-02-06 01:56:07,540] Trial 0 finished with value: -2.1860443072831526 and parameters: {'learning_rate': 0.6485120723678065, 'n_estimators': 252, 'max_depth': 19, 'num_leaves': 40, 'min_child_samples': 7, 'subsample': 0.6759240814800025, 'colsample_bytree': 0.4666557571498413}. Best is trial 0 with value: -2.1860443072831526.
[I 2024-02-06 01:56:08,222] Trial 1 finished with value: -2.1251226188839585 and parameters: {'learning_rate': 0.6448363112836321, 'n_estimators': 90, 'max_depth': 15, 'num_leaves': 50, 'min_child_samples': 20, 'subsample': 0.424458149533702, 'colsample_bytree': 0.787174647761704}. Best is trial 1 with value: -2.1251226188839585.
[I 2024-02-06 01:56:08,559] Trial 2 finished with value: -1.9216748983469842 and parameters: {'learning_rate': 0.3436791344446932, 'n_estimators': 129, 'max_depth': 5, 'num_leaves': 17, 'min_child_samples': 1, 'subsample': 0.7716328260590618, 'colsample_bytree': 0.5324976819533966}. Best is trial 2 with value: -1.9216748983469842.
[

LGBM -1.7182491902775794 1.676314101171994 4.589454662266262 8.557247856093824


[I 2024-02-06 01:57:11,977] Trial 0 finished with value: -1.7907903652330737 and parameters: {'max_features': 'sqrt', 'n_estimators': 197, 'max_depth': 43, 'min_samples_leaf': 4}. Best is trial 0 with value: -1.7907903652330737.
[I 2024-02-06 01:57:16,442] Trial 1 finished with value: -1.7880905776285057 and parameters: {'max_features': 'log2', 'n_estimators': 176, 'max_depth': 87, 'min_samples_leaf': 11}. Best is trial 1 with value: -1.7880905776285057.
[I 2024-02-06 01:57:56,821] Trial 2 finished with value: -1.787873031789269 and parameters: {'max_features': 'sqrt', 'n_estimators': 1879, 'max_depth': 47, 'min_samples_leaf': 20}. Best is trial 2 with value: -1.787873031789269.
[I 2024-02-06 01:58:02,755] Trial 3 finished with value: -1.7859444269809255 and parameters: {'max_features': 'sqrt', 'n_estimators': 286, 'max_depth': 118, 'min_samples_leaf': 19}. Best is trial 3 with value: -1.7859444269809255.
[I 2024-02-06 01:58:26,475] Trial 4 finished with value: -1.8045220270659375 and 

RF -1.7810428327108048 1.7164026975262014 4.729798207092285 8.893366471388576


[I 2024-02-06 02:04:50,045] Trial 0 finished with value: -1.8699321652989827 and parameters: {'learning_rate': 0.12123682213311732, 'n_estimators': 278, 'max_depth': 118, 'min_samples_split': 7, 'min_samples_leaf': 6, 'subsample': 0.8264572067294448}. Best is trial 0 with value: -1.8699321652989827.
[I 2024-02-06 02:05:08,434] Trial 1 finished with value: -2.3871050126162747 and parameters: {'learning_rate': 0.43904088335296426, 'n_estimators': 148, 'max_depth': 180, 'min_samples_split': 2, 'min_samples_leaf': 6, 'subsample': 0.41067954854872146}. Best is trial 0 with value: -1.8699321652989827.
[I 2024-02-06 02:08:42,124] Trial 2 finished with value: -2.5357854522884598 and parameters: {'learning_rate': 0.8727743621139512, 'n_estimators': 581, 'max_depth': 89, 'min_samples_split': 3, 'min_samples_leaf': 9, 'subsample': 0.7749229936556579}. Best is trial 0 with value: -1.8699321652989827.
[I 2024-02-06 02:08:56,346] Trial 3 finished with value: -3.4873337924281014 and parameters: {'lea

GB -1.7350785372363116 1.6892258707426593 4.6701950206468315 8.814597161161483


[I 2024-02-06 02:59:00,371] Trial 0 finished with value: -1.7806242829779526 and parameters: {'hidden_layer_sizes': (30, 30, 30), 'activation': 'logistic', 'solver': 'adam', 'alpha': 0.0017504200940275342}. Best is trial 0 with value: -1.7806242829779526.
[I 2024-02-06 02:59:14,725] Trial 1 finished with value: -1.782868775046666 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'logistic', 'solver': 'adam', 'alpha': 0.016202279663347224}. Best is trial 0 with value: -1.7806242829779526.
[I 2024-02-06 02:59:16,414] Trial 2 finished with value: -1.8507283308934552 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.04266994952201609}. Best is trial 0 with value: -1.7806242829779526.
[W 2024-02-06 02:59:20,177] Trial 3 failed with parameters: {'hidden_layer_sizes': (30, 30, 30), 'activation': 'identity', 'solver': 'sgd', 'alpha': 0.4288056183319254} because of the following error: The value nan is not acceptable.
[W 2024-02-06 0

ANN -1.7582600848450984 1.7951203440844383 5.030628811593716 9.86888079872566


[I 2024-02-06 03:02:37,659] Trial 0 finished with value: -1.7566466950291986 and parameters: {'C': 0.21093599882793507, 'gamma': 'auto', 'kernel': 'linear'}. Best is trial 0 with value: -1.7566466950291986.
[I 2024-02-06 03:02:38,797] Trial 1 finished with value: -1.8526556205242126 and parameters: {'C': 0.0023180221754788943, 'gamma': 'scale', 'kernel': 'poly'}. Best is trial 0 with value: -1.7566466950291986.
[I 2024-02-06 03:02:40,443] Trial 2 finished with value: -1.7817441303483261 and parameters: {'C': 10.535460238076015, 'gamma': 'scale', 'kernel': 'poly'}. Best is trial 0 with value: -1.7566466950291986.
[I 2024-02-06 03:02:42,723] Trial 3 finished with value: -1.8529805031674276 and parameters: {'C': 3.41400167534395e-05, 'gamma': 'auto', 'kernel': 'rbf'}. Best is trial 0 with value: -1.7566466950291986.
[I 2024-02-06 03:02:45,045] Trial 4 finished with value: -1.7593453395036898 and parameters: {'C': 11.180333903379903, 'gamma': 'scale', 'kernel': 'rbf'}. Best is trial 0 with

SVR -1.750055210769758 1.686332394659695 4.922985236415432 10.36187688180884


[I 2024-02-06 05:54:06,381] Trial 4 finished with value: -2.142527091388352 and parameters: {'fit_intercept': False}. Best is trial 3 with value: -1.7686043161249452.
[I 2024-02-06 05:54:06,422] Trial 5 finished with value: -2.142527091388352 and parameters: {'fit_intercept': False}. Best is trial 3 with value: -1.7686043161249452.
[I 2024-02-06 05:54:06,463] Trial 6 finished with value: -2.142527091388352 and parameters: {'fit_intercept': False}. Best is trial 3 with value: -1.7686043161249452.
[I 2024-02-06 05:54:06,503] Trial 7 finished with value: -2.142527091388352 and parameters: {'fit_intercept': False}. Best is trial 3 with value: -1.7686043161249452.
[I 2024-02-06 05:54:06,547] Trial 8 finished with value: -2.142527091388352 and parameters: {'fit_intercept': False}. Best is trial 3 with value: -1.7686043161249452.
[I 2024-02-06 05:54:06,591] Trial 9 finished with value: -1.7686043161249452 and parameters: {'fit_intercept': True}. Best is trial 3 with value: -1.7686043161249452

Linear -1.7686043161249452 1.7020477439274455 4.785091388584241 10.013508237586427


[I 2024-02-06 05:54:08,647] Trial 4 finished with value: -1.7679329361118306 and parameters: {'alpha': 118.5128379949208}. Best is trial 4 with value: -1.7679329361118306.
[I 2024-02-06 05:54:08,694] Trial 5 finished with value: -1.7679409459117112 and parameters: {'alpha': 116.24872156426682}. Best is trial 4 with value: -1.7679329361118306.
[I 2024-02-06 05:54:08,736] Trial 6 finished with value: -1.7685299903094316 and parameters: {'alpha': 8.142735546681672}. Best is trial 4 with value: -1.7679329361118306.
[I 2024-02-06 05:54:08,776] Trial 7 finished with value: -1.7685942501323044 and parameters: {'alpha': 1.0480508095325598}. Best is trial 4 with value: -1.7679329361118306.
[I 2024-02-06 05:54:08,818] Trial 8 finished with value: -1.7685947705163467 and parameters: {'alpha': 0.9938587702640002}. Best is trial 4 with value: -1.7679329361118306.
[I 2024-02-06 05:54:08,859] Trial 9 finished with value: -1.8494094054327683 and parameters: {'alpha': 20510.11190216315}. Best is trial 

Ridge -1.7677859407197443 1.7020801925100342 4.780295143017336 9.957157005496166


[I 2024-02-06 05:54:10,964] Trial 5 finished with value: -1.768588565589957 and parameters: {'alpha': 7.241961031863516e-05}. Best is trial 0 with value: -1.7585516408247044.
[I 2024-02-06 05:54:11,011] Trial 6 finished with value: -1.758470850869289 and parameters: {'alpha': 0.13329616925319518}. Best is trial 6 with value: -1.758470850869289.
[I 2024-02-06 05:54:11,054] Trial 7 finished with value: -1.8829342180567925 and parameters: {'alpha': 68.39426439177944}. Best is trial 6 with value: -1.758470850869289.
[I 2024-02-06 05:54:11,098] Trial 8 finished with value: -1.8829342180567925 and parameters: {'alpha': 6604.3261512985655}. Best is trial 6 with value: -1.758470850869289.
[I 2024-02-06 05:54:11,140] Trial 9 finished with value: -1.8829342180567925 and parameters: {'alpha': 29083.838606843427}. Best is trial 6 with value: -1.758470850869289.
[I 2024-02-06 05:54:11,192] Trial 10 finished with value: -1.7685956710203454 and parameters: {'alpha': 3.98089867404556e-05}. Best is tri

Lasso -1.758470850869289 1.6889798232467081 4.714385021648571 9.851376150071866
