In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

  from pandas.core import (


In [32]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'link_length', 'link_freespeed', 
                      'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 
                      'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes',
                      'income', 'score', 'income_avg', 'score_avg'
                     ]
category_feature = ['type', 'home-activity-zone']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
    'KNN': KNeighborsRegressor(),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    'GPR': GaussianProcessRegressor(copy_X_train=False, random_state=101),
#     'SVR': SVR()
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'identity', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
'GPR':{
    'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
                                    WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
                                    0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
                                    50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
                                    1.0 * Matern(length_scale=1.0, nu=1.5),
                                    RBF() + ConstantKernel(constant_value=2)
                                                           ]),
    'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
}
}

In [3]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

In [4]:
nodes_data = Big_data[['link_id', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y']]
grouped = nodes_data.groupby(['start_node_x', 'start_node_y'])
filtered_df = grouped.filter(lambda x: len(x) == 1)
filtered_df = filtered_df.drop_duplicates()
node_mapping = filtered_df.set_index(['start_node_x', 'start_node_y']).apply(
    lambda row: (row['end_node_x'], row['end_node_y']), axis=1).to_dict()

all_nodes = set(node_mapping.keys()) | set(node_mapping.values())
end_nodes = set(node_mapping.values())

start_nodes = list(all_nodes - end_nodes)

paths = []
for start_node in start_nodes:
    path = [start_node]
    while path[-1] in node_mapping:
        next_node = node_mapping[path[-1]]
        path.append(next_node)
    paths.append(path)
    
new_paths = [x for x in paths if len(x) >2]
def map_path_to_links(df, path):
    path_links = pd.DataFrame()
    for i in range(len(path) - 1):
        start_node = path[i]
        end_node = path[i+1]
        link_row = df[(df['start_node_x'] == start_node[0]) & 
                      (df['start_node_y'] == start_node[1]) & 
                      (df['end_node_x'] == end_node[0]) & 
                      (df['end_node_y'] == end_node[1])]
        if not link_row.empty:
            path_links = pd.concat([path_links, link_row])
    return path_links

# Step 3: Create separate DataFrames for each path
path_dfs = []
for path in new_paths:
    link_df = map_path_to_links(Big_data, path)
    path_dfs.append(link_df)

Big_data_drop = Big_data.copy(deep=True)
for path_df in path_dfs:
    numeric_df = path_df.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = path_df['home-activity-zone'].mode()
    type_value = path_df['type'].mode()
    dataset = path_df['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    Big_data_drop = pd.concat([Big_data_drop, mean_df])

    try:
        Big_data_drop.drop(path_df.index, inplace=True)
    except:
        pass

In [5]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['x_y_coor'] = cluster.fit_predict(Big_data_drop[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['similar_link'] = cluster1.fit_predict(Big_data_drop[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['planxml'] = cluster2.fit_predict(Big_data_drop[['income', 'score', 'rush_hour',
                                                               'max_dur', 'cemdapStopDuration_s']])

Big_data_drop = Big_data_drop.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [37]:
train_data_drop = Big_data_drop[Big_data_drop['dataset']=='train']
validate_data_drop = Big_data_drop[Big_data_drop['dataset']=='validate']
test_data_drop = Big_data_drop[Big_data_drop['dataset']=='test']

In [38]:
column_name = 'planxml'
train_data_edit = pd.DataFrame()
for i in list(set(train_data_drop[column_name].tolist())):
    new_train = train_data_drop[train_data_drop[column_name]==i]
    numeric_df = new_train.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = new_train['home-activity-zone'].mode()
    type_value = new_train['type'].mode()
    dataset = new_train['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    train_data_edit = pd.concat([train_data_edit, mean_df], ignore_index=True)
    
validate_data_edit = pd.DataFrame()
for i in list(set(validate_data_drop[column_name].tolist())):
    new_validate = validate_data_drop[validate_data_drop[column_name]==i]
    numeric_df = new_validate.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = new_validate['home-activity-zone'].mode()
    type_value = new_validate['type'].mode()
    dataset = new_validate['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    validate_data_edit = pd.concat([validate_data_edit, mean_df], ignore_index=True)

Big_data_edit = pd.concat([train_data_edit, validate_data_edit, test_data_drop], ignore_index=True)
Big_data_edit = Big_data_edit.astype({column_name:'int64'})

In [39]:
Big_data_tr = ct.fit_transform(Big_data_edit)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)


In [45]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [46]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 47)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-02-19 23:01:44,014] A new study created in memory with name: no-name-66e13394-9dca-4501-b671-20af835b554e
[I 2024-02-19 23:01:44,070] Trial 0 finished with value: 1.0 and parameters: {'selector__k': 42, 'model__n_neighbors': 33, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:44,118] Trial 1 finished with value: 1.0 and parameters: {'selector__k': 24, 'model__n_neighbors': 38, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:44,161] Trial 2 finished with value: 1.0 and parameters: {'selector__k': 15, 'model__n_neighbors': 27, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:44,377] Trial 3 finished with value: 1.0 and parameters: {'selector__k': 38, 'model__n_neighbors': 5, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:4

[I 2024-02-19 23:01:49,192] Trial 38 finished with value: 1.0 and parameters: {'selector__k': 34, 'model__n_neighbors': 9, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:49,439] Trial 39 finished with value: 1.0 and parameters: {'selector__k': 39, 'model__n_neighbors': 6, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:49,536] Trial 40 finished with value: 1.0 and parameters: {'selector__k': 17, 'model__n_neighbors': 33, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:49,628] Trial 41 finished with value: 1.0 and parameters: {'selector__k': 27, 'model__n_neighbors': 3, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:49,726] Trial 42 finished with value: 1.0 and parameters: {'selector__k': 26, 'model__n_neighbors': 8, 'model__w

KNN 1.0 0.8462449033276338


[I 2024-02-19 23:01:51,128] Trial 3 finished with value: 1.0 and parameters: {'selector__k': 17, 'model__learning_rate': 0.22247857064303891, 'model__n_estimators': 56, 'model__max_depth': 46, 'model__num_leaves': 12, 'model__min_child_samples': 9, 'model__subsample': 0.9068781014202166, 'model__colsample_bytree': 0.9050297147918321}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:51,188] Trial 4 finished with value: 1.0 and parameters: {'selector__k': 31, 'model__learning_rate': 0.3259013646458304, 'model__n_estimators': 97, 'model__max_depth': 17, 'model__num_leaves': 38, 'model__min_child_samples': 6, 'model__subsample': 0.6667962867728544, 'model__colsample_bytree': 0.3193145753471868}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:51,246] Trial 5 finished with value: 1.0 and parameters: {'selector__k': 11, 'model__learning_rate': 0.16447415783142252, 'model__n_estimators': 90, 'model__max_depth': 34, 'model__num_leaves': 38, 'model__min_child_samples': 15, 'model__sub

[I 2024-02-19 23:01:54,112] Trial 26 finished with value: 1.0 and parameters: {'selector__k': 10, 'model__learning_rate': 0.9111902945634572, 'model__n_estimators': 290, 'model__max_depth': 21, 'model__num_leaves': 16, 'model__min_child_samples': 20, 'model__subsample': 0.8688108476100062, 'model__colsample_bytree': 0.40031914553684295}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:54,265] Trial 27 finished with value: 1.0 and parameters: {'selector__k': 34, 'model__learning_rate': 0.597481204925121, 'model__n_estimators': 131, 'model__max_depth': 28, 'model__num_leaves': 24, 'model__min_child_samples': 11, 'model__subsample': 0.5900430379878561, 'model__colsample_bytree': 0.54586589200813}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:54,422] Trial 28 finished with value: 1.0 and parameters: {'selector__k': 16, 'model__learning_rate': 0.7788109481962026, 'model__n_estimators': 229, 'model__max_depth': 5, 'model__num_leaves': 9, 'model__min_child_samples': 13, 'model__s

[I 2024-02-19 23:01:57,691] Trial 49 finished with value: 1.0 and parameters: {'selector__k': 15, 'model__learning_rate': 0.10800625570307083, 'model__n_estimators': 452, 'model__max_depth': 12, 'model__num_leaves': 28, 'model__min_child_samples': 19, 'model__subsample': 0.7016737877162615, 'model__colsample_bytree': 0.43631059362193375}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:01:57,734] A new study created in memory with name: no-name-d69cdd26-8b80-455f-8f28-54c9f6cc2f3e


LGBM 1.0 0.8462449033276338


[I 2024-02-19 23:01:58,462] Trial 0 finished with value: 1.0 and parameters: {'selector__k': 39, 'model__max_features': 'sqrt', 'model__n_estimators': 117, 'model__max_depth': 27, 'model__min_samples_leaf': 20}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:00,821] Trial 1 finished with value: 1.0 and parameters: {'selector__k': 15, 'model__max_features': 'log2', 'model__n_estimators': 405, 'model__max_depth': 82, 'model__min_samples_leaf': 14}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:01,805] Trial 2 finished with value: 1.0 and parameters: {'selector__k': 18, 'model__max_features': 'sqrt', 'model__n_estimators': 160, 'model__max_depth': 59, 'model__min_samples_leaf': 15}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:02,284] Trial 3 finished with value: 1.0 and parameters: {'selector__k': 37, 'model__max_features': 'sqrt', 'model__n_estimators': 72, 'model__max_depth': 105, 'model__min_samples_leaf': 11}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:

[I 2024-02-19 23:02:38,541] Trial 34 finished with value: 1.0 and parameters: {'selector__k': 21, 'model__max_features': 'log2', 'model__n_estimators': 85, 'model__max_depth': 29, 'model__min_samples_leaf': 12}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:38,989] Trial 35 finished with value: 1.0 and parameters: {'selector__k': 39, 'model__max_features': 'sqrt', 'model__n_estimators': 59, 'model__max_depth': 152, 'model__min_samples_leaf': 16}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:39,956] Trial 36 finished with value: 1.0 and parameters: {'selector__k': 43, 'model__max_features': 'log2', 'model__n_estimators': 142, 'model__max_depth': 84, 'model__min_samples_leaf': 5}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:02:40,702] Trial 37 finished with value: 1.0 and parameters: {'selector__k': 33, 'model__max_features': 'sqrt', 'model__n_estimators': 111, 'model__max_depth': 103, 'model__min_samples_leaf': 2}. Best is trial 0 with value: 1.0.
[I 2024-02-19 23:0

RF 1.0 0.8462449033276338


In [None]:
import pickle
with open('best_model_clf.pickle', 'wb') as f:
    pickle.dump(best_model_clf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
best_model_clf = pd.read_pickle("best_model_clf.pickle")

In [47]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [48]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [49]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 38)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-19 23:03:03,635] A new study created in memory with name: no-name-0716b4b6-1321-4768-96c3-e1e95a82232f
[I 2024-02-19 23:03:03,692] Trial 0 finished with value: -11.540283289603234 and parameters: {'selector__k': 26, 'model__n_neighbors': 1, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -11.540283289603234.
[I 2024-02-19 23:03:03,904] Trial 1 finished with value: -8.379073831345579 and parameters: {'selector__k': 17, 'model__n_neighbors': 15, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 1 with value: -8.379073831345579.
[I 2024-02-19 23:03:03,953] Trial 2 finished with value: -10.146931974763099 and parameters: {'selector__k': 30, 'model__n_neighbors': 43, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -8.379073831345579.
[I 2024-02-19 23:03:04,158] Trial 3 finished with value: -7.441718068263958 and parameters: {'selector__k': 10, 'model__n_neighbors': 4, 'mode

[I 2024-02-19 23:03:07,552] Trial 33 finished with value: -7.720458922620533 and parameters: {'selector__k': 9, 'model__n_neighbors': 17, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 22 with value: -7.297829107207343.
[I 2024-02-19 23:03:07,622] Trial 34 finished with value: -7.798013374831986 and parameters: {'selector__k': 4, 'model__n_neighbors': 5, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 22 with value: -7.297829107207343.
[I 2024-02-19 23:03:07,856] Trial 35 finished with value: -7.787981694189161 and parameters: {'selector__k': 11, 'model__n_neighbors': 14, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 22 with value: -7.297829107207343.
[I 2024-02-19 23:03:07,927] Trial 36 finished with value: -7.599251573517453 and parameters: {'selector__k': 19, 'model__n_neighbors': 3, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 22 with value: -7.297829107207343.
[I 2024-02-19 2

KNN -7.2562509954734455 17.120761588561134 573.3167900073335 227.91836200581207


[I 2024-02-19 23:03:09,650] Trial 1 finished with value: -10.637534930826096 and parameters: {'selector__k': 21, 'model__learning_rate': 0.8908268923397015, 'model__n_estimators': 212, 'model__max_depth': 13, 'model__max_leaves': 11, 'model__max_bin': 43, 'model__gamma': 16}. Best is trial 0 with value: -9.04475139878975.
[I 2024-02-19 23:03:09,836] Trial 2 finished with value: -7.590508737338739 and parameters: {'selector__k': 2, 'model__learning_rate': 0.15006023700376844, 'model__n_estimators': 157, 'model__max_depth': 3, 'model__max_leaves': 37, 'model__max_bin': 48, 'model__gamma': 13}. Best is trial 2 with value: -7.590508737338739.
[I 2024-02-19 23:03:10,002] Trial 3 finished with value: -8.381770564939991 and parameters: {'selector__k': 29, 'model__learning_rate': 0.08353629887476025, 'model__n_estimators': 50, 'model__max_depth': 5, 'model__max_leaves': 39, 'model__max_bin': 19, 'model__gamma': 13}. Best is trial 2 with value: -7.590508737338739.
[I 2024-02-19 23:03:10,302] Tr

[I 2024-02-19 23:03:16,501] Trial 27 finished with value: -9.930424080391022 and parameters: {'selector__k': 14, 'model__learning_rate': 0.3414895477058889, 'model__n_estimators': 285, 'model__max_depth': 5, 'model__max_leaves': 47, 'model__max_bin': 30, 'model__gamma': 9}. Best is trial 11 with value: -7.229560682539986.
[I 2024-02-19 23:03:16,867] Trial 28 finished with value: -8.803710188544269 and parameters: {'selector__k': 5, 'model__learning_rate': 0.2473360823126863, 'model__n_estimators': 240, 'model__max_depth': 11, 'model__max_leaves': 43, 'model__max_bin': 42, 'model__gamma': 6}. Best is trial 11 with value: -7.229560682539986.
[I 2024-02-19 23:03:17,084] Trial 29 finished with value: -8.963699096459615 and parameters: {'selector__k': 18, 'model__learning_rate': 0.47786900143985517, 'model__n_estimators': 64, 'model__max_depth': 6, 'model__max_leaves': 48, 'model__max_bin': 36, 'model__gamma': 11}. Best is trial 11 with value: -7.229560682539986.
[I 2024-02-19 23:03:17,339]

XGB -7.229560682539986 17.43485561002967 766.1441804343416 279.58859159265245


[I 2024-02-19 23:03:23,599] Trial 1 finished with value: -7.330847996293005 and parameters: {'selector__k': 6, 'model__learning_rate': 0.07600383481012479, 'model__n_estimators': 410, 'model__max_depth': 2, 'model__num_leaves': 43, 'model__min_child_samples': 4, 'model__subsample': 0.9016447430868054, 'model__colsample_bytree': 0.340402895373252}. Best is trial 1 with value: -7.330847996293005.
[I 2024-02-19 23:03:24,017] Trial 2 finished with value: -8.37397426172955 and parameters: {'selector__k': 22, 'model__learning_rate': 0.30140246504937074, 'model__n_estimators': 183, 'model__max_depth': 39, 'model__num_leaves': 20, 'model__min_child_samples': 5, 'model__subsample': 0.6392287681610004, 'model__colsample_bytree': 0.4719761782619516}. Best is trial 1 with value: -7.330847996293005.
[I 2024-02-19 23:03:24,264] Trial 3 finished with value: -7.481642395260459 and parameters: {'selector__k': 18, 'model__learning_rate': 0.015066494642948725, 'model__n_estimators': 77, 'model__max_depth

[I 2024-02-19 23:03:28,693] Trial 22 finished with value: -7.5540863635428614 and parameters: {'selector__k': 26, 'model__learning_rate': 0.6271300152344481, 'model__n_estimators': 64, 'model__max_depth': 43, 'model__num_leaves': 31, 'model__min_child_samples': 10, 'model__subsample': 0.7548675303312326, 'model__colsample_bytree': 0.5425839445885806}. Best is trial 6 with value: -6.368718595529192.
[I 2024-02-19 23:03:28,932] Trial 23 finished with value: -8.773889149488594 and parameters: {'selector__k': 19, 'model__learning_rate': 0.8047731882725201, 'model__n_estimators': 87, 'model__max_depth': 50, 'model__num_leaves': 38, 'model__min_child_samples': 11, 'model__subsample': 0.9924985715232607, 'model__colsample_bytree': 0.5173716431799011}. Best is trial 6 with value: -6.368718595529192.
[I 2024-02-19 23:03:29,112] Trial 24 finished with value: -7.445909534514642 and parameters: {'selector__k': 21, 'model__learning_rate': 0.8337774303373056, 'model__n_estimators': 60, 'model__max_d

[I 2024-02-19 23:03:32,876] Trial 43 finished with value: -7.899387275024749 and parameters: {'selector__k': 30, 'model__learning_rate': 0.013517928420791192, 'model__n_estimators': 56, 'model__max_depth': 20, 'model__num_leaves': 26, 'model__min_child_samples': 18, 'model__subsample': 0.8048602942751655, 'model__colsample_bytree': 0.576236659538371}. Best is trial 32 with value: -6.262458851244446.
[I 2024-02-19 23:03:33,059] Trial 44 finished with value: -6.490167780651247 and parameters: {'selector__k': 23, 'model__learning_rate': 0.07535361951011471, 'model__n_estimators': 64, 'model__max_depth': 15, 'model__num_leaves': 33, 'model__min_child_samples': 15, 'model__subsample': 0.6365515049649069, 'model__colsample_bytree': 0.6365211560056363}. Best is trial 32 with value: -6.262458851244446.
[I 2024-02-19 23:03:33,508] Trial 45 finished with value: -7.121944985068641 and parameters: {'selector__k': 23, 'model__learning_rate': 0.0821347695952018, 'model__n_estimators': 486, 'model__m

LGBM -6.262458851244446 16.855263189491726 646.9908922528558 231.53715524608862


[I 2024-02-19 23:03:35,695] Trial 0 finished with value: -7.2475980139748115 and parameters: {'selector__k': 23, 'model__max_features': 'sqrt', 'model__n_estimators': 214, 'model__max_depth': 15, 'model__min_samples_leaf': 7}. Best is trial 0 with value: -7.2475980139748115.
[I 2024-02-19 23:03:36,687] Trial 1 finished with value: -7.651778477611129 and parameters: {'selector__k': 12, 'model__max_features': 'log2', 'model__n_estimators': 172, 'model__max_depth': 36, 'model__min_samples_leaf': 13}. Best is trial 0 with value: -7.2475980139748115.
[I 2024-02-19 23:03:37,024] Trial 2 finished with value: -7.801247903039582 and parameters: {'selector__k': 20, 'model__max_features': 'sqrt', 'model__n_estimators': 55, 'model__max_depth': 149, 'model__min_samples_leaf': 19}. Best is trial 0 with value: -7.2475980139748115.
[I 2024-02-19 23:03:39,533] Trial 3 finished with value: -7.409570260881118 and parameters: {'selector__k': 34, 'model__max_features': 'log2', 'model__n_estimators': 407, '

[I 2024-02-19 23:04:00,373] Trial 30 finished with value: -7.376528331078969 and parameters: {'selector__k': 31, 'model__max_features': 'sqrt', 'model__n_estimators': 80, 'model__max_depth': 18, 'model__min_samples_leaf': 9}. Best is trial 9 with value: -7.0915678979339205.
[I 2024-02-19 23:04:01,084] Trial 31 finished with value: -7.110980034514053 and parameters: {'selector__k': 20, 'model__max_features': 'sqrt', 'model__n_estimators': 109, 'model__max_depth': 45, 'model__min_samples_leaf': 5}. Best is trial 9 with value: -7.0915678979339205.
[I 2024-02-19 23:04:01,921] Trial 32 finished with value: -7.265658725957479 and parameters: {'selector__k': 24, 'model__max_features': 'sqrt', 'model__n_estimators': 127, 'model__max_depth': 46, 'model__min_samples_leaf': 5}. Best is trial 9 with value: -7.0915678979339205.
[I 2024-02-19 23:04:02,981] Trial 33 finished with value: -7.08954111603585 and parameters: {'selector__k': 27, 'model__max_features': 'sqrt', 'model__n_estimators': 166, 'm

RF -6.943759483466927 17.531557598295926 602.6161035504958 229.9635458763102


[I 2024-02-19 23:04:26,813] Trial 0 finished with value: -10.407626119825714 and parameters: {'selector__k': 22, 'model__learning_rate': 0.629969890210556, 'model__n_estimators': 438, 'model__max_depth': 164, 'model__min_samples_split': 9, 'model__min_samples_leaf': 7, 'model__subsample': 0.4558611508329816}. Best is trial 0 with value: -10.407626119825714.
[I 2024-02-19 23:04:27,680] Trial 1 finished with value: -7.734634905799583 and parameters: {'selector__k': 31, 'model__learning_rate': 0.0423598496309897, 'model__n_estimators': 68, 'model__max_depth': 74, 'model__min_samples_split': 9, 'model__min_samples_leaf': 2, 'model__subsample': 0.7439553318854379}. Best is trial 1 with value: -7.734634905799583.
[I 2024-02-19 23:04:29,160] Trial 2 finished with value: -9.195427971161754 and parameters: {'selector__k': 12, 'model__learning_rate': 0.36738231097856383, 'model__n_estimators': 371, 'model__max_depth': 111, 'model__min_samples_split': 7, 'model__min_samples_leaf': 6, 'model__subs

[I 2024-02-19 23:04:56,587] Trial 23 finished with value: -6.378144781424373 and parameters: {'selector__k': 37, 'model__learning_rate': 0.12721365995770847, 'model__n_estimators': 69, 'model__max_depth': 98, 'model__min_samples_split': 3, 'model__min_samples_leaf': 9, 'model__subsample': 0.38492372232911853}. Best is trial 23 with value: -6.378144781424373.
[I 2024-02-19 23:04:57,182] Trial 24 finished with value: -6.692435928546449 and parameters: {'selector__k': 38, 'model__learning_rate': 0.1178532320210694, 'model__n_estimators': 68, 'model__max_depth': 92, 'model__min_samples_split': 3, 'model__min_samples_leaf': 9, 'model__subsample': 0.6118192709286671}. Best is trial 23 with value: -6.378144781424373.
[I 2024-02-19 23:04:57,515] Trial 25 finished with value: -6.826955239689392 and parameters: {'selector__k': 37, 'model__learning_rate': 0.25384840129465724, 'model__n_estimators': 66, 'model__max_depth': 94, 'model__min_samples_split': 3, 'model__min_samples_leaf': 10, 'model__s

[I 2024-02-19 23:05:06,765] Trial 46 finished with value: -7.537684616519177 and parameters: {'selector__k': 24, 'model__learning_rate': 0.1843467688242486, 'model__n_estimators': 95, 'model__max_depth': 91, 'model__min_samples_split': 6, 'model__min_samples_leaf': 1, 'model__subsample': 0.2950281290534031}. Best is trial 28 with value: -6.338971662780297.
[I 2024-02-19 23:05:07,040] Trial 47 finished with value: -7.4787960029848035 and parameters: {'selector__k': 31, 'model__learning_rate': 0.11844106350298322, 'model__n_estimators': 69, 'model__max_depth': 128, 'model__min_samples_split': 2, 'model__min_samples_leaf': 8, 'model__subsample': 0.16429117469222868}. Best is trial 28 with value: -6.338971662780297.
[I 2024-02-19 23:05:07,338] Trial 48 finished with value: -8.043650013069838 and parameters: {'selector__k': 36, 'model__learning_rate': 0.014796579798032963, 'model__n_estimators': 54, 'model__max_depth': 150, 'model__min_samples_split': 4, 'model__min_samples_leaf': 4, 'model

GB -6.338971662780297 16.742185469819606 601.394581164686 228.33629321650733


[I 2024-02-19 23:05:08,683] Trial 0 finished with value: -7.626443097217751 and parameters: {'selector__k': 21, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__alpha': 52.42420009755076}. Best is trial 0 with value: -7.626443097217751.
[I 2024-02-19 23:05:09,633] Trial 1 finished with value: -6.747718266764877 and parameters: {'selector__k': 23, 'model__hidden_layer_sizes': (100,), 'model__activation': 'tanh', 'model__solver': 'sgd', 'model__alpha': 0.00018132039915768922}. Best is trial 1 with value: -6.747718266764877.
[I 2024-02-19 23:05:11,268] Trial 2 finished with value: -11.19875228079038 and parameters: {'selector__k': 23, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 0.9243653943696304}. Best is trial 1 with value: -6.747718266764877.
[I 2024-02-19 23:05:13,101] Trial 3 finished with value: -10.21772261809412 and parameters: {'selector__k': 9, 'model__hi

[I 2024-02-19 23:05:29,991] Trial 29 finished with value: -8.892473505118184 and parameters: {'selector__k': 20, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__alpha': 0.0003421159306894956}. Best is trial 1 with value: -6.747718266764877.
[I 2024-02-19 23:05:30,196] Trial 30 finished with value: -10.227677080692773 and parameters: {'selector__k': 25, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'tanh', 'model__solver': 'sgd', 'model__alpha': 2.506786286112457}. Best is trial 1 with value: -6.747718266764877.
[I 2024-02-19 23:05:31,164] Trial 31 finished with value: -6.823554928899651 and parameters: {'selector__k': 15, 'model__hidden_layer_sizes': (100,), 'model__activation': 'tanh', 'model__solver': 'sgd', 'model__alpha': 3.376398236555969e-05}. Best is trial 1 with value: -6.747718266764877.
[I 2024-02-19 23:05:32,236] Trial 32 finished with value: -6.686575109333548 and parameters: {'selector__k': 20, 'model__h

ANN -6.387016437649466 17.265337640335026 669.4479876133787 220.7659557199633


[I 2024-02-19 23:05:46,977] Trial 2 finished with value: -23.77385266419767 and parameters: {'selector__k': 25, 'model__kernel': 0.5**2 * RationalQuadratic(alpha=1, length_scale=1), 'model__alpha': 1057854099.3931218}. Best is trial 1 with value: -23.773849902184676.
[I 2024-02-19 23:05:47,064] Trial 3 finished with value: -23.77385267742385 and parameters: {'selector__k': 33, 'model__kernel': 0.1**2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.01), 'model__alpha': 564771351.8560426}. Best is trial 1 with value: -23.773849902184676.
[I 2024-02-19 23:05:47,141] Trial 4 finished with value: -23.773852675881464 and parameters: {'selector__k': 24, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 9177424753.571928}. Best is trial 1 with value: -23.773849902184676.
[I 2024-02-19 23:05:47,217] Trial 5 finished with value: -23.77385182723355 and parameters: {'selector__k': 33, 'model__kernel': RBF(length_scale=1) + 1.41**2, 'model__alpha': 7494639312.67828}. Best 

[I 2024-02-19 23:05:50,397] Trial 34 finished with value: -15.547434099043702 and parameters: {'selector__k': 38, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 231028592.00975657}. Best is trial 32 with value: -15.118306463256015.
[I 2024-02-19 23:05:50,508] Trial 35 finished with value: -11.835945238196285 and parameters: {'selector__k': 38, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 76068595.47883004}. Best is trial 35 with value: -11.835945238196285.
[I 2024-02-19 23:05:50,617] Trial 36 finished with value: -14.102722313205206 and parameters: {'selector__k': 38, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 158208117.28919953}. Best is trial 35 with value: -11.835945238196285.
[I 2024-02-19 23:05:50,728] Trial 37 finished with value: -18.893415505127813 and parameters: {'selector__k': 38, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__

GPR -10.378278605415332 23.44617728399548 1020.2816805083735 261.5632994635494


In [40]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

In [41]:
best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 47)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)



[I 2024-02-19 22:47:52,595] A new study created in memory with name: no-name-bc21603c-36d8-431d-a004-5ab808fd08d3
[I 2024-02-19 22:47:52,650] Trial 0 finished with value: -10.196618287319273 and parameters: {'selector__k': 31, 'model__n_neighbors': 31, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: -10.196618287319273.
[I 2024-02-19 22:47:52,701] Trial 1 finished with value: -8.710193394272641 and parameters: {'selector__k': 16, 'model__n_neighbors': 38, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 1 with value: -8.710193394272641.
[I 2024-02-19 22:47:52,932] Trial 2 finished with value: -10.232515406011876 and parameters: {'selector__k': 31, 'model__n_neighbors': 37, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 1 with value: -8.710193394272641.
[I 2024-02-19 22:47:52,990] Trial 3 finished with value: -9.155103922134545 and parameters: {'selector__k': 20, 'model__n_neighbors': 44, 'm

[I 2024-02-19 22:47:55,850] Trial 33 finished with value: -7.569371375358781 and parameters: {'selector__k': 13, 'model__n_neighbors': 4, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 26 with value: -7.510712161623867.
[I 2024-02-19 22:47:55,926] Trial 34 finished with value: -8.181165273702359 and parameters: {'selector__k': 20, 'model__n_neighbors': 14, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 26 with value: -7.510712161623867.
[I 2024-02-19 22:47:55,997] Trial 35 finished with value: -10.779264823210568 and parameters: {'selector__k': 28, 'model__n_neighbors': 3, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 26 with value: -7.510712161623867.
[I 2024-02-19 22:47:56,073] Trial 36 finished with value: -8.173567346612003 and parameters: {'selector__k': 23, 'model__n_neighbors': 12, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 26 with value: -7.510712161623867.
[I 20

KNN -7.458754842951573 16.33727220455779 561.6086024811608 232.63639885247028


[I 2024-02-19 22:47:58,050] Trial 1 finished with value: -10.104763475477526 and parameters: {'selector__k': 39, 'model__learning_rate': 0.9151583572549509, 'model__n_estimators': 166, 'model__max_depth': 17, 'model__max_leaves': 12, 'model__max_bin': 50, 'model__gamma': 5}. Best is trial 0 with value: -8.1035950162964.
[I 2024-02-19 22:47:58,255] Trial 2 finished with value: -9.737943849710172 and parameters: {'selector__k': 15, 'model__learning_rate': 0.6693437514482586, 'model__n_estimators': 145, 'model__max_depth': 10, 'model__max_leaves': 27, 'model__max_bin': 19, 'model__gamma': 4}. Best is trial 0 with value: -8.1035950162964.
[I 2024-02-19 22:47:58,449] Trial 3 finished with value: -10.049789781489592 and parameters: {'selector__k': 38, 'model__learning_rate': 0.8994984595268515, 'model__n_estimators': 141, 'model__max_depth': 16, 'model__max_leaves': 48, 'model__max_bin': 27, 'model__gamma': 18}. Best is trial 0 with value: -8.1035950162964.
[I 2024-02-19 22:47:58,775] Trial 

[I 2024-02-19 22:48:06,449] Trial 27 finished with value: -8.73389229578872 and parameters: {'selector__k': 10, 'model__learning_rate': 0.23480947989666237, 'model__n_estimators': 348, 'model__max_depth': 12, 'model__max_leaves': 22, 'model__max_bin': 47, 'model__gamma': 20}. Best is trial 10 with value: -7.177507264281254.
[I 2024-02-19 22:48:06,825] Trial 28 finished with value: -8.501314571733428 and parameters: {'selector__k': 20, 'model__learning_rate': 0.011362809804022489, 'model__n_estimators': 55, 'model__max_depth': 14, 'model__max_leaves': 30, 'model__max_bin': 23, 'model__gamma': 18}. Best is trial 10 with value: -7.177507264281254.
[I 2024-02-19 22:48:07,065] Trial 29 finished with value: -9.25975794617219 and parameters: {'selector__k': 13, 'model__learning_rate': 0.470653492452988, 'model__n_estimators': 115, 'model__max_depth': 11, 'model__max_leaves': 24, 'model__max_bin': 39, 'model__gamma': 19}. Best is trial 10 with value: -7.177507264281254.
[I 2024-02-19 22:48:07,

XGB -7.177507264281254 17.972539647479977 698.7876389934627 232.96246065412248


[I 2024-02-19 22:48:15,369] Trial 0 finished with value: -7.608731344191054 and parameters: {'selector__k': 39, 'model__learning_rate': 0.056726970375503026, 'model__n_estimators': 316, 'model__max_depth': 13, 'model__num_leaves': 36, 'model__min_child_samples': 3, 'model__subsample': 0.6865966751325083, 'model__colsample_bytree': 0.3429731861709645}. Best is trial 0 with value: -7.608731344191054.
[I 2024-02-19 22:48:15,549] Trial 1 finished with value: -8.54036006580447 and parameters: {'selector__k': 39, 'model__learning_rate': 0.49721267204835173, 'model__n_estimators': 143, 'model__max_depth': 19, 'model__num_leaves': 9, 'model__min_child_samples': 6, 'model__subsample': 0.9494880276607008, 'model__colsample_bytree': 0.3462542884574966}. Best is trial 0 with value: -7.608731344191054.
[I 2024-02-19 22:48:15,641] Trial 2 finished with value: -6.935681133917628 and parameters: {'selector__k': 47, 'model__learning_rate': 0.12796012944022955, 'model__n_estimators': 65, 'model__max_dep

[I 2024-02-19 22:48:21,300] Trial 21 finished with value: -6.71694127622912 and parameters: {'selector__k': 29, 'model__learning_rate': 0.16094979485603822, 'model__n_estimators': 54, 'model__max_depth': 34, 'model__num_leaves': 49, 'model__min_child_samples': 9, 'model__subsample': 0.423476174269095, 'model__colsample_bytree': 0.5495068827043251}. Best is trial 11 with value: -6.497126730910424.
[I 2024-02-19 22:48:21,481] Trial 22 finished with value: -7.130100310690748 and parameters: {'selector__k': 27, 'model__learning_rate': 0.03311917767246709, 'model__n_estimators': 50, 'model__max_depth': 30, 'model__num_leaves': 47, 'model__min_child_samples': 10, 'model__subsample': 0.46092675717299103, 'model__colsample_bytree': 0.5473350519187918}. Best is trial 11 with value: -6.497126730910424.
[I 2024-02-19 22:48:21,659] Trial 23 finished with value: -6.9098668621739145 and parameters: {'selector__k': 30, 'model__learning_rate': 0.2624490539835209, 'model__n_estimators': 67, 'model__max

[I 2024-02-19 22:48:26,689] Trial 42 finished with value: -6.41907169915202 and parameters: {'selector__k': 33, 'model__learning_rate': 0.13789489056458265, 'model__n_estimators': 76, 'model__max_depth': 31, 'model__num_leaves': 50, 'model__min_child_samples': 12, 'model__subsample': 0.10112523474754039, 'model__colsample_bytree': 0.5016955100528112}. Best is trial 42 with value: -6.41907169915202.
[I 2024-02-19 22:48:26,904] Trial 43 finished with value: -6.80212432735163 and parameters: {'selector__k': 28, 'model__learning_rate': 0.06609005904563309, 'model__n_estimators': 74, 'model__max_depth': 32, 'model__num_leaves': 44, 'model__min_child_samples': 7, 'model__subsample': 0.14860603691647714, 'model__colsample_bytree': 0.44623569862015106}. Best is trial 42 with value: -6.41907169915202.
[I 2024-02-19 22:48:27,091] Trial 44 finished with value: -7.186157292032327 and parameters: {'selector__k': 35, 'model__learning_rate': 0.22573897147510008, 'model__n_estimators': 85, 'model__max

LGBM -6.41907169915202 16.24535304991682 583.5182948182387 226.36771568066956


[I 2024-02-19 22:48:30,434] Trial 0 finished with value: -7.578520373179606 and parameters: {'selector__k': 17, 'model__max_features': 'sqrt', 'model__n_estimators': 401, 'model__max_depth': 123, 'model__min_samples_leaf': 14}. Best is trial 0 with value: -7.578520373179606.
[I 2024-02-19 22:48:31,461] Trial 1 finished with value: -7.9241456846087335 and parameters: {'selector__k': 25, 'model__max_features': 'log2', 'model__n_estimators': 189, 'model__max_depth': 113, 'model__min_samples_leaf': 19}. Best is trial 0 with value: -7.578520373179606.
[I 2024-02-19 22:48:31,938] Trial 2 finished with value: -7.36070803792798 and parameters: {'selector__k': 31, 'model__max_features': 'sqrt', 'model__n_estimators': 78, 'model__max_depth': 163, 'model__min_samples_leaf': 9}. Best is trial 2 with value: -7.36070803792798.
[I 2024-02-19 22:48:34,545] Trial 3 finished with value: -7.28622144464379 and parameters: {'selector__k': 17, 'model__max_features': 'log2', 'model__n_estimators': 438, 'mode

[I 2024-02-19 22:49:06,855] Trial 30 finished with value: -7.371299502596935 and parameters: {'selector__k': 10, 'model__max_features': 'log2', 'model__n_estimators': 224, 'model__max_depth': 125, 'model__min_samples_leaf': 12}. Best is trial 21 with value: -6.991897138020415.
[I 2024-02-19 22:49:08,125] Trial 31 finished with value: -7.105117719376204 and parameters: {'selector__k': 22, 'model__max_features': 'log2', 'model__n_estimators': 200, 'model__max_depth': 170, 'model__min_samples_leaf': 4}. Best is trial 21 with value: -6.991897138020415.
[I 2024-02-19 22:49:09,082] Trial 32 finished with value: -7.102540482687684 and parameters: {'selector__k': 23, 'model__max_features': 'log2', 'model__n_estimators': 141, 'model__max_depth': 109, 'model__min_samples_leaf': 2}. Best is trial 21 with value: -6.991897138020415.
[I 2024-02-19 22:49:10,195] Trial 33 finished with value: -7.139362449314539 and parameters: {'selector__k': 20, 'model__max_features': 'log2', 'model__n_estimators': 1

RF -6.922405894957586 17.40009976862873 602.9461605824571 230.44098305316106


[I 2024-02-19 22:49:27,516] Trial 0 finished with value: -9.162962400409825 and parameters: {'selector__k': 38, 'model__learning_rate': 0.5667144202653551, 'model__n_estimators': 116, 'model__max_depth': 27, 'model__min_samples_split': 5, 'model__min_samples_leaf': 10, 'model__subsample': 0.4046609014519581}. Best is trial 0 with value: -9.162962400409825.
[I 2024-02-19 22:49:28,064] Trial 1 finished with value: -9.16423541976571 and parameters: {'selector__k': 21, 'model__learning_rate': 0.9182455276006531, 'model__n_estimators': 99, 'model__max_depth': 132, 'model__min_samples_split': 11, 'model__min_samples_leaf': 10, 'model__subsample': 0.7214313257671883}. Best is trial 0 with value: -9.162962400409825.
[I 2024-02-19 22:49:28,729] Trial 2 finished with value: -6.863025886475776 and parameters: {'selector__k': 22, 'model__learning_rate': 0.2078485065872026, 'model__n_estimators': 74, 'model__max_depth': 180, 'model__min_samples_split': 10, 'model__min_samples_leaf': 10, 'model__sub

[I 2024-02-19 22:49:54,337] Trial 23 finished with value: -7.038077735706835 and parameters: {'selector__k': 35, 'model__learning_rate': 0.0138554984333721, 'model__n_estimators': 196, 'model__max_depth': 37, 'model__min_samples_split': 5, 'model__min_samples_leaf': 6, 'model__subsample': 0.6119156722000103}. Best is trial 16 with value: -6.422177082373198.
[I 2024-02-19 22:49:56,038] Trial 24 finished with value: -7.279350249259654 and parameters: {'selector__k': 42, 'model__learning_rate': 0.15554706207923108, 'model__n_estimators': 300, 'model__max_depth': 79, 'model__min_samples_split': 6, 'model__min_samples_leaf': 9, 'model__subsample': 0.48753370404659324}. Best is trial 16 with value: -6.422177082373198.
[I 2024-02-19 22:49:56,664] Trial 25 finished with value: -7.708689201719761 and parameters: {'selector__k': 28, 'model__learning_rate': 0.33237247018735383, 'model__n_estimators': 149, 'model__max_depth': 19, 'model__min_samples_split': 7, 'model__min_samples_leaf': 7, 'model_

[I 2024-02-19 22:50:14,740] Trial 46 finished with value: -6.698913755036891 and parameters: {'selector__k': 39, 'model__learning_rate': 0.053971988797597854, 'model__n_estimators': 136, 'model__max_depth': 156, 'model__min_samples_split': 7, 'model__min_samples_leaf': 7, 'model__subsample': 0.21402843922951945}. Best is trial 41 with value: -6.324137471464376.
[I 2024-02-19 22:50:15,275] Trial 47 finished with value: -6.875079280624226 and parameters: {'selector__k': 35, 'model__learning_rate': 0.1206852241123991, 'model__n_estimators': 115, 'model__max_depth': 144, 'model__min_samples_split': 6, 'model__min_samples_leaf': 5, 'model__subsample': 0.27516907184213346}. Best is trial 41 with value: -6.324137471464376.
[I 2024-02-19 22:50:15,886] Trial 48 finished with value: -7.451117034335113 and parameters: {'selector__k': 45, 'model__learning_rate': 0.010839926551902454, 'model__n_estimators': 187, 'model__max_depth': 135, 'model__min_samples_split': 8, 'model__min_samples_leaf': 6, '

GB -6.324137471464376 16.7194926874848 595.2436234055286 230.19962936444915


[I 2024-02-19 22:50:17,894] Trial 1 finished with value: -11.141693638257008 and parameters: {'selector__k': 30, 'model__hidden_layer_sizes': (100,), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 0.0024497515587669893}. Best is trial 0 with value: -10.509895736303227.
[I 2024-02-19 22:50:18,152] Trial 2 finished with value: -10.779734265270882 and parameters: {'selector__k': 31, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'tanh', 'model__solver': 'sgd', 'model__alpha': 3.75712399267205}. Best is trial 0 with value: -10.509895736303227.
[I 2024-02-19 22:50:18,899] Trial 3 finished with value: -10.214408308951967 and parameters: {'selector__k': 24, 'model__hidden_layer_sizes': (100,), 'model__activation': 'logistic', 'model__solver': 'sgd', 'model__alpha': 76021.8970918174}. Best is trial 3 with value: -10.214408308951967.
[W 2024-02-19 22:50:19,962] Trial 4 failed with parameters: {'selector__k': 32, 'model__hidden_layer_sizes': (30, 30, 3

[I 2024-02-19 22:50:39,817] Trial 29 finished with value: -7.971778770796677 and parameters: {'selector__k': 20, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'identity', 'model__solver': 'adam', 'model__alpha': 522.4660463704223}. Best is trial 24 with value: -7.290786452564099.
[W 2024-02-19 22:50:40,415] Trial 30 failed with parameters: {'selector__k': 24, 'model__hidden_layer_sizes': (50,), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 5406.397304506127} because of the following error: The value nan is not acceptable.
[W 2024-02-19 22:50:40,417] Trial 30 failed with value nan.
[W 2024-02-19 22:50:41,031] Trial 31 failed with parameters: {'selector__k': 25, 'model__hidden_layer_sizes': (50,), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 13245.914818425383} because of the following error: The value nan is not acceptable.
[W 2024-02-19 22:50:41,033] Trial 31 failed with value nan.
[W 2024-02-19 22:50:41,677] Tria

ANN -7.290786452564099 15.382559311313866 532.1391035243552 171.12256608465609


[I 2024-02-19 22:50:52,412] Trial 2 finished with value: -23.771434127648014 and parameters: {'selector__k': 29, 'model__kernel': 50**2 * RBF(length_scale=50), 'model__alpha': 1454517500.9061496}. Best is trial 1 with value: -23.15039596417937.
[I 2024-02-19 22:50:52,503] Trial 3 finished with value: -23.773851900457185 and parameters: {'selector__k': 39, 'model__kernel': RBF(length_scale=1) + 1.41**2, 'model__alpha': 8200946820.833871}. Best is trial 1 with value: -23.15039596417937.
[I 2024-02-19 22:50:52,576] Trial 4 finished with value: -23.68342763279905 and parameters: {'selector__k': 28, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 4116611452.557807}. Best is trial 1 with value: -23.15039596417937.
[I 2024-02-19 22:50:52,658] Trial 5 finished with value: -23.77385266828124 and parameters: {'selector__k': 24, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 1548256462.9871166}. Best is trial 1 with value: -23.15039596

[I 2024-02-19 22:50:55,624] Trial 34 finished with value: -18.65701428768599 and parameters: {'selector__k': 45, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 506159276.3065819}. Best is trial 34 with value: -18.65701428768599.
[I 2024-02-19 22:50:55,738] Trial 35 finished with value: -18.258567920957717 and parameters: {'selector__k': 45, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 455631749.26012605}. Best is trial 35 with value: -18.258567920957717.
[I 2024-02-19 22:50:55,863] Trial 36 finished with value: -23.77363503985292 and parameters: {'selector__k': 45, 'model__kernel': 50**2 * RBF(length_scale=50), 'model__alpha': 361516897.79670876}. Best is trial 35 with value: -18.258567920957717.
[I 2024-02-19 22:50:55,988] Trial 37 finished with value: -23.773840715410806 and parameters: {'selector__k': 46, 'model__kernel': RBF(length_scale=1) + 1.41**2, 'model__alpha': 532674485.60249054}. Best is trial 35 

GPR -10.306239890914275 21.938868093155524 963.8576028104934 262.5516713178617


In [42]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-02-19 22:50:57,835] A new study created in memory with name: no-name-bf62c6b5-09d1-4336-a1ba-5b89d6787daa
[I 2024-02-19 22:50:58,053] Trial 0 finished with value: -10.108962113602463 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 0 with value: -10.108962113602463.
[I 2024-02-19 22:50:58,250] Trial 1 finished with value: -10.244329379613506 and parameters: {'n_neighbors': 30, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 0 with value: -10.108962113602463.
[I 2024-02-19 22:50:58,287] Trial 2 finished with value: -10.170476477841213 and parameters: {'n_neighbors': 36, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -10.108962113602463.
[I 2024-02-19 22:50:58,321] Trial 3 finished with value: -10.310765712007647 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -10.108962113602463.
[I 2024-02-19 22:50:58,360] Trial 4 finished wit

[I 2024-02-19 22:51:04,561] Trial 39 finished with value: -10.156878589751425 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 5 with value: -10.094270723970405.
[I 2024-02-19 22:51:04,617] Trial 40 finished with value: -10.295218615000545 and parameters: {'n_neighbors': 28, 'weights': 'uniform', 'algorithm': 'kd_tree'}. Best is trial 5 with value: -10.094270723970405.
[I 2024-02-19 22:51:04,672] Trial 41 finished with value: -10.094270723970405 and parameters: {'n_neighbors': 50, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 5 with value: -10.094270723970405.
[I 2024-02-19 22:51:04,721] Trial 42 finished with value: -10.142026518090168 and parameters: {'n_neighbors': 47, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 5 with value: -10.094270723970405.
[I 2024-02-19 22:51:04,772] Trial 43 finished with value: -10.094270723970405 and parameters: {'n_neighbors': 50, 'weights': 'uniform', 'algorithm': 'ball_tre

KNN -10.094270723970405 22.34723726292137 894.7848578725286 258.87853118348875


[I 2024-02-19 22:51:06,427] Trial 0 finished with value: -9.200809840665995 and parameters: {'learning_rate': 0.5202382389132794, 'n_estimators': 464, 'max_depth': 19, 'max_leaves': 16, 'max_bin': 25, 'gamma': 12}. Best is trial 0 with value: -9.200809840665995.
[I 2024-02-19 22:51:06,913] Trial 1 finished with value: -9.242107232539235 and parameters: {'learning_rate': 0.5625760553196336, 'n_estimators': 299, 'max_depth': 3, 'max_leaves': 37, 'max_bin': 49, 'gamma': 10}. Best is trial 0 with value: -9.200809840665995.
[I 2024-02-19 22:51:07,195] Trial 2 finished with value: -8.648544838003344 and parameters: {'learning_rate': 0.4270862226836473, 'n_estimators': 131, 'max_depth': 20, 'max_leaves': 39, 'max_bin': 26, 'gamma': 8}. Best is trial 2 with value: -8.648544838003344.
[I 2024-02-19 22:51:07,442] Trial 3 finished with value: -9.277244866665384 and parameters: {'learning_rate': 0.4067892060927689, 'n_estimators': 113, 'max_depth': 12, 'max_leaves': 37, 'max_bin': 9, 'gamma': 5}. 

[I 2024-02-19 22:51:17,761] Trial 32 finished with value: -8.015252953811546 and parameters: {'learning_rate': 0.020161855657309588, 'n_estimators': 71, 'max_depth': 12, 'max_leaves': 12, 'max_bin': 44, 'gamma': 4}. Best is trial 17 with value: -7.249281096437913.
[I 2024-02-19 22:51:18,147] Trial 33 finished with value: -7.253428807310839 and parameters: {'learning_rate': 0.14284474599754288, 'n_estimators': 148, 'max_depth': 10, 'max_leaves': 6, 'max_bin': 49, 'gamma': 11}. Best is trial 17 with value: -7.249281096437913.
[I 2024-02-19 22:51:18,547] Trial 34 finished with value: -7.70158367222811 and parameters: {'learning_rate': 0.17316522616841573, 'n_estimators': 152, 'max_depth': 10, 'max_leaves': 6, 'max_bin': 50, 'gamma': 9}. Best is trial 17 with value: -7.249281096437913.
[I 2024-02-19 22:51:19,000] Trial 35 finished with value: -8.248166951790024 and parameters: {'learning_rate': 0.4239597484526644, 'n_estimators': 221, 'max_depth': 1, 'max_leaves': 13, 'max_bin': 39, 'gamma

XGB -7.209010407182477 17.116279300200077 609.6592925273153 217.8223544529506


[I 2024-02-19 22:51:27,162] Trial 1 finished with value: -8.211609569968306 and parameters: {'learning_rate': 0.27439447153525104, 'n_estimators': 87, 'max_depth': 40, 'num_leaves': 22, 'min_child_samples': 2, 'subsample': 0.3366596617124733, 'colsample_bytree': 0.7967193971738765}. Best is trial 0 with value: -7.728390896686054.
[I 2024-02-19 22:51:27,257] Trial 2 finished with value: -7.104388282605648 and parameters: {'learning_rate': 0.38814762371919226, 'n_estimators': 177, 'max_depth': 2, 'num_leaves': 40, 'min_child_samples': 14, 'subsample': 0.9381577471056294, 'colsample_bytree': 0.4965806120172528}. Best is trial 2 with value: -7.104388282605648.
[I 2024-02-19 22:51:27,405] Trial 3 finished with value: -7.556732027522607 and parameters: {'learning_rate': 0.20274390068628542, 'n_estimators': 87, 'max_depth': 43, 'num_leaves': 25, 'min_child_samples': 8, 'subsample': 0.7266128074015007, 'colsample_bytree': 0.8768234813353015}. Best is trial 2 with value: -7.104388282605648.
[I 

[I 2024-02-19 22:51:32,785] Trial 26 finished with value: -7.684322678476154 and parameters: {'learning_rate': 0.27237927745473345, 'n_estimators': 378, 'max_depth': 34, 'num_leaves': 15, 'min_child_samples': 4, 'subsample': 0.20751275880485934, 'colsample_bytree': 0.3021919200366375}. Best is trial 18 with value: -6.572556352341418.
[I 2024-02-19 22:51:32,966] Trial 27 finished with value: -7.149438670130396 and parameters: {'learning_rate': 0.19712720801485542, 'n_estimators': 265, 'max_depth': 18, 'num_leaves': 2, 'min_child_samples': 10, 'subsample': 0.5083378288667066, 'colsample_bytree': 0.31946277731751077}. Best is trial 18 with value: -6.572556352341418.
[I 2024-02-19 22:51:33,194] Trial 28 finished with value: -7.148502500155639 and parameters: {'learning_rate': 0.013836102184462296, 'n_estimators': 138, 'max_depth': 27, 'num_leaves': 44, 'min_child_samples': 18, 'subsample': 0.29040258290692456, 'colsample_bytree': 0.19015733559046216}. Best is trial 18 with value: -6.572556

LGBM -6.565875681494578 17.440108477800543 619.2704589997802 228.4514239935806


[I 2024-02-19 22:51:39,993] Trial 0 finished with value: -7.261397270568392 and parameters: {'max_features': 'sqrt', 'n_estimators': 90, 'max_depth': 177, 'min_samples_leaf': 4}. Best is trial 0 with value: -7.261397270568392.
[I 2024-02-19 22:51:41,111] Trial 1 finished with value: -8.0777922332935 and parameters: {'max_features': 'sqrt', 'n_estimators': 197, 'max_depth': 138, 'min_samples_leaf': 16}. Best is trial 0 with value: -7.261397270568392.
[I 2024-02-19 22:51:43,439] Trial 2 finished with value: -7.26951484963068 and parameters: {'max_features': 'sqrt', 'n_estimators': 341, 'max_depth': 142, 'min_samples_leaf': 2}. Best is trial 0 with value: -7.261397270568392.
[I 2024-02-19 22:51:43,756] Trial 3 finished with value: -7.97930289580009 and parameters: {'max_features': 'log2', 'n_estimators': 51, 'max_depth': 107, 'min_samples_leaf': 10}. Best is trial 0 with value: -7.261397270568392.
[I 2024-02-19 22:51:44,267] Trial 4 finished with value: -7.430377281306096 and parameters: 

[I 2024-02-19 22:52:55,543] Trial 36 finished with value: -7.331573048830884 and parameters: {'max_features': 'log2', 'n_estimators': 441, 'max_depth': 187, 'min_samples_leaf': 4}. Best is trial 12 with value: -7.209693508201157.
[I 2024-02-19 22:52:57,210] Trial 37 finished with value: -7.302133046045571 and parameters: {'max_features': 'sqrt', 'n_estimators': 217, 'max_depth': 154, 'min_samples_leaf': 1}. Best is trial 12 with value: -7.209693508201157.
[I 2024-02-19 22:52:58,001] Trial 38 finished with value: -8.157270109048522 and parameters: {'max_features': 'sqrt', 'n_estimators': 134, 'max_depth': 137, 'min_samples_leaf': 17}. Best is trial 12 with value: -7.209693508201157.
[I 2024-02-19 22:52:59,802] Trial 39 finished with value: -7.367395189189454 and parameters: {'max_features': 'log2', 'n_estimators': 292, 'max_depth': 37, 'min_samples_leaf': 5}. Best is trial 12 with value: -7.209693508201157.
[I 2024-02-19 22:53:01,116] Trial 40 finished with value: -7.241647419665969 and

RF -7.190838558922893 20.067667300777654 669.9950979044746 229.52086449887943


[I 2024-02-19 22:53:28,116] Trial 0 finished with value: -4442960.883036104 and parameters: {'learning_rate': 0.9303270790703336, 'n_estimators': 220, 'max_depth': 180, 'min_samples_split': 9, 'min_samples_leaf': 5, 'subsample': 0.23502752872274968}. Best is trial 0 with value: -4442960.883036104.
[I 2024-02-19 22:53:29,623] Trial 1 finished with value: -9.125750289325225 and parameters: {'learning_rate': 0.5199217471397569, 'n_estimators': 328, 'max_depth': 114, 'min_samples_split': 11, 'min_samples_leaf': 7, 'subsample': 0.39241082686708384}. Best is trial 1 with value: -9.125750289325225.
[I 2024-02-19 22:53:29,934] Trial 2 finished with value: -43.04729918255095 and parameters: {'learning_rate': 0.7986204588181621, 'n_estimators': 77, 'max_depth': 132, 'min_samples_split': 4, 'min_samples_leaf': 6, 'subsample': 0.2857445704809464}. Best is trial 1 with value: -9.125750289325225.
[I 2024-02-19 22:53:30,322] Trial 3 finished with value: -13.718637620351082 and parameters: {'learning_

[I 2024-02-19 22:54:35,364] Trial 28 finished with value: -7.720206908476179 and parameters: {'learning_rate': 0.21587526023264159, 'n_estimators': 418, 'max_depth': 149, 'min_samples_split': 7, 'min_samples_leaf': 8, 'subsample': 0.3966466370085912}. Best is trial 26 with value: -6.515174631838967.
[I 2024-02-19 22:54:37,210] Trial 29 finished with value: -7.610541442900592 and parameters: {'learning_rate': 0.16335893156681403, 'n_estimators': 235, 'max_depth': 170, 'min_samples_split': 3, 'min_samples_leaf': 4, 'subsample': 0.4341980504978989}. Best is trial 26 with value: -6.515174631838967.
[I 2024-02-19 22:54:38,609] Trial 30 finished with value: -11.476790976449536 and parameters: {'learning_rate': 0.9993536741075205, 'n_estimators': 179, 'max_depth': 134, 'min_samples_split': 8, 'min_samples_leaf': 9, 'subsample': 0.7073334777327618}. Best is trial 26 with value: -6.515174631838967.
[I 2024-02-19 22:54:41,561] Trial 31 finished with value: -6.548314290001316 and parameters: {'le

GB -6.515174631838967 17.02958806275883 596.6050874747147 225.5793874592711


[I 2024-02-19 22:55:26,575] Trial 0 finished with value: -11.690630333343252 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'relu', 'solver': 'adam', 'alpha': 4732.8320231087355}. Best is trial 0 with value: -11.690630333343252.
[I 2024-02-19 22:55:28,648] Trial 1 finished with value: -10.042135624276222 and parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 563.5406030261598}. Best is trial 1 with value: -10.042135624276222.
[W 2024-02-19 22:55:29,480] Trial 2 failed with parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'sgd', 'alpha': 5.981523563615243} because of the following error: The value nan is not acceptable.
[W 2024-02-19 22:55:29,482] Trial 2 failed with value nan.
[I 2024-02-19 22:55:30,713] Trial 3 finished with value: -23.1786046185128 and parameters: {'hidden_layer_sizes': (30, 30, 30), 'activation': 'logistic', 'solver': 'adam', 'alpha': 72239.77905378469}. Best is trial 1 with val

[W 2024-02-19 22:55:58,449] Trial 30 failed with value nan.
[W 2024-02-19 22:55:59,559] Trial 31 failed with parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'identity', 'solver': 'sgd', 'alpha': 28.149574444495737} because of the following error: The value nan is not acceptable.
[W 2024-02-19 22:55:59,560] Trial 31 failed with value nan.
[W 2024-02-19 22:56:00,722] Trial 32 failed with parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'identity', 'solver': 'sgd', 'alpha': 21.570089510164905} because of the following error: The value nan is not acceptable.
[W 2024-02-19 22:56:00,723] Trial 32 failed with value nan.
[I 2024-02-19 22:56:00,972] Trial 33 finished with value: -10.03313392028014 and parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 49.223316689850115}. Best is trial 21 with value: -10.030918436616542.
[W 2024-02-19 22:56:01,960] Trial 34 failed with parameters: {'hidden_layer_sizes': (30, 30, 30), 'activati

ANN -10.030918436616542 22.284169195304948 891.1968183064212 257.53354634081


[I 2024-02-19 22:56:15,862] Trial 2 finished with value: -23.77385267742385 and parameters: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'alpha': 7690345518.070352}. Best is trial 0 with value: -23.773812107344078.
[I 2024-02-19 22:56:15,937] Trial 3 finished with value: -23.77385267742385 and parameters: {'kernel': 0.1**2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.01), 'alpha': 6722731956.470007}. Best is trial 0 with value: -23.773812107344078.
[I 2024-02-19 22:56:16,016] Trial 4 finished with value: -23.773835046087523 and parameters: {'kernel': 50**2 * RBF(length_scale=50), 'alpha': 4464631183.481208}. Best is trial 0 with value: -23.773812107344078.
[I 2024-02-19 22:56:16,074] Trial 5 finished with value: -22.934820599822412 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 4033250952.196187}. Best is trial 5 with value: -22.934820599822412.
[I 2024-02-19 22:56:16,142] Trial 6 finished with value: -23.773852677419974 and paramete

[I 2024-02-19 22:56:19,077] Trial 38 finished with value: -19.950785794658003 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 742030928.4550616}. Best is trial 13 with value: -10.248587467812165.
[I 2024-02-19 22:56:19,187] Trial 39 finished with value: -23.773811051223298 and parameters: {'kernel': 50**2 * RBF(length_scale=50), 'alpha': 1890945470.3329697}. Best is trial 13 with value: -10.248587467812165.
[I 2024-02-19 22:56:19,271] Trial 40 finished with value: -21.5993446328406 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 1448064010.3906515}. Best is trial 13 with value: -10.248587467812165.
[I 2024-02-19 22:56:19,356] Trial 41 finished with value: -18.42924221638622 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 476591121.3837734}. Best is trial 13 with value: -10.248587467812165.
[I 2024-02-19 22:56:19,449] Trial 42 finished with value: -10.260853299176432 a

GPR -10.248587467812165 22.195950206537695 957.8563272393701 261.3548207891602


In [None]:
with open('best_model_onlyreg.pickle', 'wb') as f:
    pickle.dump(best_model_onlyreg, f, pickle.HIGHEST_PROTOCOL)

In [None]:

# X_t = X_t_old[selected_features] 
# X_t['cluster'] = train_data_tr['remainder__clusters']
# X_t['remainder__link_counts'] = train_data_tr['remainder__link_counts']
# X_v = validate_data_tr[selected_features]
# X_v['cluster'] = validate_data_tr['remainder__clusters']
# X_v['remainder__link_counts'] = validate_data_tr['remainder__link_counts']
# X_te = test_data_tr[selected_features]
# X_te['cluster'] = test_data_tr['remainder__clusters']
# X_te['remainder__link_counts'] = test_data_tr['remainder__link_counts']

In [None]:
selected_features

In [None]:
model = xgb.XGBRegressor(random_state=101)
opt = BayesSearchCV(
    model,
    param_space['XGB'],
    n_iter=50,  # Adjust the number of iterations based on your computational resources
    cv=ShuffleSplit(test_size=0.20, n_splits=3),  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

opt.fit(X_t, y_t)
best_params = opt.best_params_ 

model = xgb.XGBRegressor(**best_params, random_state=101)
model.fit(X_t, y_t)
te_predictions = model.predict(X_te)
te_mae = mean_absolute_error(y_te, te_predictions)
print(te_mae)

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(model.apply(X_t))
rf_lr = LogisticRegressionCV(cv=ShuffleSplit(test_size=0.20, n_splits=3))
rf_lr.fit(X_t_leaves, y_t)
X_te_leaves = onehot.transform(model.apply(X_te))
y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
te_rflr_mae = mean_absolute_error(y_te, y_te_pred_rf_lr)
print(te_rflr_mae)

In [None]:
v_mae_list = []
te_mae_list = []
v_rf_mae_list = []
te_rf_mae_list = []
# X_te['cluster'] = 1
for cluster_label in list(set(X_t['cluster'])):
    X_v_cluster, X_te_cluster = None, None
    v_mae, te_mae, v_rflr_mae, te_rflr_mae = None, None, None, None
    # Subset the training data for the current cluster
    X_t_cluster = X_t[X_t['cluster'] == cluster_label]
    y_t_cluster = X_t_cluster['remainder__link_counts']
    X_t_cluster = X_t_cluster.drop(columns=['remainder__link_counts','cluster'])  
    
    if cluster_label in list(set(X_v['cluster'])):
        X_v_cluster = X_v[X_v['cluster'] == cluster_label]
        y_v_cluster = X_v_cluster['remainder__link_counts']
        X_v_cluster = X_v_cluster.drop(columns=['remainder__link_counts','cluster'])  
    if cluster_label in list(set(X_te['cluster'])):
        X_te_cluster = X_te[X_te['cluster'] == cluster_label]
        y_te_cluster = X_te_cluster['remainder__link_counts']
        X_te_cluster = X_te_cluster.drop(columns=['remainder__link_counts','cluster'])  
    

    # Train a regression model for the current cluster
    # model = RidgeCV(scoring='neg_mean_absolute_error')
    # model = RandomForestRegressor(criterion='friedman_mse', n_estimators=300,
    #                   random_state=101)

    if not ((X_v_cluster is None) and (X_te_cluster is None)):

        model = xgb.XGBRegressor(random_state=101)
        opt = BayesSearchCV(
            model,
            param_space['XGB'],
            n_iter=60,  # Adjust the number of iterations based on your computational resources
            cv=ShuffleSplit(test_size=0.20, n_splits=1),  # Adjust the number of cross-validation folds
            scoring='neg_mean_absolute_error',  # Use a suitable regression metric
            n_jobs=-1,
        )
        
        opt.fit(X_t_cluster, y_t_cluster)
        best_params = opt.best_params_ 
        onehot = OneHotEncoder()
        model = xgb.XGBRegressor(**best_params, random_state=101)
        model.fit(X_t_cluster, y_t_cluster)

        X_t_leaves = onehot.fit_transform(model.apply(X_t_cluster))
        rf_lr = LogisticRegression()
        rf_lr.fit(X_t_leaves, y_t_cluster)
        if (X_v_cluster is not None):
            v_predictions = model.predict(X_v_cluster)
            v_mae = mean_absolute_error(y_v_cluster, v_predictions)
            row_count = len(y_v_cluster)
            v_mae_list.append(v_mae*row_count)

            X_v_leaves = onehot.transform(model.apply(X_v_cluster))
            y_v_pred_rf_lr = rf_lr.predict(X_v_leaves)
            row_count = len(y_v_pred_rf_lr)
            v_rflr_mae = mean_absolute_error(y_v_cluster, y_v_pred_rf_lr)
            v_rf_mae_list.append(v_rflr_mae*row_count)
        if (X_te_cluster is not None):
            te_predictions = model.predict(X_te_cluster)
            te_mae = mean_absolute_error(y_te_cluster, te_predictions)
            row_count = len(y_te_cluster)
            te_mae_list.append(te_mae*row_count)
      
            X_te_leaves = onehot.transform(model.apply(X_te_cluster))
            y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
            row_count = len(y_te_pred_rf_lr)
            te_rflr_mae = mean_absolute_error(y_te_cluster, y_te_pred_rf_lr)
            te_rf_mae_list.append(te_rflr_mae*row_count)
    print(cluster_label, v_mae, te_mae, v_rflr_mae, te_rflr_mae)
v_mae_list = list(filter(lambda item: item is not None, v_mae_list))    
te_mae_list = list(filter(lambda item: item is not None, te_mae_list))    
v_mean_mae = sum(v_mae_list)/len(X_v)
te_mean_mae = sum(te_mae_list)/len(X_te)
print(v_mean_mae)
print(te_mean_mae)    

In [None]:
best_params = None
best_mae = 0.0

# Generate all combinations of hyperparameters
param_combinations = product(*param_grid.values())
# Loop through hyperparameter combinations
for params in param_combinations:
    # Create and train the model with the current hyperparameters
    model = lgb.LGBMRegressor(**dict(zip(param_grid.keys(), params)), random_state=101)
    model.fit(X_t[X_v.columns], y_t)

    # Evaluate the model on the validation dataset
    y_val_pred = model.predict(X_v)
    mae = mean_absolute_error(y_v, y_val_pred)

    # Check if current hyperparameters are the best
    if mae > best_mae:
        best_mae = mae
        best_params = dict(zip(param_grid.keys(), params))
print(best_mae)
print(best_params)
# Train the final model using the entire training dataset with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Define the LGBMRegressor model
lgbm_model = lgb.LGBMRegressor(random_state=101, force_col_wise=True)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    lgbm_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101, force_col_wise=True)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

print("With LGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))


In [None]:
print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.predict(X_t, pred_leaf=True))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With LGB + LR Pred")
X_t_leaves = onehot.transform(final_model.predict(X_t, pred_leaf=True))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.predict(X_te, pred_leaf=True))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 20),
    'max_leaves': np.arange(2, 50),
    'max_bin': np.arange(2, 50),
    'gamma': np.arange(1, 20),
    # 'min_child_weight': np.arange(0, 20),
    # 'subsample': np.arange(0.1, 1.0, 0.1),
    # 'colsample_bytree': np.arange(0.1, 1.0, 0.1),
    # 'reg_alpha': np.arange(0, 100),
    # 'reg_lambda': np.arange(0, 10, 0.01),
}

# Define the LGBMRegressor model
xgb_model = xgb.XGBRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    xgb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])
# 
# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model using the entire training dataset with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Train the final model with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With XGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))


# # Make predictions on the validation set
# y_pred = final_model.predict(X_te)

# # Evaluate the model on the validation set
# mae = mean_absolute_error(y_te, y_pred)
# print(f'Mean Absolute Error on Validation Set: {mae}')

In [None]:
param_space =  {
    'max_features': Categorical(['sqrt', 'log2']),
    'n_estimators': np.arange(50, 2001, 50),
    'bootstrap': Categorical([True, False]),
    'max_depth': np.arange(1, 20),
    'min_samples_leaf': np.arange(1, 20),
    'criterion': Categorical(['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
}
# Define the LGBMRegressor model
rf_model = RandomForestRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    rf_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
# Train the final model with the best hyperparameters
final_model = RandomForestRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With RF Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 200),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 10),
    'subsample': np.arange(0.1, 1.0, 0.1),
}
# Define the LGBMRegressor model
gb_model = GradientBoostingRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    gb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
final_model = GradientBoostingRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With GB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
def set_randomstate(model):
    current_params = model.get_params()

    # Update the random_state if it exists, otherwise add it to the parameters
    current_params['random_state'] = 101

    # Set the modified parameters back to the model
    model.set_params(**current_params)
    return model

kf = KFold(n_splits=5, shuffle=True, random_state=101)
models = {
    # 'Linear Regression': LinearRegression(),
    # 'Lasso': LassoCV(cv=kf, random_state=42, max_iter=200000),
    # 'Logistic': LogisticRegression(),
    # 'KNN': pd.read_pickle("result_KNN.pkl")['estimator'],
    'XGB': set_randomstate(pd.read_pickle("result_XGB.pkl")['estimator']),
    'LGBM': set_randomstate(pd.read_pickle("result_LGBM.pkl")['estimator']),
    # 'Ridge': RidgeCV(cv=kf),
    # 'SVR': pd.read_pickle("result_SVR.pkl")['estimator'],
    # 'Random Forest': RandomForestRegressor(bootstrap=False, max_depth=15, max_features=0.7,
    #                   min_samples_leaf=9, n_estimators=200, random_state=101),
    # 'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.01, max_depth=14, min_samples_leaf=3,
    #                       min_samples_split=4, n_estimators=950, random_state=101, subsample=0.8),
    # 'Artificial Neural Network': MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=(50, 50),
    #          max_iter=4000, random_state=101),
    # 'Gaussian Process Regression': GaussianProcessRegressor(0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5)))
}

In [None]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
        scoring=['neg_mean_absolute_error','neg_mean_squared_error']
        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=101)

        # Perform k-fold cross-validation and calculate MSE and MAE
        scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, n_jobs=-1)

        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()

        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Train and evaluate
results = evaluate_models(models, X_t, y_t)


In [None]:
def model_lr_evaluation(models, X_train, y_train, X_test, y_test):
    results = {}
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']

    for name, model in models.items():
        final_model = model
        
        scores = cross_validate(model, X_train, y_train, cv=3, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        
        final_model.fit(X_train, y_train)
        y_te_pred = final_model.predict(X_test)
        print(mean_absolute_error(y_test, y_te_pred), mean_squared_error(y_test, y_te_pred))
        
        print("---------------")
        
        onehot = OneHotEncoder()
        if name == 'LGBM':
            X_t_leaves = onehot.fit_transform(final_model.predict(X_train, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.fit_transform(final_model.apply(X_train))
        xgb_lr = LogisticRegression()
        xgb_lr.fit(X_t_leaves, y_train)

        if name == 'LGBM':
            X_t_leaves = onehot.transform(final_model.predict(X_train, pred_leaf=True))
            X_te_leaves = onehot.transform(final_model.predict(X_test, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.transform(final_model.apply(X_train))
            X_te_leaves = onehot.transform(final_model.apply(X_test))

        scores = cross_validate(model, X_t_leaves, y_train, cv=kf, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(mean_mae, mean_mse, std_mse) 
        
        y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
        print(mean_absolute_error(y_test, y_te_pred_xgb_lr), mean_squared_error(y_test, y_te_pred_xgb_lr))

model_lr_evaluation(models, X_t, y_t, X_te, y_te)

In [None]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [None]:
import pickle

result_final_with_test = {}
for name, model in models.items():
    result_final_with_test[name] = evaluate_models_with_test(model, X_t, y_t, X_te, y_te)

result_final_with_test

In [None]:
train_data_0 = train_data[train_data['clusters']==0]
train_data_1 = train_data[train_data['clusters']==1]
validate_data_0 = validate_data[validate_data['clusters']==0]
validate_data_1 = validate_data[validate_data['clusters']==1]
test_data_0 = test_data[test_data['clusters']==0]
test_data_1 = test_data[test_data['clusters']==1]
test_data_2 = test_data[test_data['clusters']==2]
test_data_1 = pd.concat([test_data_1, test_data_2], ignore_index=True)

import matplotlib.pyplot as plt

# Assuming you have three DataFrames: df1, df2, and df3

# Plotting DataFrame 1 in red
plt.plot(train_data_0['start_node_x'], train_data_0['start_node_y'], 'ro-', label='DF1 Start')
plt.plot(train_data_0['end_node_x'], train_data_0['end_node_y'], 'ro-', label='DF1 End')

# Plotting DataFrame 2 in green
plt.plot(train_data_1['start_node_x'], train_data_1['start_node_y'], 'go-', label='DF2 Start')
plt.plot(train_data_1['end_node_x'], train_data_1['end_node_y'], 'go-', label='DF2 End')

# Plotting DataFrame 3 in blue
plt.plot(validate_data_0['start_node_x'], validate_data_0['start_node_y'], 'bo-', label='DF3 Start')
plt.plot(validate_data_0['end_node_x'], validate_data_0['end_node_y'], 'bo-', label='DF3 End')
plt.plot(validate_data_1['start_node_x'], validate_data_1['start_node_y'], 'yo-', label='DF4 Start')
plt.plot(validate_data_1['end_node_x'], validate_data_1['end_node_y'], 'yo-', label='DF4 End')

plt.plot(test_data_0['start_node_x'], test_data_0['start_node_y'], 'co-', label='DF5 Start')
plt.plot(test_data_0['end_node_x'], test_data_0['end_node_y'], 'co-', label='DF5 End')
plt.plot(test_data_1['start_node_x'], test_data_1['start_node_y'], 'mo-', label='DF6 Start')
plt.plot(test_data_1['end_node_x'], test_data_1['end_node_y'], 'mo-', label='DF6 End')
# plt.plot(test_data_2['start_node_x'], test_data_2['start_node_y'], 'ko-', label='DF7 Start')
# plt.plot(test_data_2['end_node_x'], test_data_2['end_node_y'], 'ko-', label='DF7 End')
# Add labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Show the plot
plt.show()