In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

  from pandas.core import (


In [3]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'link_length', 'link_freespeed', 
                      'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 
                      'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes',
                      'income', 'score', 'income_avg', 'score_avg'
                     ]
category_feature = ['type', 'home-activity-zone']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
    'KNN': KNeighborsRegressor(),
    # 'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
#     'Linear':,
#     'Ridge':,
#     'Lasso':,
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
# 'GRNN':{
#     'sigma' : np.arange(0.1, 4, 0.01)
# }
}

In [4]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

In [5]:
# Find the indices where 'link_id' is 0
indices = Big_data.index[Big_data['link_id'] == 0].tolist()

# Add the end of the DataFrame to the indices list
indices.append(len(Big_data))

# Split the DataFrame using the indices
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
list_od = []
list_nodes = []
all_files = train_files + validate_files + test_files
for i in all_files:
    with open(i) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

# Assume `nodes` is a list of all node IDs and `edges` is a list of tuples (start_node, end_node, weight)
# For example:
# nodes = [1, 2, 3, 4, ...]
# edges = [(1, 2, 1.0), (2, 3, 2.5), (1, 3, 1.5), ...]
# And `o_d_pairs` is a list of tuples representing the O-D pairs:
# o_d_pairs = [(origin_1, destination_1), (origin_2, destination_2), ...]

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

In [6]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['planxml'] = cluster2.fit_predict(Big_data_new[['income', 'score', 'rush_hour',
                                                               'max_dur', 'cemdapStopDuration_s']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [14]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

In [16]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [19]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-03-05 23:51:41,739] A new study created in memory with name: no-name-4813c196-b84e-462f-a668-28ef1a75d5c0
[I 2024-03-05 23:51:42,574] Trial 0 finished with value: 0.8578973225251781 and parameters: {'selector__k': 38, 'model__n_neighbors': 3, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 0 with value: 0.8578973225251781.
[I 2024-03-05 23:51:46,704] Trial 1 finished with value: 0.8985507246376812 and parameters: {'selector__k': 32, 'model__n_neighbors': 45, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: 0.8985507246376812.
[I 2024-03-05 23:51:47,854] Trial 2 finished with value: 0.9163596168017686 and parameters: {'selector__k': 32, 'model__n_neighbors': 50, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 2 with value: 0.9163596168017686.
[I 2024-03-05 23:51:48,857] Trial 3 finished with value: 0.9180790960451978 and parameters: {'selector__k': 25, 'model__n_neighbors': 38, 'model__we

[I 2024-03-05 23:52:39,357] Trial 33 finished with value: 0.9163596168017686 and parameters: {'selector__k': 32, 'model__n_neighbors': 36, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 27 with value: 0.9184475558830754.
[I 2024-03-05 23:52:40,059] Trial 34 finished with value: 0.9064112011790715 and parameters: {'selector__k': 22, 'model__n_neighbors': 5, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 27 with value: 0.9184475558830754.
[I 2024-03-05 23:52:42,149] Trial 35 finished with value: 0.8987963645295997 and parameters: {'selector__k': 34, 'model__n_neighbors': 46, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 27 with value: 0.9184475558830754.
[I 2024-03-05 23:52:43,298] Trial 36 finished with value: 0.9167280766396463 and parameters: {'selector__k': 30, 'model__n_neighbors': 47, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 27 with value: 0.9184475558830754.
[I 2024-03-

KNN 0.9184475558830754 0.9338586745898463


[I 2024-03-05 23:53:04,771] Trial 0 finished with value: 0.8807418324735937 and parameters: {'selector__k': 16, 'model__learning_rate': 0.9285522933713125, 'model__n_estimators': 396, 'model__max_depth': 39, 'model__num_leaves': 14, 'model__min_child_samples': 10, 'model__subsample': 0.9900204663236737, 'model__colsample_bytree': 0.3644544821196235}. Best is trial 0 with value: 0.8807418324735937.
[I 2024-03-05 23:53:05,910] Trial 1 finished with value: 0.8929010071235569 and parameters: {'selector__k': 26, 'model__learning_rate': 0.5323687712891098, 'model__n_estimators': 250, 'model__max_depth': 26, 'model__num_leaves': 18, 'model__min_child_samples': 1, 'model__subsample': 0.860764693575216, 'model__colsample_bytree': 0.11199347182659802}. Best is trial 1 with value: 0.8929010071235569.
[I 2024-03-05 23:53:06,482] Trial 2 finished with value: 0.8602309014984033 and parameters: {'selector__k': 12, 'model__learning_rate': 0.8184882780562812, 'model__n_estimators': 74, 'model__max_dept

[I 2024-03-05 23:54:39,161] Trial 21 finished with value: 0.9228690739376074 and parameters: {'selector__k': 34, 'model__learning_rate': 0.3603416800679635, 'model__n_estimators': 1468, 'model__max_depth': 9, 'model__num_leaves': 37, 'model__min_child_samples': 16, 'model__subsample': 0.6791419174936939, 'model__colsample_bytree': 0.6948580205642807}. Best is trial 14 with value: 0.9250798329648735.
[I 2024-03-05 23:54:47,381] Trial 22 finished with value: 0.927536231884058 and parameters: {'selector__k': 41, 'model__learning_rate': 0.3619915633638472, 'model__n_estimators': 1537, 'model__max_depth': 20, 'model__num_leaves': 33, 'model__min_child_samples': 18, 'model__subsample': 0.701261768954891, 'model__colsample_bytree': 0.5584190325660325}. Best is trial 22 with value: 0.927536231884058.
[I 2024-03-05 23:54:55,283] Trial 23 finished with value: 0.9179562760992385 and parameters: {'selector__k': 41, 'model__learning_rate': 0.022739366059086184, 'model__n_estimators': 1895, 'model__

[I 2024-03-05 23:56:50,617] Trial 42 finished with value: 0.9267993122083026 and parameters: {'selector__k': 38, 'model__learning_rate': 0.372138063391367, 'model__n_estimators': 2296, 'model__max_depth': 50, 'model__num_leaves': 23, 'model__min_child_samples': 7, 'model__subsample': 0.3364407076179901, 'model__colsample_bytree': 0.5510029037341723}. Best is trial 41 with value: 0.9279046917219357.
[I 2024-03-05 23:57:03,408] Trial 43 finished with value: 0.9281503316138541 and parameters: {'selector__k': 38, 'model__learning_rate': 0.3057990550193053, 'model__n_estimators': 2958, 'model__max_depth': 48, 'model__num_leaves': 27, 'model__min_child_samples': 6, 'model__subsample': 0.3472644957166618, 'model__colsample_bytree': 0.6250927319118981}. Best is trial 43 with value: 0.9281503316138541.
[I 2024-03-05 23:57:10,624] Trial 44 finished with value: 0.8596782117415869 and parameters: {'selector__k': 38, 'model__learning_rate': 0.3569711406124567, 'model__n_estimators': 2885, 'model__m

LGBM 0.9307295504789979 0.8137191577315592


[I 2024-03-05 23:57:50,735] Trial 0 finished with value: 0.9118152788012773 and parameters: {'selector__k': 24, 'model__max_features': 'log2', 'model__n_estimators': 63, 'model__max_depth': 91, 'model__min_samples_leaf': 5}. Best is trial 0 with value: 0.9118152788012773.
[I 2024-03-05 23:58:38,570] Trial 1 finished with value: 0.9053058216654385 and parameters: {'selector__k': 24, 'model__max_features': 'log2', 'model__n_estimators': 947, 'model__max_depth': 34, 'model__min_samples_leaf': 10}. Best is trial 0 with value: 0.9118152788012773.
[I 2024-03-05 23:58:41,492] Trial 2 finished with value: 0.9048145418816016 and parameters: {'selector__k': 16, 'model__max_features': 'log2', 'model__n_estimators': 74, 'model__max_depth': 158, 'model__min_samples_leaf': 18}. Best is trial 0 with value: 0.9118152788012773.
[I 2024-03-05 23:59:00,899] Trial 3 finished with value: 0.9073937607467453 and parameters: {'selector__k': 11, 'model__max_features': 'log2', 'model__n_estimators': 628, 'model

[I 2024-03-06 00:20:56,883] Trial 30 finished with value: 0.9113239990174404 and parameters: {'selector__k': 40, 'model__max_features': 'sqrt', 'model__n_estimators': 112, 'model__max_depth': 109, 'model__min_samples_leaf': 6}. Best is trial 21 with value: 0.9200442151805454.
[I 2024-03-06 00:21:34,933] Trial 31 finished with value: 0.917096536477524 and parameters: {'selector__k': 36, 'model__max_features': 'sqrt', 'model__n_estimators': 335, 'model__max_depth': 142, 'model__min_samples_leaf': 2}. Best is trial 21 with value: 0.9200442151805454.
[I 2024-03-06 00:22:09,618] Trial 32 finished with value: 0.9172193564234832 and parameters: {'selector__k': 35, 'model__max_features': 'sqrt', 'model__n_estimators': 342, 'model__max_depth': 141, 'model__min_samples_leaf': 2}. Best is trial 21 with value: 0.9200442151805454.
[I 2024-03-06 00:22:43,747] Trial 33 finished with value: 0.9188160157209531 and parameters: {'selector__k': 40, 'model__max_features': 'sqrt', 'model__n_estimators': 253

RF 0.926430852370425 0.9484562717995091


In [None]:
import pickle
with open('best_model_clf.pickle', 'wb') as f:
    pickle.dump(best_model_clf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
best_model_clf = pd.read_pickle("best_model_clf.pickle")

In [20]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [21]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [22]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-06 00:57:04,238] A new study created in memory with name: no-name-1fb6893c-1009-4009-a03b-00da596f43f9
[I 2024-03-06 00:57:06,224] Trial 0 finished with value: -11.808884065358834 and parameters: {'selector__k': 12, 'model__n_neighbors': 27, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: -11.808884065358834.
[I 2024-03-06 00:57:08,135] Trial 1 finished with value: -20.021429180738597 and parameters: {'selector__k': 28, 'model__n_neighbors': 41, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -11.808884065358834.
[I 2024-03-06 00:57:13,728] Trial 2 finished with value: -21.782035336089606 and parameters: {'selector__k': 38, 'model__n_neighbors': 47, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 0 with value: -11.808884065358834.
[I 2024-03-06 00:57:17,250] Trial 3 finished with value: -13.982484630553301 and parameters: {'selector__k': 18, 'model__n_neighbors': 37,

[I 2024-03-06 00:58:36,415] Trial 33 finished with value: -15.918500156216599 and parameters: {'selector__k': 20, 'model__n_neighbors': 36, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 25 with value: -11.78219537037843.
[I 2024-03-06 00:58:40,219] Trial 34 finished with value: -11.750979111093462 and parameters: {'selector__k': 17, 'model__n_neighbors': 41, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 34 with value: -11.750979111093462.
[I 2024-03-06 00:58:43,467] Trial 35 finished with value: -12.939813953644355 and parameters: {'selector__k': 18, 'model__n_neighbors': 42, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 34 with value: -11.750979111093462.
[I 2024-03-06 00:58:47,155] Trial 36 finished with value: -11.742278789074705 and parameters: {'selector__k': 12, 'model__n_neighbors': 45, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 36 with value: -11.742278789074705.
[I 2

KNN -11.729695603298707 10.655773210894187 469.18341351036077 202.08510638297872


[I 2024-03-06 00:59:43,087] Trial 0 finished with value: -11.06442904178088 and parameters: {'selector__k': 10, 'model__learning_rate': 0.1298898032070535, 'model__n_estimators': 241, 'model__max_depth': 15, 'model__num_leaves': 4, 'model__min_child_samples': 9, 'model__subsample': 0.5963704922453544, 'model__colsample_bytree': 0.23812346620435793}. Best is trial 0 with value: -11.06442904178088.
[I 2024-03-06 01:00:46,026] Trial 1 finished with value: -10.626640809333264 and parameters: {'selector__k': 20, 'model__learning_rate': 0.23929067052538722, 'model__n_estimators': 1218, 'model__max_depth': 13, 'model__num_leaves': 48, 'model__min_child_samples': 15, 'model__subsample': 0.8690904168459698, 'model__colsample_bytree': 0.5731903704434453}. Best is trial 1 with value: -10.626640809333264.
[I 2024-03-06 01:00:56,108] Trial 2 finished with value: -11.609364502056392 and parameters: {'selector__k': 38, 'model__learning_rate': 0.11910505409670201, 'model__n_estimators': 693, 'model__m

[I 2024-03-06 01:02:13,991] Trial 21 finished with value: -10.272400166041798 and parameters: {'selector__k': 13, 'model__learning_rate': 0.035682919277187065, 'model__n_estimators': 89, 'model__max_depth': 32, 'model__num_leaves': 46, 'model__min_child_samples': 17, 'model__subsample': 0.8623060247183754, 'model__colsample_bytree': 0.7734465690108295}. Best is trial 17 with value: -9.784270808033506.
[I 2024-03-06 01:02:16,648] Trial 22 finished with value: -9.95496239199219 and parameters: {'selector__k': 13, 'model__learning_rate': 0.042514453307390274, 'model__n_estimators': 192, 'model__max_depth': 31, 'model__num_leaves': 36, 'model__min_child_samples': 17, 'model__subsample': 0.8402015080159935, 'model__colsample_bytree': 0.7790281163073846}. Best is trial 17 with value: -9.784270808033506.
[I 2024-03-06 01:02:19,426] Trial 23 finished with value: -9.95423254144458 and parameters: {'selector__k': 18, 'model__learning_rate': 0.10811085960949543, 'model__n_estimators': 189, 'model

[I 2024-03-06 01:03:44,772] Trial 42 finished with value: -12.59117753087435 and parameters: {'selector__k': 15, 'model__learning_rate': 0.08062632650947862, 'model__n_estimators': 120, 'model__max_depth': 1, 'model__num_leaves': 46, 'model__min_child_samples': 8, 'model__subsample': 0.837789643427303, 'model__colsample_bytree': 0.7130482015651745}. Best is trial 17 with value: -9.784270808033506.
[I 2024-03-06 01:03:46,633] Trial 43 finished with value: -10.012085121656492 and parameters: {'selector__k': 12, 'model__learning_rate': 0.15518197070600442, 'model__n_estimators': 128, 'model__max_depth': 11, 'model__num_leaves': 44, 'model__min_child_samples': 10, 'model__subsample': 0.8873509354222566, 'model__colsample_bytree': 0.6780284252178166}. Best is trial 17 with value: -9.784270808033506.
[I 2024-03-06 01:03:48,214] Trial 44 finished with value: -10.488943944359242 and parameters: {'selector__k': 16, 'model__learning_rate': 0.21965006386865515, 'model__n_estimators': 90, 'model__

LGBM -9.784270808033506 9.479743234205126 315.8507670605643 177.21487829380516


[I 2024-03-06 01:04:04,916] Trial 0 finished with value: -13.611644294629343 and parameters: {'selector__k': 24, 'model__max_features': 'log2', 'model__n_estimators': 96, 'model__max_depth': 11, 'model__min_samples_leaf': 1}. Best is trial 0 with value: -13.611644294629343.
[I 2024-03-06 01:04:09,797] Trial 1 finished with value: -12.863949947021672 and parameters: {'selector__k': 29, 'model__max_features': 'log2', 'model__n_estimators': 65, 'model__max_depth': 157, 'model__min_samples_leaf': 17}. Best is trial 1 with value: -12.863949947021672.
[I 2024-03-06 01:04:33,328] Trial 2 finished with value: -14.041840118058976 and parameters: {'selector__k': 41, 'model__max_features': 'log2', 'model__n_estimators': 265, 'model__max_depth': 44, 'model__min_samples_leaf': 20}. Best is trial 1 with value: -12.863949947021672.
[I 2024-03-06 01:04:56,287] Trial 3 finished with value: -13.191760263558354 and parameters: {'selector__k': 30, 'model__max_features': 'log2', 'model__n_estimators': 306,

[I 2024-03-06 01:28:11,452] Trial 30 finished with value: -10.221225927697978 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 2595, 'model__max_depth': 26, 'model__min_samples_leaf': 1}. Best is trial 30 with value: -10.221225927697978.
[I 2024-03-06 01:30:51,050] Trial 31 finished with value: -10.2489709551963 and parameters: {'selector__k': 14, 'model__max_features': 'sqrt', 'model__n_estimators': 2014, 'model__max_depth': 28, 'model__min_samples_leaf': 1}. Best is trial 30 with value: -10.221225927697978.
[I 2024-03-06 01:34:06,485] Trial 32 finished with value: -10.229518517173478 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 2458, 'model__max_depth': 25, 'model__min_samples_leaf': 1}. Best is trial 30 with value: -10.221225927697978.
[I 2024-03-06 01:39:16,792] Trial 33 finished with value: -10.226524976896675 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators

RF -10.190886546565185 9.983851615750725 395.81971632069866 185.52828878861507


[I 2024-03-06 02:18:02,555] Trial 0 finished with value: -10.418727414410728 and parameters: {'selector__k': 13, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'relu', 'model__solver': 'adam', 'model__alpha': 0.0037586482280535286}. Best is trial 0 with value: -10.418727414410728.
[I 2024-03-06 02:18:57,905] Trial 1 finished with value: -11.167580672131479 and parameters: {'selector__k': 19, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'tanh', 'model__solver': 'adam', 'model__alpha': 74.32211631939198}. Best is trial 0 with value: -10.418727414410728.
[I 2024-03-06 02:19:01,767] Trial 2 finished with value: -25.80707549585387 and parameters: {'selector__k': 20, 'model__hidden_layer_sizes': (50,), 'model__activation': 'tanh', 'model__solver': 'sgd', 'model__alpha': 6.68571736258295}. Best is trial 0 with value: -10.418727414410728.
[I 2024-03-06 02:20:39,474] Trial 3 finished with value: -23.251043275991744 and parameters: {'selector__k': 44, 'model__hidd

[W 2024-03-06 03:11:54,847] Trial 26 failed with value nan.
[W 2024-03-06 03:15:02,789] Trial 27 failed with parameters: {'selector__k': 16, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__alpha': 0.005185143868193359} because of the following error: The value nan is not acceptable.
[W 2024-03-06 03:15:02,792] Trial 27 failed with value nan.
[W 2024-03-06 03:17:59,790] Trial 28 failed with parameters: {'selector__k': 16, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__alpha': 0.005583740848226032} because of the following error: The value nan is not acceptable.
[W 2024-03-06 03:17:59,791] Trial 28 failed with value nan.
[W 2024-03-06 03:21:10,625] Trial 29 failed with parameters: {'selector__k': 33, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'relu', 'model__solver': 'sgd', 'model__alpha': 0.0028250382088416026} because of the following error: The value nan is n

ANN -10.293587393841841 11.0429549565129 441.85978423764556 283.84423682092455


In [23]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

In [None]:
X_t_onlyreg

In [24]:
best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)



[I 2024-03-06 10:01:27,022] A new study created in memory with name: no-name-d0efc9e6-775d-4638-9000-0ddd3f06b6e5
[I 2024-03-06 10:01:29,282] Trial 0 finished with value: -10.153258323298045 and parameters: {'selector__k': 10, 'model__n_neighbors': 41, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -10.153258323298045.
[I 2024-03-06 10:01:32,208] Trial 1 finished with value: -18.88677293824096 and parameters: {'selector__k': 39, 'model__n_neighbors': 38, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -10.153258323298045.
[I 2024-03-06 10:01:33,427] Trial 2 finished with value: -15.741680754836938 and parameters: {'selector__k': 36, 'model__n_neighbors': 17, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 0 with value: -10.153258323298045.
[I 2024-03-06 10:01:34,782] Trial 3 finished with value: -20.17635920359382 and parameters: {'selector__k': 45, 'model__n_neighbors': 3, 'm

[I 2024-03-06 10:02:25,477] Trial 33 finished with value: -10.265610941593781 and parameters: {'selector__k': 12, 'model__n_neighbors': 43, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 27 with value: -9.878941714309711.
[I 2024-03-06 10:02:27,143] Trial 34 finished with value: -19.091438085100577 and parameters: {'selector__k': 43, 'model__n_neighbors': 45, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 27 with value: -9.878941714309711.
[I 2024-03-06 10:02:28,219] Trial 35 finished with value: -9.872493089235354 and parameters: {'selector__k': 15, 'model__n_neighbors': 48, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 35 with value: -9.872493089235354.
[I 2024-03-06 10:02:30,723] Trial 36 finished with value: -10.1521815892901 and parameters: {'selector__k': 15, 'model__n_neighbors': 48, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 35 with value: -9.872493089235354.
[I 2024-03

KNN -9.872493089235354 9.932608661553136 418.10448024140703 200.7568774030666


[I 2024-03-06 10:02:48,449] Trial 0 finished with value: -12.363398117333155 and parameters: {'selector__k': 42, 'model__learning_rate': 0.6115598031644136, 'model__n_estimators': 80, 'model__max_depth': 43, 'model__num_leaves': 12, 'model__min_child_samples': 13, 'model__subsample': 0.9470338545057777, 'model__colsample_bytree': 0.27576027240696266}. Best is trial 0 with value: -12.363398117333155.
[I 2024-03-06 10:02:51,267] Trial 1 finished with value: -12.649426674964259 and parameters: {'selector__k': 47, 'model__learning_rate': 0.8104617047021242, 'model__n_estimators': 382, 'model__max_depth': 18, 'model__num_leaves': 47, 'model__min_child_samples': 14, 'model__subsample': 0.26027252322513034, 'model__colsample_bytree': 0.4955214583705506}. Best is trial 0 with value: -12.363398117333155.
[I 2024-03-06 10:02:52,538] Trial 2 finished with value: -8.697597782213556 and parameters: {'selector__k': 21, 'model__learning_rate': 0.07354689053405343, 'model__n_estimators': 612, 'model__

[I 2024-03-06 10:03:24,961] Trial 21 finished with value: -9.217489669596935 and parameters: {'selector__k': 17, 'model__learning_rate': 0.12891389489170402, 'model__n_estimators': 1179, 'model__max_depth': 13, 'model__num_leaves': 32, 'model__min_child_samples': 18, 'model__subsample': 0.4315730785031385, 'model__colsample_bytree': 0.4158092153912235}. Best is trial 16 with value: -8.439148094560041.
[I 2024-03-06 10:03:25,847] Trial 22 finished with value: -8.94861766568091 and parameters: {'selector__k': 10, 'model__learning_rate': 0.26073867119529304, 'model__n_estimators': 272, 'model__max_depth': 7, 'model__num_leaves': 24, 'model__min_child_samples': 12, 'model__subsample': 0.7081182331329363, 'model__colsample_bytree': 0.5393034361202179}. Best is trial 16 with value: -8.439148094560041.
[I 2024-03-06 10:03:26,394] Trial 23 finished with value: -12.552522941059618 and parameters: {'selector__k': 19, 'model__learning_rate': 0.017210939166991762, 'model__n_estimators': 64, 'model

[I 2024-03-06 10:03:46,876] Trial 42 finished with value: -8.638429382694278 and parameters: {'selector__k': 15, 'model__learning_rate': 0.16144161760439374, 'model__n_estimators': 81, 'model__max_depth': 9, 'model__num_leaves': 34, 'model__min_child_samples': 8, 'model__subsample': 0.30880636503586834, 'model__colsample_bytree': 0.646071355484503}. Best is trial 16 with value: -8.439148094560041.
[I 2024-03-06 10:03:47,574] Trial 43 finished with value: -8.792343432152038 and parameters: {'selector__k': 15, 'model__learning_rate': 0.2312744144438529, 'model__n_estimators': 90, 'model__max_depth': 18, 'model__num_leaves': 39, 'model__min_child_samples': 8, 'model__subsample': 0.31039958000792633, 'model__colsample_bytree': 0.6634244469223737}. Best is trial 16 with value: -8.439148094560041.
[I 2024-03-06 10:03:48,288] Trial 44 finished with value: -8.553138088543747 and parameters: {'selector__k': 12, 'model__learning_rate': 0.16935402102559274, 'model__n_estimators': 76, 'model__max_

LGBM -8.439148094560041 10.347945784227322 457.0516525237453 256.7074891522627


[I 2024-03-06 10:04:25,916] Trial 0 finished with value: -12.12611884664912 and parameters: {'selector__k': 44, 'model__max_features': 'sqrt', 'model__n_estimators': 266, 'model__max_depth': 94, 'model__min_samples_leaf': 19}. Best is trial 0 with value: -12.12611884664912.
[I 2024-03-06 10:06:34,395] Trial 1 finished with value: -11.83282643197764 and parameters: {'selector__k': 34, 'model__max_features': 'log2', 'model__n_estimators': 1123, 'model__max_depth': 32, 'model__min_samples_leaf': 11}. Best is trial 1 with value: -11.83282643197764.
[I 2024-03-06 10:07:05,257] Trial 2 finished with value: -9.117393316321644 and parameters: {'selector__k': 19, 'model__max_features': 'log2', 'model__n_estimators': 350, 'model__max_depth': 184, 'model__min_samples_leaf': 7}. Best is trial 2 with value: -9.117393316321644.
[I 2024-03-06 10:09:51,770] Trial 3 finished with value: -11.908909639572974 and parameters: {'selector__k': 40, 'model__max_features': 'log2', 'model__n_estimators': 1130, '

[I 2024-03-06 10:24:15,456] Trial 30 finished with value: -9.416670078302415 and parameters: {'selector__k': 17, 'model__max_features': 'sqrt', 'model__n_estimators': 210, 'model__max_depth': 85, 'model__min_samples_leaf': 14}. Best is trial 23 with value: -8.821483713584065.
[I 2024-03-06 10:24:32,078] Trial 31 finished with value: -8.919255293403063 and parameters: {'selector__k': 15, 'model__max_features': 'sqrt', 'model__n_estimators': 137, 'model__max_depth': 120, 'model__min_samples_leaf': 1}. Best is trial 23 with value: -8.821483713584065.
[I 2024-03-06 10:24:36,452] Trial 32 finished with value: -8.93530803397586 and parameters: {'selector__k': 12, 'model__max_features': 'sqrt', 'model__n_estimators': 81, 'model__max_depth': 100, 'model__min_samples_leaf': 2}. Best is trial 23 with value: -8.821483713584065.
[I 2024-03-06 10:24:47,155] Trial 33 finished with value: -9.190008018849706 and parameters: {'selector__k': 17, 'model__max_features': 'sqrt', 'model__n_estimators': 109,

RF -8.776028278834715 9.323651289431345 356.962203714851 206.56060606060606


[I 2024-03-06 10:32:47,093] Trial 0 finished with value: -22.46114043190607 and parameters: {'selector__k': 33, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 12455.30660073751}. Best is trial 0 with value: -22.46114043190607.
[I 2024-03-06 10:33:36,305] Trial 1 finished with value: -22.944437152488632 and parameters: {'selector__k': 38, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'logistic', 'model__solver': 'sgd', 'model__alpha': 0.0023524422881788173}. Best is trial 0 with value: -22.46114043190607.
[I 2024-03-06 10:34:48,064] Trial 2 finished with value: -22.556848538517894 and parameters: {'selector__k': 19, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 1640.4285880813466}. Best is trial 0 with value: -22.46114043190607.
[I 2024-03-06 10:37:13,835] Trial 3 finished with value: -36.84749959715344 and parameters: {'selector__k':

[I 2024-03-06 11:42:08,006] Trial 28 finished with value: -9.239932110358366 and parameters: {'selector__k': 12, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'tanh', 'model__solver': 'adam', 'model__alpha': 0.661824507566603}. Best is trial 13 with value: -8.782008806065559.
[I 2024-03-06 11:46:49,280] Trial 29 finished with value: -22.18749737343873 and parameters: {'selector__k': 22, 'model__hidden_layer_sizes': (100, 100), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 23307.05182477937}. Best is trial 13 with value: -8.782008806065559.
[I 2024-03-06 11:47:51,706] Trial 30 finished with value: -19.6841444766275 and parameters: {'selector__k': 26, 'model__hidden_layer_sizes': (50,), 'model__activation': 'logistic', 'model__solver': 'adam', 'model__alpha': 5438.673873103818}. Best is trial 13 with value: -8.782008806065559.
[I 2024-03-06 11:50:56,533] Trial 31 finished with value: -8.755830543594172 and parameters: {'selector__k': 15, 'model

ANN -8.755830543594172 9.415229085940808 369.24571713482305 469.3681818279507


In [25]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-06 12:41:07,953] A new study created in memory with name: no-name-9d1e97e0-ad39-4fbe-9104-2361dad6cb8a
[I 2024-03-06 12:41:16,231] Trial 0 finished with value: -21.531212642266436 and parameters: {'n_neighbors': 30, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -21.531212642266436.
[I 2024-03-06 12:41:25,371] Trial 1 finished with value: -21.669243272341472 and parameters: {'n_neighbors': 47, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -21.531212642266436.
[I 2024-03-06 12:41:26,891] Trial 2 finished with value: -21.593307035415496 and parameters: {'n_neighbors': 34, 'weights': 'uniform', 'algorithm': 'brute'}. Best is trial 0 with value: -21.531212642266436.
[I 2024-03-06 12:41:28,432] Trial 3 finished with value: -21.6138640482782 and parameters: {'n_neighbors': 37, 'weights': 'uniform', 'algorithm': 'brute'}. Best is trial 0 with value: -21.531212642266436.
[I 2024-03-06 12:41:31,467] Trial 4 finished with va

[I 2024-03-06 12:44:28,913] Trial 39 finished with value: -21.27720468997154 and parameters: {'n_neighbors': 40, 'weights': 'distance', 'algorithm': 'kd_tree'}. Best is trial 28 with value: -20.783484601448286.
[I 2024-03-06 12:44:32,854] Trial 40 finished with value: -21.236105993613364 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'algorithm': 'kd_tree'}. Best is trial 28 with value: -20.783484601448286.
[I 2024-03-06 12:44:34,362] Trial 41 finished with value: -20.80225394539705 and parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 28 with value: -20.783484601448286.
[I 2024-03-06 12:44:35,993] Trial 42 finished with value: -20.83152568167121 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 28 with value: -20.783484601448286.
[I 2024-03-06 12:44:37,457] Trial 43 finished with value: -20.92091893124984 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'brute'}. Best

KNN -20.78348460144828 19.349740828939886 962.855741632449 259.2251187786917


[I 2024-03-06 12:44:54,416] Trial 0 finished with value: -12.509127745023783 and parameters: {'learning_rate': 0.5622981053668684, 'n_estimators': 721, 'max_depth': 28, 'num_leaves': 28, 'min_child_samples': 5, 'subsample': 0.31240851423622445, 'colsample_bytree': 0.2848625830362576}. Best is trial 0 with value: -12.509127745023783.
[I 2024-03-06 12:45:15,433] Trial 1 finished with value: -12.098096690302008 and parameters: {'learning_rate': 0.4709694138499216, 'n_estimators': 2317, 'max_depth': 39, 'num_leaves': 28, 'min_child_samples': 3, 'subsample': 0.8029934468934813, 'colsample_bytree': 0.805734185671378}. Best is trial 1 with value: -12.098096690302008.
[I 2024-03-06 12:45:16,459] Trial 2 finished with value: -11.399994022506117 and parameters: {'learning_rate': 0.21094728603842836, 'n_estimators': 75, 'max_depth': 50, 'num_leaves': 21, 'min_child_samples': 2, 'subsample': 0.383954822557372, 'colsample_bytree': 0.7891154583828867}. Best is trial 2 with value: -11.399994022506117

[I 2024-03-06 12:47:34,518] Trial 25 finished with value: -11.212551647390281 and parameters: {'learning_rate': 0.13125215148682318, 'n_estimators': 436, 'max_depth': 21, 'num_leaves': 47, 'min_child_samples': 10, 'subsample': 0.6245704732798223, 'colsample_bytree': 0.5137364475199928}. Best is trial 12 with value: -10.374137880994173.
[I 2024-03-06 12:47:49,160] Trial 26 finished with value: -11.938038547223567 and parameters: {'learning_rate': 0.40280579851516707, 'n_estimators': 1720, 'max_depth': 11, 'num_leaves': 34, 'min_child_samples': 20, 'subsample': 0.2009132831441548, 'colsample_bytree': 0.4362418257725746}. Best is trial 12 with value: -10.374137880994173.
[I 2024-03-06 12:47:53,640] Trial 27 finished with value: -10.764968837495546 and parameters: {'learning_rate': 0.28465371839396053, 'n_estimators': 959, 'max_depth': 4, 'num_leaves': 40, 'min_child_samples': 13, 'subsample': 0.4664940989846848, 'colsample_bytree': 0.2564883912325336}. Best is trial 12 with value: -10.374

[I 2024-03-06 12:49:57,172] A new study created in memory with name: no-name-c4312bb8-47f4-4c61-90ee-a6c19458c8e8


LGBM -9.813017166878337 10.610752934525996 342.9104220412855 202.0727365224596


[I 2024-03-06 12:50:25,019] Trial 0 finished with value: -12.079886897800455 and parameters: {'max_features': 'sqrt', 'n_estimators': 165, 'max_depth': 144, 'min_samples_leaf': 8}. Best is trial 0 with value: -12.079886897800455.
[I 2024-03-06 12:51:39,397] Trial 1 finished with value: -11.83206945962868 and parameters: {'max_features': 'log2', 'n_estimators': 424, 'max_depth': 48, 'min_samples_leaf': 3}. Best is trial 1 with value: -11.83206945962868.
[I 2024-03-06 12:56:20,816] Trial 2 finished with value: -11.771961949477161 and parameters: {'max_features': 'log2', 'n_estimators': 1447, 'max_depth': 156, 'min_samples_leaf': 2}. Best is trial 2 with value: -11.771961949477161.
[I 2024-03-06 12:56:26,956] Trial 3 finished with value: -15.509359185829355 and parameters: {'max_features': 'log2', 'n_estimators': 212, 'max_depth': 2, 'min_samples_leaf': 16}. Best is trial 2 with value: -11.771961949477161.
[I 2024-03-06 12:56:56,745] Trial 4 finished with value: -12.013585549477433 and pa

[I 2024-03-06 14:29:32,175] Trial 36 finished with value: -11.991082837926871 and parameters: {'max_features': 'log2', 'n_estimators': 751, 'max_depth': 153, 'min_samples_leaf': 4}. Best is trial 12 with value: -11.685473930781528.
[I 2024-03-06 14:37:51,287] Trial 37 finished with value: -11.651151535249324 and parameters: {'max_features': 'sqrt', 'n_estimators': 2000, 'max_depth': 96, 'min_samples_leaf': 1}. Best is trial 37 with value: -11.651151535249324.
[I 2024-03-06 14:48:52,203] Trial 38 finished with value: -11.962591133985752 and parameters: {'max_features': 'sqrt', 'n_estimators': 2247, 'max_depth': 96, 'min_samples_leaf': 6}. Best is trial 37 with value: -11.651151535249324.
[I 2024-03-06 14:54:54,947] Trial 39 finished with value: -11.751217706314941 and parameters: {'max_features': 'sqrt', 'n_estimators': 1684, 'max_depth': 51, 'min_samples_leaf': 2}. Best is trial 37 with value: -11.651151535249324.
[I 2024-03-06 15:02:10,992] Trial 40 finished with value: -11.8502422101

RF -11.651151535249324 9.915961623636665 345.507688949671 209.9105


[I 2024-03-06 15:47:12,026] Trial 0 finished with value: -24.339151652811022 and parameters: {'hidden_layer_sizes': (30, 30, 30), 'activation': 'relu', 'solver': 'adam', 'alpha': 5.1873746758734936e-05}. Best is trial 0 with value: -24.339151652811022.
[W 2024-03-06 15:51:25,095] Trial 1 failed with parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'relu', 'solver': 'sgd', 'alpha': 6.549279862090275} because of the following error: The value nan is not acceptable.
[W 2024-03-06 15:51:25,096] Trial 1 failed with value nan.
[W 2024-03-06 15:55:49,635] Trial 2 failed with parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'relu', 'solver': 'sgd', 'alpha': 50.5445369700774} because of the following error: The value nan is not acceptable.
[W 2024-03-06 15:55:49,637] Trial 2 failed with value nan.
[I 2024-03-06 15:57:22,766] Trial 3 finished with value: -16.465562357260083 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'logistic', 'solver': 'adam', 'alpha':

[I 2024-03-06 16:47:24,586] Trial 33 finished with value: -28.882583612267762 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.004602952097008432}. Best is trial 27 with value: -11.562705721389749.
[I 2024-03-06 16:48:56,632] Trial 34 finished with value: -15.669992131539033 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'logistic', 'solver': 'adam', 'alpha': 114.22832799075934}. Best is trial 27 with value: -11.562705721389749.
[I 2024-03-06 16:50:52,855] Trial 35 finished with value: -12.773632253940384 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'logistic', 'solver': 'adam', 'alpha': 556.5551822445497}. Best is trial 27 with value: -11.562705721389749.
[I 2024-03-06 16:53:27,720] Trial 36 finished with value: -19.88276095625377 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'logistic', 'solver': 'adam', 'alpha': 8542.628941198647}. Best is trial 27 with value: -11.562705721389749.
[I 2024-0

ANN -11.562705721389749 12.091260494115376 440.04431464163446 220.50840443176978


In [None]:
with open('best_model_onlyreg.pickle', 'wb') as f:
    pickle.dump(best_model_onlyreg, f, pickle.HIGHEST_PROTOCOL)

In [None]:

# X_t = X_t_old[selected_features] 
# X_t['cluster'] = train_data_tr['remainder__clusters']
# X_t['remainder__link_counts'] = train_data_tr['remainder__link_counts']
# X_v = validate_data_tr[selected_features]
# X_v['cluster'] = validate_data_tr['remainder__clusters']
# X_v['remainder__link_counts'] = validate_data_tr['remainder__link_counts']
# X_te = test_data_tr[selected_features]
# X_te['cluster'] = test_data_tr['remainder__clusters']
# X_te['remainder__link_counts'] = test_data_tr['remainder__link_counts']

In [None]:
selected_features

In [None]:
model = xgb.XGBRegressor(random_state=101)
opt = BayesSearchCV(
    model,
    param_space['XGB'],
    n_iter=50,  # Adjust the number of iterations based on your computational resources
    cv=ShuffleSplit(test_size=0.20, n_splits=3),  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

opt.fit(X_t, y_t)
best_params = opt.best_params_ 

model = xgb.XGBRegressor(**best_params, random_state=101)
model.fit(X_t, y_t)
te_predictions = model.predict(X_te)
te_mae = mean_absolute_error(y_te, te_predictions)
print(te_mae)

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(model.apply(X_t))
rf_lr = LogisticRegressionCV(cv=ShuffleSplit(test_size=0.20, n_splits=3))
rf_lr.fit(X_t_leaves, y_t)
X_te_leaves = onehot.transform(model.apply(X_te))
y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
te_rflr_mae = mean_absolute_error(y_te, y_te_pred_rf_lr)
print(te_rflr_mae)

In [None]:
v_mae_list = []
te_mae_list = []
v_rf_mae_list = []
te_rf_mae_list = []
# X_te['cluster'] = 1
for cluster_label in list(set(X_t['cluster'])):
    X_v_cluster, X_te_cluster = None, None
    v_mae, te_mae, v_rflr_mae, te_rflr_mae = None, None, None, None
    # Subset the training data for the current cluster
    X_t_cluster = X_t[X_t['cluster'] == cluster_label]
    y_t_cluster = X_t_cluster['remainder__link_counts']
    X_t_cluster = X_t_cluster.drop(columns=['remainder__link_counts','cluster'])  
    
    if cluster_label in list(set(X_v['cluster'])):
        X_v_cluster = X_v[X_v['cluster'] == cluster_label]
        y_v_cluster = X_v_cluster['remainder__link_counts']
        X_v_cluster = X_v_cluster.drop(columns=['remainder__link_counts','cluster'])  
    if cluster_label in list(set(X_te['cluster'])):
        X_te_cluster = X_te[X_te['cluster'] == cluster_label]
        y_te_cluster = X_te_cluster['remainder__link_counts']
        X_te_cluster = X_te_cluster.drop(columns=['remainder__link_counts','cluster'])  
    

    # Train a regression model for the current cluster
    # model = RidgeCV(scoring='neg_mean_absolute_error')
    # model = RandomForestRegressor(criterion='friedman_mse', n_estimators=300,
    #                   random_state=101)

    if not ((X_v_cluster is None) and (X_te_cluster is None)):

        model = xgb.XGBRegressor(random_state=101)
        opt = BayesSearchCV(
            model,
            param_space['XGB'],
            n_iter=60,  # Adjust the number of iterations based on your computational resources
            cv=ShuffleSplit(test_size=0.20, n_splits=1),  # Adjust the number of cross-validation folds
            scoring='neg_mean_absolute_error',  # Use a suitable regression metric
            n_jobs=-1,
        )
        
        opt.fit(X_t_cluster, y_t_cluster)
        best_params = opt.best_params_ 
        onehot = OneHotEncoder()
        model = xgb.XGBRegressor(**best_params, random_state=101)
        model.fit(X_t_cluster, y_t_cluster)

        X_t_leaves = onehot.fit_transform(model.apply(X_t_cluster))
        rf_lr = LogisticRegression()
        rf_lr.fit(X_t_leaves, y_t_cluster)
        if (X_v_cluster is not None):
            v_predictions = model.predict(X_v_cluster)
            v_mae = mean_absolute_error(y_v_cluster, v_predictions)
            row_count = len(y_v_cluster)
            v_mae_list.append(v_mae*row_count)

            X_v_leaves = onehot.transform(model.apply(X_v_cluster))
            y_v_pred_rf_lr = rf_lr.predict(X_v_leaves)
            row_count = len(y_v_pred_rf_lr)
            v_rflr_mae = mean_absolute_error(y_v_cluster, y_v_pred_rf_lr)
            v_rf_mae_list.append(v_rflr_mae*row_count)
        if (X_te_cluster is not None):
            te_predictions = model.predict(X_te_cluster)
            te_mae = mean_absolute_error(y_te_cluster, te_predictions)
            row_count = len(y_te_cluster)
            te_mae_list.append(te_mae*row_count)
      
            X_te_leaves = onehot.transform(model.apply(X_te_cluster))
            y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
            row_count = len(y_te_pred_rf_lr)
            te_rflr_mae = mean_absolute_error(y_te_cluster, y_te_pred_rf_lr)
            te_rf_mae_list.append(te_rflr_mae*row_count)
    print(cluster_label, v_mae, te_mae, v_rflr_mae, te_rflr_mae)
v_mae_list = list(filter(lambda item: item is not None, v_mae_list))    
te_mae_list = list(filter(lambda item: item is not None, te_mae_list))    
v_mean_mae = sum(v_mae_list)/len(X_v)
te_mean_mae = sum(te_mae_list)/len(X_te)
print(v_mean_mae)
print(te_mean_mae)    

In [None]:
best_params = None
best_mae = 0.0

# Generate all combinations of hyperparameters
param_combinations = product(*param_grid.values())
# Loop through hyperparameter combinations
for params in param_combinations:
    # Create and train the model with the current hyperparameters
    model = lgb.LGBMRegressor(**dict(zip(param_grid.keys(), params)), random_state=101)
    model.fit(X_t[X_v.columns], y_t)

    # Evaluate the model on the validation dataset
    y_val_pred = model.predict(X_v)
    mae = mean_absolute_error(y_v, y_val_pred)

    # Check if current hyperparameters are the best
    if mae > best_mae:
        best_mae = mae
        best_params = dict(zip(param_grid.keys(), params))
print(best_mae)
print(best_params)
# Train the final model using the entire training dataset with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Define the LGBMRegressor model
lgbm_model = lgb.LGBMRegressor(random_state=101, force_col_wise=True)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    lgbm_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101, force_col_wise=True)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

print("With LGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))


In [None]:
print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.predict(X_t, pred_leaf=True))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With LGB + LR Pred")
X_t_leaves = onehot.transform(final_model.predict(X_t, pred_leaf=True))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.predict(X_te, pred_leaf=True))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 20),
    'max_leaves': np.arange(2, 50),
    'max_bin': np.arange(2, 50),
    'gamma': np.arange(1, 20),
    # 'min_child_weight': np.arange(0, 20),
    # 'subsample': np.arange(0.1, 1.0, 0.1),
    # 'colsample_bytree': np.arange(0.1, 1.0, 0.1),
    # 'reg_alpha': np.arange(0, 100),
    # 'reg_lambda': np.arange(0, 10, 0.01),
}

# Define the LGBMRegressor model
xgb_model = xgb.XGBRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    xgb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])
# 
# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model using the entire training dataset with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Train the final model with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With XGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))


# # Make predictions on the validation set
# y_pred = final_model.predict(X_te)

# # Evaluate the model on the validation set
# mae = mean_absolute_error(y_te, y_pred)
# print(f'Mean Absolute Error on Validation Set: {mae}')

In [None]:
param_space =  {
    'max_features': Categorical(['sqrt', 'log2']),
    'n_estimators': np.arange(50, 2001, 50),
    'bootstrap': Categorical([True, False]),
    'max_depth': np.arange(1, 20),
    'min_samples_leaf': np.arange(1, 20),
    'criterion': Categorical(['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
}
# Define the LGBMRegressor model
rf_model = RandomForestRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    rf_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
# Train the final model with the best hyperparameters
final_model = RandomForestRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With RF Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 200),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 10),
    'subsample': np.arange(0.1, 1.0, 0.1),
}
# Define the LGBMRegressor model
gb_model = GradientBoostingRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    gb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
final_model = GradientBoostingRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With GB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
def set_randomstate(model):
    current_params = model.get_params()

    # Update the random_state if it exists, otherwise add it to the parameters
    current_params['random_state'] = 101

    # Set the modified parameters back to the model
    model.set_params(**current_params)
    return model

kf = KFold(n_splits=5, shuffle=True, random_state=101)
models = {
    # 'Linear Regression': LinearRegression(),
    # 'Lasso': LassoCV(cv=kf, random_state=42, max_iter=200000),
    # 'Logistic': LogisticRegression(),
    # 'KNN': pd.read_pickle("result_KNN.pkl")['estimator'],
    'XGB': set_randomstate(pd.read_pickle("result_XGB.pkl")['estimator']),
    'LGBM': set_randomstate(pd.read_pickle("result_LGBM.pkl")['estimator']),
    # 'Ridge': RidgeCV(cv=kf),
    # 'SVR': pd.read_pickle("result_SVR.pkl")['estimator'],
    # 'Random Forest': RandomForestRegressor(bootstrap=False, max_depth=15, max_features=0.7,
    #                   min_samples_leaf=9, n_estimators=200, random_state=101),
    # 'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.01, max_depth=14, min_samples_leaf=3,
    #                       min_samples_split=4, n_estimators=950, random_state=101, subsample=0.8),
    # 'Artificial Neural Network': MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=(50, 50),
    #          max_iter=4000, random_state=101),
    # 'Gaussian Process Regression': GaussianProcessRegressor(0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5)))
}

In [None]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
        scoring=['neg_mean_absolute_error','neg_mean_squared_error']
        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=101)

        # Perform k-fold cross-validation and calculate MSE and MAE
        scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, n_jobs=-1)

        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()

        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Train and evaluate
results = evaluate_models(models, X_t, y_t)


In [None]:
def model_lr_evaluation(models, X_train, y_train, X_test, y_test):
    results = {}
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']

    for name, model in models.items():
        final_model = model
        
        scores = cross_validate(model, X_train, y_train, cv=3, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        
        final_model.fit(X_train, y_train)
        y_te_pred = final_model.predict(X_test)
        print(mean_absolute_error(y_test, y_te_pred), mean_squared_error(y_test, y_te_pred))
        
        print("---------------")
        
        onehot = OneHotEncoder()
        if name == 'LGBM':
            X_t_leaves = onehot.fit_transform(final_model.predict(X_train, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.fit_transform(final_model.apply(X_train))
        xgb_lr = LogisticRegression()
        xgb_lr.fit(X_t_leaves, y_train)

        if name == 'LGBM':
            X_t_leaves = onehot.transform(final_model.predict(X_train, pred_leaf=True))
            X_te_leaves = onehot.transform(final_model.predict(X_test, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.transform(final_model.apply(X_train))
            X_te_leaves = onehot.transform(final_model.apply(X_test))

        scores = cross_validate(model, X_t_leaves, y_train, cv=kf, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(mean_mae, mean_mse, std_mse) 
        
        y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
        print(mean_absolute_error(y_test, y_te_pred_xgb_lr), mean_squared_error(y_test, y_te_pred_xgb_lr))

model_lr_evaluation(models, X_t, y_t, X_te, y_te)

In [None]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [None]:
import pickle

result_final_with_test = {}
for name, model in models.items():
    result_final_with_test[name] = evaluate_models_with_test(model, X_t, y_t, X_te, y_te)

result_final_with_test

In [None]:
train_data_0 = train_data[train_data['clusters']==0]
train_data_1 = train_data[train_data['clusters']==1]
validate_data_0 = validate_data[validate_data['clusters']==0]
validate_data_1 = validate_data[validate_data['clusters']==1]
test_data_0 = test_data[test_data['clusters']==0]
test_data_1 = test_data[test_data['clusters']==1]
test_data_2 = test_data[test_data['clusters']==2]
test_data_1 = pd.concat([test_data_1, test_data_2], ignore_index=True)

import matplotlib.pyplot as plt

# Assuming you have three DataFrames: df1, df2, and df3

# Plotting DataFrame 1 in red
plt.plot(train_data_0['start_node_x'], train_data_0['start_node_y'], 'ro-', label='DF1 Start')
plt.plot(train_data_0['end_node_x'], train_data_0['end_node_y'], 'ro-', label='DF1 End')

# Plotting DataFrame 2 in green
plt.plot(train_data_1['start_node_x'], train_data_1['start_node_y'], 'go-', label='DF2 Start')
plt.plot(train_data_1['end_node_x'], train_data_1['end_node_y'], 'go-', label='DF2 End')

# Plotting DataFrame 3 in blue
plt.plot(validate_data_0['start_node_x'], validate_data_0['start_node_y'], 'bo-', label='DF3 Start')
plt.plot(validate_data_0['end_node_x'], validate_data_0['end_node_y'], 'bo-', label='DF3 End')
plt.plot(validate_data_1['start_node_x'], validate_data_1['start_node_y'], 'yo-', label='DF4 Start')
plt.plot(validate_data_1['end_node_x'], validate_data_1['end_node_y'], 'yo-', label='DF4 End')

plt.plot(test_data_0['start_node_x'], test_data_0['start_node_y'], 'co-', label='DF5 Start')
plt.plot(test_data_0['end_node_x'], test_data_0['end_node_y'], 'co-', label='DF5 End')
plt.plot(test_data_1['start_node_x'], test_data_1['start_node_y'], 'mo-', label='DF6 Start')
plt.plot(test_data_1['end_node_x'], test_data_1['end_node_y'], 'mo-', label='DF6 End')
# plt.plot(test_data_2['start_node_x'], test_data_2['start_node_y'], 'ko-', label='DF7 Start')
# plt.plot(test_data_2['end_node_x'], test_data_2['end_node_y'], 'ko-', label='DF7 End')
# Add labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Show the plot
plt.show()