In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

## Parameter setting

In [4]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'link_length', 'link_freespeed', 
                      'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 
                      'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'link_times', 'capacity_divided_by_lanes',
                      'income', 'score', 'income_avg', 'score_avg'
                     ]
category_feature = ['type', 'home-activity-zone']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
    'KNN': KNeighborsRegressor(),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    'GPR': GaussianProcessRegressor(copy_X_train=False, random_state=101),
#     'SVR': SVR()
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'identity', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
'GPR':{
    'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
                                    WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
                                    0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
                                    50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
                                    1.0 * Matern(length_scale=1.0, nu=1.5),
                                    RBF() + ConstantKernel(constant_value=2)
                                                           ]),
    'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
}
}

## Load data

In [5]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

## Find the shortest path

In [6]:
# Find the indices where 'link_id' is 0
indices = Big_data.index[Big_data['link_id'] == 0].tolist()

# Add the end of the DataFrame to the indices list
indices.append(len(Big_data))

# Split the DataFrame using the indices
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]

In [10]:
list_od = []
list_nodes = []
all_files = train_files + validate_files + test_files
for i in all_files:
    with open(i) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]

In [13]:
import networkx as nx
shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)

In [14]:
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )

In [15]:
Big_data = pd.concat(dfs)
nodes_data = Big_data[['link_id', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y']]
grouped = nodes_data.groupby(['start_node_x', 'start_node_y'])
filtered_df = grouped.filter(lambda x: len(x) == 1)
filtered_df = filtered_df.drop_duplicates()
node_mapping = filtered_df.set_index(['start_node_x', 'start_node_y']).apply(
    lambda row: (row['end_node_x'], row['end_node_y']), axis=1).to_dict()

all_nodes = set(node_mapping.keys()) | set(node_mapping.values())
end_nodes = set(node_mapping.values())

start_nodes = list(all_nodes - end_nodes)

paths = []
for start_node in start_nodes:
    path = [start_node]
    while path[-1] in node_mapping:
        next_node = node_mapping[path[-1]]
        path.append(next_node)
    paths.append(path)
    
new_paths = [x for x in paths if len(x) >2]
def map_path_to_links(df, path):
    path_links = pd.DataFrame()
    for i in range(len(path) - 1):
        start_node = path[i]
        end_node = path[i+1]
        link_row = df[(df['start_node_x'] == start_node[0]) & 
                      (df['start_node_y'] == start_node[1]) & 
                      (df['end_node_x'] == end_node[0]) & 
                      (df['end_node_y'] == end_node[1])]
        if not link_row.empty:
            path_links = pd.concat([path_links, link_row])
    return path_links

# Step 3: Create separate DataFrames for each path
path_dfs = []
for path in new_paths:
    link_df = map_path_to_links(Big_data, path)
    path_dfs.append(link_df)

Big_data_drop = Big_data.copy(deep=True)
for path_df in path_dfs:
    numeric_df = path_df.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = path_df['home-activity-zone'].mode()
    type_value = path_df['type'].mode()
    dataset = path_df['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    Big_data_drop = pd.concat([Big_data_drop, mean_df])

    try:
        Big_data_drop.drop(path_df.index, inplace=True)
    except:
        pass

## Clustering

In [16]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['x_y_coor'] = cluster.fit_predict(Big_data_drop[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['similar_link'] = cluster1.fit_predict(Big_data_drop[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['planxml'] = cluster2.fit_predict(Big_data_drop[['income', 'score', 'rush_hour',
                                                               'max_dur', 'cemdapStopDuration_s']])

Big_data_drop = Big_data_drop.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [17]:
train_data_drop = Big_data_drop[Big_data_drop['dataset']=='train']
validate_data_drop = Big_data_drop[Big_data_drop['dataset']=='validate']
test_data_drop = Big_data_drop[Big_data_drop['dataset']=='test']

## Create compressed berlin dataset

In [18]:
# change the column name for  xy coordinate
column_name = 'similar_link'
train_data_edit = pd.DataFrame()
for i in list(set(train_data_drop[column_name].tolist())):
    new_train = train_data_drop[train_data_drop[column_name]==i]
    numeric_df = new_train.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = new_train['home-activity-zone'].mode()
    type_value = new_train['type'].mode()
    dataset = new_train['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    train_data_edit = pd.concat([train_data_edit, mean_df], ignore_index=True)
    
validate_data_edit = pd.DataFrame()
for i in list(set(validate_data_drop[column_name].tolist())):
    new_validate = validate_data_drop[validate_data_drop[column_name]==i]
    numeric_df = new_validate.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = new_validate['home-activity-zone'].mode()
    type_value = new_validate['type'].mode()
    dataset = new_validate['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    validate_data_edit = pd.concat([validate_data_edit, mean_df], ignore_index=True)

Big_data_edit = pd.concat([train_data_edit, validate_data_edit, test_data_drop], ignore_index=True)
Big_data_edit = Big_data_edit.astype({column_name:'int64'})

In [32]:
Big_data_tr = ct.fit_transform(Big_data_edit)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)


In [34]:
X_t_feature = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'remainder__link_id',
                                 'remainder__link_from', 'remainder__link_to', 'num_preprocess__start_node_x', 'num_preprocess__start_node_y',
                                'num_preprocess__end_node_x', 'num_preprocess__end_node_y', 
                                 'num_preprocess__capacity_divided_by_lanes', 'num_preprocess__score_avg', 'num_preprocess__income_avg',
                                 'num_preprocess__length_per_capacity_ratio', 'num_preprocess__length_times_lanes', 'num_preprocess__speed_times_capacity'
                                ])
correlation_matrix = X_t_feature.corr()


# Find the most correlated feature for each feature, excluding itself
most_correlated = correlation_matrix.apply(lambda x: x.index[x.abs()[x.index != x.name].argmax()], axis=1)

# Create a DataFrame to store the most correlated features
corr_table = pd.DataFrame({'Feature': correlation_matrix.columns, 'Most Correlated': most_correlated})

# Add a column to show the correlation value
corr_table['Correlation Value'] = corr_table.apply(lambda x: correlation_matrix.loc[x['Feature'], x['Most Correlated']], axis=1)

new_corr_table = corr_table.sort_values(by='Correlation Value', ascending=False).reset_index(drop=True)
new_corr_table[(new_corr_table['Correlation Value'].abs()>=0.8) & (new_corr_table['Correlation Value'].abs() !=1)]

Unnamed: 0,Feature,Most Correlated,Correlation Value
3,num_preprocess__score,num_preprocess__income,0.989629
4,num_preprocess__cemdapStopDuration_s,num_preprocess__income,0.908351
5,num_preprocess__link_permlanes,num_preprocess__link_capacity,0.8445


In [21]:
temp = temp.drop(columns=['remainder__link_id', 'remainder__link_from', 'remainder__link_to',
                                  'num_preprocess__start_node_x', 'num_preprocess__start_node_y',
                                'num_preprocess__end_node_x', 'num_preprocess__end_node_y', 
                                 'num_preprocess__capacity_divided_by_lanes', 'num_preprocess__score_avg', 'num_preprocess__income_avg',
                                 'num_preprocess__length_per_capacity_ratio', 'num_preprocess__length_times_lanes', 'num_preprocess__speed_times_capacity'])
test_data_tr = test_data_tr.drop(columns=['remainder__link_id', 'remainder__link_from', 'remainder__link_to',
                                  'num_preprocess__start_node_x', 'num_preprocess__start_node_y',
                                'num_preprocess__end_node_x', 'num_preprocess__end_node_y', 
                                 'num_preprocess__capacity_divided_by_lanes', 'num_preprocess__score_avg', 'num_preprocess__income_avg',
                                 'num_preprocess__length_per_capacity_ratio', 'num_preprocess__length_times_lanes', 'num_preprocess__speed_times_capacity'])                                 

# Classification and regression

In [22]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [28]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t_clf.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-06-14 16:53:42,899] A new study created in memory with name: no-name-0ffb1b2a-c633-440c-97e8-1758eb5ec7f8
[I 2024-06-14 16:53:43,003] Trial 0 finished with value: 0.9936974789915967 and parameters: {'selector__k': 29, 'model__n_neighbors': 1, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:43,053] Trial 1 finished with value: 0.9936974789915967 and parameters: {'selector__k': 25, 'model__n_neighbors': 37, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:43,116] Trial 2 finished with value: 0.9936974789915967 and parameters: {'selector__k': 23, 'model__n_neighbors': 39, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:43,152] Trial 3 finished with value: 0.9936974789915967 and parameters: {'selector__k': 35, 'model__n_neighbors': 6, 'model__

KNN 0.9936974789915967 0.8437458897803498


[I 2024-06-14 16:53:45,975] Trial 6 finished with value: 0.9936974789915967 and parameters: {'selector__k': 16, 'model__learning_rate': 0.7718625538244588, 'model__n_estimators': 204, 'model__max_depth': 41, 'model__num_leaves': 47, 'model__min_child_samples': 18, 'model__subsample': 0.19851419236944068, 'model__colsample_bytree': 0.5318322254620497}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:46,012] Trial 7 finished with value: 0.9936974789915967 and parameters: {'selector__k': 10, 'model__learning_rate': 0.12558381311756836, 'model__n_estimators': 413, 'model__max_depth': 47, 'model__num_leaves': 6, 'model__min_child_samples': 10, 'model__subsample': 0.19961489256185874, 'model__colsample_bytree': 0.4590095777727201}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:46,053] Trial 8 finished with value: 0.9936974789915967 and parameters: {'selector__k': 24, 'model__learning_rate': 0.3616164229378845, 'model__n_estimators': 315, 'model__max_

LGBM 0.9936974789915967 0.8428252005787189


[I 2024-06-14 16:53:49,335] Trial 0 finished with value: 0.9936974789915967 and parameters: {'selector__k': 29, 'model__max_features': 'log2', 'model__n_estimators': 152, 'model__max_depth': 177, 'model__min_samples_leaf': 12}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:50,033] Trial 1 finished with value: 0.9936974789915967 and parameters: {'selector__k': 10, 'model__max_features': 'sqrt', 'model__n_estimators': 325, 'model__max_depth': 186, 'model__min_samples_leaf': 5}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:50,238] Trial 2 finished with value: 0.9936974789915967 and parameters: {'selector__k': 13, 'model__max_features': 'log2', 'model__n_estimators': 94, 'model__max_depth': 157, 'model__min_samples_leaf': 11}. Best is trial 0 with value: 0.9936974789915967.
[I 2024-06-14 16:53:50,622] Trial 3 finished with value: 0.9936974789915967 and parameters: {'selector__k': 3, 'model__max_features': 'sqrt', 'model__n_estimators': 177, 'mod

RF 0.9936974789915967 0.8462449033276338


In [None]:
import pickle
with open('best_model_clf_compressed.pickle', 'wb') as f:
    pickle.dump(best_model_clf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
best_model_clf = pd.read_pickle("best_model_clf_compressed.pickle")

In [29]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

### Group the used link from training and validation dataset and later group the predicted used link in test dataset

In [30]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [31]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-06-14 16:55:44,860] A new study created in memory with name: no-name-b3943769-82a9-44b9-a2d0-1ae1d7c52383
[I 2024-06-14 16:55:44,906] Trial 0 finished with value: -17.47556886410939 and parameters: {'selector__k': 29, 'model__n_neighbors': 23, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: -17.47556886410939.
[W 2024-06-14 16:55:44,924] Trial 1 failed with parameters: {'selector__k': 45, 'model__n_neighbors': 16, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'} because of the following error: The value nan is not acceptable.
[W 2024-06-14 16:55:44,925] Trial 1 failed with value nan.
[W 2024-06-14 16:55:44,943] Trial 2 failed with parameters: {'selector__k': 38, 'model__n_neighbors': 15, 'model__weights': 'distance', 'model__algorithm': 'brute'} because of the following error: The value nan is not acceptable.
[W 2024-06-14 16:55:44,944] Trial 2 failed with value nan.
[W 2024-06-14 16:55:44,963] Trial 3 failed with parameters:

KNN -8.919248656377167 9.871155507378237 353.3200908159889 174.2916344516359


[I 2024-06-14 16:55:46,909] Trial 2 finished with value: -9.255366513429887 and parameters: {'selector__k': 18, 'model__learning_rate': 0.4335256285388143, 'model__n_estimators': 220, 'model__max_depth': 4, 'model__max_leaves': 43, 'model__max_bin': 17, 'model__gamma': 9}. Best is trial 2 with value: -9.255366513429887.
[I 2024-06-14 16:55:47,000] Trial 3 finished with value: -9.918359879705624 and parameters: {'selector__k': 11, 'model__learning_rate': 0.7002698228756383, 'model__n_estimators': 76, 'model__max_depth': 19, 'model__max_leaves': 42, 'model__max_bin': 14, 'model__gamma': 15}. Best is trial 2 with value: -9.255366513429887.
[I 2024-06-14 16:55:47,200] Trial 4 finished with value: -9.54586691254863 and parameters: {'selector__k': 20, 'model__learning_rate': 0.6735951710970887, 'model__n_estimators': 392, 'model__max_depth': 17, 'model__max_leaves': 33, 'model__max_bin': 42, 'model__gamma': 14}. Best is trial 2 with value: -9.255366513429887.
[I 2024-06-14 16:55:47,277] Tria

XGB -9.16506280811135 11.08223330575923 334.8926792245897 168.25401306152344


[I 2024-06-14 16:55:50,851] Trial 0 finished with value: -8.040665971661948 and parameters: {'selector__k': 18, 'model__learning_rate': 0.5729694885290083, 'model__n_estimators': 375, 'model__max_depth': 14, 'model__num_leaves': 40, 'model__min_child_samples': 1, 'model__subsample': 0.8299960182102328, 'model__colsample_bytree': 0.5620655946648829}. Best is trial 0 with value: -8.040665971661948.
[W 2024-06-14 16:55:50,867] Trial 1 failed with parameters: {'selector__k': 36, 'model__learning_rate': 0.13022067107121327, 'model__n_estimators': 82, 'model__max_depth': 21, 'model__num_leaves': 13, 'model__min_child_samples': 9, 'model__subsample': 0.5641217153550651, 'model__colsample_bytree': 0.971070613531251} because of the following error: The value nan is not acceptable.
[W 2024-06-14 16:55:50,867] Trial 1 failed with value nan.
[I 2024-06-14 16:55:51,412] Trial 2 finished with value: -16.31298594215975 and parameters: {'selector__k': 33, 'model__learning_rate': 0.9520764374815296, 'm

LGBM -8.040665971661948 11.77054383274928 428.114625951446 232.92600016562972


[I 2024-06-14 16:56:01,095] Trial 1 finished with value: -12.474638897547186 and parameters: {'selector__k': 29, 'model__max_features': 'log2', 'model__n_estimators': 143, 'model__max_depth': 65, 'model__min_samples_leaf': 18}. Best is trial 1 with value: -12.474638897547186.
[I 2024-06-14 16:56:01,448] Trial 2 finished with value: -8.958731890487584 and parameters: {'selector__k': 25, 'model__max_features': 'sqrt', 'model__n_estimators': 57, 'model__max_depth': 98, 'model__min_samples_leaf': 3}. Best is trial 2 with value: -8.958731890487584.
[W 2024-06-14 16:56:01,463] Trial 3 failed with parameters: {'selector__k': 47, 'model__max_features': 'sqrt', 'model__n_estimators': 69, 'model__max_depth': 21, 'model__min_samples_leaf': 10} because of the following error: The value nan is not acceptable.
[W 2024-06-14 16:56:01,464] Trial 3 failed with value nan.
[I 2024-06-14 16:56:02,331] Trial 4 finished with value: -18.59260377269172 and parameters: {'selector__k': 2, 'model__max_features':

RF -8.958731890487584 11.360163961460948 327.47328303009516 170.623820977072


[I 2024-06-14 16:56:08,828] Trial 0 finished with value: -137.74741442348608 and parameters: {'selector__k': 25, 'model__learning_rate': 0.8892096765027632, 'model__n_estimators': 62, 'model__max_depth': 33, 'model__min_samples_split': 7, 'model__min_samples_leaf': 7, 'model__subsample': 0.22204630716300733}. Best is trial 0 with value: -137.74741442348608.
[I 2024-06-14 16:56:21,853] Trial 1 finished with value: -9.280895149779212 and parameters: {'selector__k': 21, 'model__learning_rate': 0.28511316619471977, 'model__n_estimators': 172, 'model__max_depth': 85, 'model__min_samples_split': 11, 'model__min_samples_leaf': 3, 'model__subsample': 0.9402528874866496}. Best is trial 1 with value: -9.280895149779212.
[W 2024-06-14 16:56:21,867] Trial 2 failed with parameters: {'selector__k': 45, 'model__learning_rate': 0.8707303088408895, 'model__n_estimators': 50, 'model__max_depth': 94, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__subsample': 0.4653784395562355} beca

KeyboardInterrupt: 

[I 2024-03-05 22:46:37,445] Trial 1 finished with value: -24.040013343273138 and parameters: {'selector__k': 22, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 424793770.1229809}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:39,339] Trial 2 finished with value: -29.13557360985365 and parameters: {'selector__k': 40, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 9765930507.399818}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:39,579] Trial 3 finished with value: -29.135573608006247 and parameters: {'selector__k': 18, 'model__kernel': 0.5**2 * RationalQuadratic(alpha=1, length_scale=1), 'model__alpha': 4155669346.4848695}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:39,839] Trial 4 finished with value: -29.134933841605687 and parameters: {'selector__k': 21, 'model__kernel': 50**2 * RBF(length_scale=50), 'model__alpha': 4403261578.166934}. Best is trial 1

[I 2024-03-05 22:46:49,520] Trial 33 finished with value: -28.961343869996355 and parameters: {'selector__k': 19, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 471905804.7094228}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:49,811] Trial 34 finished with value: -29.13557360977793 and parameters: {'selector__k': 27, 'model__kernel': 0.5**2 * RationalQuadratic(alpha=1, length_scale=1), 'model__alpha': 2339818816.5326204}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:50,093] Trial 35 finished with value: -29.13051661265087 and parameters: {'selector__k': 23, 'model__kernel': 50**2 * RBF(length_scale=50), 'model__alpha': 556339961.6199908}. Best is trial 1 with value: -24.040013343273138.
[I 2024-03-05 22:46:50,260] Trial 36 finished with value: -29.083025513636876 and parameters: {'selector__k': 18, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 1572041215.2071025}

GPR -24.040013343273138 23.14118208402261 1417.8421436981196 279.48291966896824


## Feature Selection Regression task

In [59]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

In [60]:
best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)



[I 2024-03-05 22:46:53,720] A new study created in memory with name: no-name-e7d521e1-338d-4902-841c-ae86520e70f7
[I 2024-03-05 22:46:53,994] Trial 0 finished with value: -22.851192946919547 and parameters: {'selector__k': 29, 'model__n_neighbors': 40, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: -22.851192946919547.
[I 2024-03-05 22:46:54,062] Trial 1 finished with value: -12.571121027834295 and parameters: {'selector__k': 19, 'model__n_neighbors': 16, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -12.571121027834295.
[I 2024-03-05 22:46:54,136] Trial 2 finished with value: -20.09697837000389 and parameters: {'selector__k': 21, 'model__n_neighbors': 27, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -12.571121027834295.
[I 2024-03-05 22:46:54,201] Trial 3 finished with value: -12.974052115695672 and parameters: {'selector__k': 15, 'model__n_neighbors': 26, 'm

[I 2024-03-05 22:46:58,877] Trial 33 finished with value: -9.317200842871099 and parameters: {'selector__k': 11, 'model__n_neighbors': 13, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 23 with value: -8.339468687737778.
[I 2024-03-05 22:46:58,960] Trial 34 finished with value: -12.509927144886474 and parameters: {'selector__k': 17, 'model__n_neighbors': 4, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 23 with value: -8.339468687737778.
[I 2024-03-05 22:46:59,045] Trial 35 finished with value: -8.805657484460857 and parameters: {'selector__k': 14, 'model__n_neighbors': 9, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 23 with value: -8.339468687737778.
[I 2024-03-05 22:46:59,279] Trial 36 finished with value: -12.774646718363995 and parameters: {'selector__k': 19, 'model__n_neighbors': 13, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 23 with value: -8.339468687737778.
[I 2024

KNN -8.122516623961058 10.77756375023874 440.3410405790753 278.1714285714286


[I 2024-03-05 22:47:02,015] Trial 0 finished with value: -9.326510872688637 and parameters: {'selector__k': 46, 'model__learning_rate': 0.30258297596150974, 'model__n_estimators': 205, 'model__max_depth': 5, 'model__max_leaves': 26, 'model__max_bin': 14, 'model__gamma': 2}. Best is trial 0 with value: -9.326510872688637.
[I 2024-03-05 22:47:02,150] Trial 1 finished with value: -8.779938334451984 and parameters: {'selector__k': 19, 'model__learning_rate': 0.5452452009507645, 'model__n_estimators': 108, 'model__max_depth': 19, 'model__max_leaves': 20, 'model__max_bin': 32, 'model__gamma': 1}. Best is trial 1 with value: -8.779938334451984.
[I 2024-03-05 22:47:02,450] Trial 2 finished with value: -9.724720507057928 and parameters: {'selector__k': 41, 'model__learning_rate': 0.152070891692908, 'model__n_estimators': 319, 'model__max_depth': 18, 'model__max_leaves': 41, 'model__max_bin': 31, 'model__gamma': 20}. Best is trial 1 with value: -8.779938334451984.
[I 2024-03-05 22:47:02,587] Tri

[I 2024-03-05 22:47:08,296] Trial 26 finished with value: -8.743282381891177 and parameters: {'selector__k': 26, 'model__learning_rate': 0.1811627126789333, 'model__n_estimators': 119, 'model__max_depth': 12, 'model__max_leaves': 49, 'model__max_bin': 43, 'model__gamma': 7}. Best is trial 20 with value: -7.547178927206244.
[I 2024-03-05 22:47:08,526] Trial 27 finished with value: -10.747379464003187 and parameters: {'selector__k': 6, 'model__learning_rate': 0.4557405798121832, 'model__n_estimators': 161, 'model__max_depth': 7, 'model__max_leaves': 44, 'model__max_bin': 21, 'model__gamma': 6}. Best is trial 20 with value: -7.547178927206244.
[I 2024-03-05 22:47:08,746] Trial 28 finished with value: -8.669656603375836 and parameters: {'selector__k': 23, 'model__learning_rate': 0.3403669741510426, 'model__n_estimators': 95, 'model__max_depth': 9, 'model__max_leaves': 39, 'model__max_bin': 35, 'model__gamma': 8}. Best is trial 20 with value: -7.547178927206244.
[I 2024-03-05 22:47:09,016] 

XGB -7.547178927206244 10.329288991723658 306.23490355093264 195.2083522251674


[I 2024-03-05 22:47:15,064] Trial 0 finished with value: -8.858247267168705 and parameters: {'selector__k': 24, 'model__learning_rate': 0.4653584482628788, 'model__n_estimators': 413, 'model__max_depth': 33, 'model__num_leaves': 32, 'model__min_child_samples': 1, 'model__subsample': 0.21247981918362702, 'model__colsample_bytree': 0.2825551216142438}. Best is trial 0 with value: -8.858247267168705.
[I 2024-03-05 22:47:15,205] Trial 1 finished with value: -11.662470801059547 and parameters: {'selector__k': 22, 'model__learning_rate': 0.6117486036840505, 'model__n_estimators': 61, 'model__max_depth': 41, 'model__num_leaves': 43, 'model__min_child_samples': 16, 'model__subsample': 0.3005432701846761, 'model__colsample_bytree': 0.28162322390926914}. Best is trial 0 with value: -8.858247267168705.
[I 2024-03-05 22:47:15,448] Trial 2 finished with value: -13.243402470893761 and parameters: {'selector__k': 7, 'model__learning_rate': 0.6595528615159119, 'model__n_estimators': 187, 'model__max_d

[I 2024-03-05 22:47:22,768] Trial 21 finished with value: -8.905087267295642 and parameters: {'selector__k': 11, 'model__learning_rate': 0.8436250912802201, 'model__n_estimators': 214, 'model__max_depth': 29, 'model__num_leaves': 28, 'model__min_child_samples': 4, 'model__subsample': 0.6087980372129388, 'model__colsample_bytree': 0.6627015627654691}. Best is trial 16 with value: -8.00055307630308.
[I 2024-03-05 22:47:23,209] Trial 22 finished with value: -8.835125224762205 and parameters: {'selector__k': 19, 'model__learning_rate': 0.23563972303773661, 'model__n_estimators': 165, 'model__max_depth': 25, 'model__num_leaves': 26, 'model__min_child_samples': 3, 'model__subsample': 0.7290063490340349, 'model__colsample_bytree': 0.8693785010326622}. Best is trial 16 with value: -8.00055307630308.
[I 2024-03-05 22:47:23,885] Trial 23 finished with value: -8.993226893054311 and parameters: {'selector__k': 13, 'model__learning_rate': 0.39300143217305716, 'model__n_estimators': 288, 'model__max

[I 2024-03-05 22:47:43,729] Trial 42 finished with value: -8.480564152947359 and parameters: {'selector__k': 30, 'model__learning_rate': 0.06198096004177568, 'model__n_estimators': 499, 'model__max_depth': 35, 'model__num_leaves': 40, 'model__min_child_samples': 5, 'model__subsample': 0.8275719705576507, 'model__colsample_bytree': 0.8721055000090076}. Best is trial 35 with value: -7.604467130365416.
[I 2024-03-05 22:47:45,052] Trial 43 finished with value: -8.214298279638871 and parameters: {'selector__k': 25, 'model__learning_rate': 0.1760054095809835, 'model__n_estimators': 397, 'model__max_depth': 38, 'model__num_leaves': 45, 'model__min_child_samples': 8, 'model__subsample': 0.7322840843849818, 'model__colsample_bytree': 0.7327318204821518}. Best is trial 35 with value: -7.604467130365416.
[I 2024-03-05 22:47:46,028] Trial 44 finished with value: -8.216885593439788 and parameters: {'selector__k': 22, 'model__learning_rate': 0.04978039578532354, 'model__n_estimators': 305, 'model__m

LGBM -7.604467130365416 11.96562619922016 367.8850978666239 149.36160196369866


[I 2024-03-05 22:47:51,701] Trial 0 finished with value: -18.5310244450649 and parameters: {'selector__k': 5, 'model__max_features': 'sqrt', 'model__n_estimators': 102, 'model__max_depth': 1, 'model__min_samples_leaf': 19}. Best is trial 0 with value: -18.5310244450649.
[I 2024-03-05 22:47:56,346] Trial 1 finished with value: -10.473993455518386 and parameters: {'selector__k': 45, 'model__max_features': 'log2', 'model__n_estimators': 460, 'model__max_depth': 141, 'model__min_samples_leaf': 3}. Best is trial 1 with value: -10.473993455518386.
[I 2024-03-05 22:47:59,076] Trial 2 finished with value: -9.00352846021563 and parameters: {'selector__k': 13, 'model__max_features': 'log2', 'model__n_estimators': 410, 'model__max_depth': 41, 'model__min_samples_leaf': 9}. Best is trial 2 with value: -9.00352846021563.
[I 2024-03-05 22:47:59,839] Trial 3 finished with value: -9.940831987598 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 116, 'model__max_

[I 2024-03-05 22:48:58,460] Trial 30 finished with value: -9.859250431883101 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 166, 'model__max_depth': 86, 'model__min_samples_leaf': 14}. Best is trial 23 with value: -7.457269739396862.
[I 2024-03-05 22:49:00,497] Trial 31 finished with value: -10.561074022036568 and parameters: {'selector__k': 5, 'model__max_features': 'sqrt', 'model__n_estimators': 333, 'model__max_depth': 6, 'model__min_samples_leaf': 1}. Best is trial 23 with value: -7.457269739396862.
[I 2024-03-05 22:49:04,063] Trial 32 finished with value: -7.632620598674949 and parameters: {'selector__k': 20, 'model__max_features': 'log2', 'model__n_estimators': 387, 'model__max_depth': 72, 'model__min_samples_leaf': 2}. Best is trial 23 with value: -7.457269739396862.
[I 2024-03-05 22:49:08,006] Trial 33 finished with value: -7.755639421244877 and parameters: {'selector__k': 14, 'model__max_features': 'log2', 'model__n_estimators': 496, 

RF -7.457269739396862 10.032409553028515 310.4462965245929 166.99110231131442


[I 2024-03-05 22:49:57,204] Trial 0 finished with value: -12.396238978517234 and parameters: {'selector__k': 10, 'model__learning_rate': 0.5103912199370998, 'model__n_estimators': 277, 'model__max_depth': 162, 'model__min_samples_split': 6, 'model__min_samples_leaf': 9, 'model__subsample': 0.3694056335087347}. Best is trial 0 with value: -12.396238978517234.
[I 2024-03-05 22:50:14,187] Trial 1 finished with value: -10.330289417592907 and parameters: {'selector__k': 46, 'model__learning_rate': 0.5848768364439724, 'model__n_estimators': 134, 'model__max_depth': 94, 'model__min_samples_split': 5, 'model__min_samples_leaf': 9, 'model__subsample': 0.9669955892352692}. Best is trial 1 with value: -10.330289417592907.
[I 2024-03-05 22:50:14,971] Trial 2 finished with value: -15.78300967768568 and parameters: {'selector__k': 10, 'model__learning_rate': 0.5695964162347436, 'model__n_estimators': 131, 'model__max_depth': 71, 'model__min_samples_split': 4, 'model__min_samples_leaf': 7, 'model__su

[I 2024-03-05 22:54:18,532] Trial 23 finished with value: -7.616641339261005 and parameters: {'selector__k': 13, 'model__learning_rate': 0.13472468658707126, 'model__n_estimators': 360, 'model__max_depth': 85, 'model__min_samples_split': 7, 'model__min_samples_leaf': 5, 'model__subsample': 0.6630189917352604}. Best is trial 22 with value: -7.547734203888875.
[I 2024-03-05 22:54:22,687] Trial 24 finished with value: -8.345291475193694 and parameters: {'selector__k': 12, 'model__learning_rate': 0.27257151953164327, 'model__n_estimators': 241, 'model__max_depth': 78, 'model__min_samples_split': 7, 'model__min_samples_leaf': 7, 'model__subsample': 0.6480180257861959}. Best is trial 22 with value: -7.547734203888875.
[I 2024-03-05 22:54:24,395] Trial 25 finished with value: -14.284143845799088 and parameters: {'selector__k': 5, 'model__learning_rate': 0.7473259054273813, 'model__n_estimators': 405, 'model__max_depth': 86, 'model__min_samples_split': 10, 'model__min_samples_leaf': 5, 'model_

[I 2024-03-05 22:56:19,537] Trial 46 finished with value: -8.025932066803126 and parameters: {'selector__k': 10, 'model__learning_rate': 0.011590495685839475, 'model__n_estimators': 257, 'model__max_depth': 180, 'model__min_samples_split': 4, 'model__min_samples_leaf': 8, 'model__subsample': 0.7373753232052127}. Best is trial 26 with value: -7.456544340245043.
[I 2024-03-05 22:57:21,794] Trial 47 finished with value: -8.743297246523788 and parameters: {'selector__k': 33, 'model__learning_rate': 0.18542633663706531, 'model__n_estimators': 456, 'model__max_depth': 126, 'model__min_samples_split': 8, 'model__min_samples_leaf': 5, 'model__subsample': 0.9901141643145024}. Best is trial 26 with value: -7.456544340245043.
[I 2024-03-05 22:57:22,850] Trial 48 finished with value: -32.41792380005215 and parameters: {'selector__k': 21, 'model__learning_rate': 0.9239022292428107, 'model__n_estimators': 81, 'model__max_depth': 138, 'model__min_samples_split': 6, 'model__min_samples_leaf': 6, 'mode

GB -7.456544340245043 10.774794730138279 355.05148362834433 176.59651232147186


[I 2024-03-05 22:57:40,163] Trial 0 finished with value: -22.97619379649801 and parameters: {'selector__k': 4, 'model__hidden_layer_sizes': (50,), 'model__activation': 'tanh', 'model__solver': 'adam', 'model__alpha': 5045.986090528567}. Best is trial 0 with value: -22.97619379649801.
[I 2024-03-05 22:57:42,775] Trial 1 finished with value: -24.904412506208995 and parameters: {'selector__k': 9, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'logistic', 'model__solver': 'sgd', 'model__alpha': 1934.0966727913815}. Best is trial 0 with value: -22.97619379649801.
[I 2024-03-05 22:57:45,099] Trial 2 finished with value: -8.638800891417262 and parameters: {'selector__k': 8, 'model__hidden_layer_sizes': (30, 30, 30), 'model__activation': 'identity', 'model__solver': 'adam', 'model__alpha': 0.09853421451142941}. Best is trial 2 with value: -8.638800891417262.
[I 2024-03-05 22:57:46,923] Trial 3 finished with value: -23.472552639144126 and parameters: {'selector__k': 29, 'model_

[W 2024-03-05 22:58:37,908] Trial 25 failed with value nan.
[W 2024-03-05 22:58:39,902] Trial 26 failed with parameters: {'selector__k': 46, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 1.4757652299128472e-05} because of the following error: The value nan is not acceptable.
[W 2024-03-05 22:58:39,904] Trial 26 failed with value nan.
[W 2024-03-05 22:58:41,878] Trial 27 failed with parameters: {'selector__k': 44, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 1.915486284285697e-05} because of the following error: The value nan is not acceptable.
[W 2024-03-05 22:58:41,880] Trial 27 failed with value nan.
[W 2024-03-05 22:58:43,866] Trial 28 failed with parameters: {'selector__k': 47, 'model__hidden_layer_sizes': (50, 50), 'model__activation': 'identity', 'model__solver': 'sgd', 'model__alpha': 1.4149523429573e-05} because of the following error: The value n

[I 2024-03-05 22:59:27,235] A new study created in memory with name: no-name-a1181ded-a574-4fe8-b0b0-ea2eae6d6120


ANN -8.638800891417262 12.134652152574736 502.33922316787306 364.9480938634163


[I 2024-03-05 22:59:27,470] Trial 0 finished with value: -29.011849065928192 and parameters: {'selector__k': 22, 'model__kernel': RBF(length_scale=1) + 1.41**2, 'model__alpha': 4119933960.900528}. Best is trial 0 with value: -29.011849065928192.
[I 2024-03-05 22:59:29,319] Trial 1 finished with value: -29.011855675203833 and parameters: {'selector__k': 25, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 814997452.3112924}. Best is trial 0 with value: -29.011849065928192.
[I 2024-03-05 22:59:29,538] Trial 2 finished with value: -29.01185567486901 and parameters: {'selector__k': 19, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 4206087220.2298927}. Best is trial 0 with value: -29.011849065928192.
[I 2024-03-05 22:59:29,779] Trial 3 finished with value: -29.01184346381388 and parameters: {'selector__k': 27, 'model__kernel': RBF(length_scale=1) + 1.41**2, 'model__alpha': 2229866118.6486745}. Best is trial 3 with value: -29.01184346381388.
[

[I 2024-03-05 22:59:37,907] Trial 32 finished with value: -27.344323612710017 and parameters: {'selector__k': 44, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 7158469293.739258}. Best is trial 4 with value: -27.31842328087018.
[I 2024-03-05 22:59:38,088] Trial 33 finished with value: -27.71661429508581 and parameters: {'selector__k': 38, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 7077683335.189041}. Best is trial 4 with value: -27.31842328087018.
[I 2024-03-05 22:59:39,944] Trial 34 finished with value: -29.011855675204426 and parameters: {'selector__k': 31, 'model__kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'model__alpha': 3765504293.7399054}. Best is trial 4 with value: -27.31842328087018.
[I 2024-03-05 22:59:40,121] Trial 35 finished with value: -26.701025730195003 and parameters: {'selector__k': 44, 'model__kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'model__alpha': 4738988628.43

GPR -23.973172049015396 20.87464791169681 1010.7307151707481 268.5482156188156


# Regression w/o feature selection

In [61]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-05 22:59:47,110] A new study created in memory with name: no-name-bde915b4-029f-4d33-98fc-01ee372f4177
[I 2024-03-05 22:59:47,193] Trial 0 finished with value: -22.81977768302291 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -22.81977768302291.
[I 2024-03-05 22:59:47,418] Trial 1 finished with value: -23.605399323485784 and parameters: {'n_neighbors': 32, 'weights': 'uniform', 'algorithm': 'brute'}. Best is trial 0 with value: -22.81977768302291.
[I 2024-03-05 22:59:47,500] Trial 2 finished with value: -22.81977768302291 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'algorithm': 'kd_tree'}. Best is trial 0 with value: -22.81977768302291.
[I 2024-03-05 22:59:47,589] Trial 3 finished with value: -23.158758368954953 and parameters: {'n_neighbors': 29, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 0 with value: -22.81977768302291.
[I 2024-03-05 22:59:47,687] Trial 4 finished with val

[I 2024-03-05 22:59:52,085] Trial 39 finished with value: -22.765917739281754 and parameters: {'n_neighbors': 13, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 19 with value: -21.964962229855747.
[I 2024-03-05 22:59:52,188] Trial 40 finished with value: -23.09118448176882 and parameters: {'n_neighbors': 23, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 19 with value: -21.964962229855747.
[I 2024-03-05 22:59:52,278] Trial 41 finished with value: -21.986703169320908 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 19 with value: -21.964962229855747.
[I 2024-03-05 22:59:52,372] Trial 42 finished with value: -21.964962229855747 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree'}. Best is trial 19 with value: -21.964962229855747.
[I 2024-03-05 22:59:52,455] Trial 43 finished with value: -24.59251183111022 and parameters: {'n_neighbors': 1, 'weights': 'distance', 'algorithm': 'bal

KNN -21.964962229855747 29.525559165290822 1755.0081627017971 260.8948079729678


[I 2024-03-05 22:59:53,977] Trial 0 finished with value: -9.149874910310963 and parameters: {'learning_rate': 0.08912067922007341, 'n_estimators': 201, 'max_depth': 16, 'max_leaves': 10, 'max_bin': 36, 'gamma': 6}. Best is trial 0 with value: -9.149874910310963.
[I 2024-03-05 22:59:54,178] Trial 1 finished with value: -10.451506891070496 and parameters: {'learning_rate': 0.4785370139984635, 'n_estimators': 70, 'max_depth': 18, 'max_leaves': 47, 'max_bin': 13, 'gamma': 8}. Best is trial 0 with value: -9.149874910310963.
[I 2024-03-05 22:59:54,864] Trial 2 finished with value: -10.680020720672815 and parameters: {'learning_rate': 0.9985357428903479, 'n_estimators': 474, 'max_depth': 18, 'max_leaves': 40, 'max_bin': 7, 'gamma': 3}. Best is trial 0 with value: -9.149874910310963.
[I 2024-03-05 22:59:55,509] Trial 3 finished with value: -10.255202441091706 and parameters: {'learning_rate': 0.7653146496405073, 'n_estimators': 484, 'max_depth': 13, 'max_leaves': 21, 'max_bin': 25, 'gamma': 11

[I 2024-03-05 23:00:05,567] Trial 32 finished with value: -9.131527100269329 and parameters: {'learning_rate': 0.2351648234603783, 'n_estimators': 78, 'max_depth': 3, 'max_leaves': 46, 'max_bin': 47, 'gamma': 8}. Best is trial 21 with value: -8.545447452500282.
[I 2024-03-05 23:00:05,917] Trial 33 finished with value: -8.578411271599572 and parameters: {'learning_rate': 0.1406196779984645, 'n_estimators': 109, 'max_depth': 5, 'max_leaves': 48, 'max_bin': 34, 'gamma': 3}. Best is trial 21 with value: -8.545447452500282.
[I 2024-03-05 23:00:06,225] Trial 34 finished with value: -9.015726425097972 and parameters: {'learning_rate': 0.3743811755676604, 'n_estimators': 105, 'max_depth': 5, 'max_leaves': 42, 'max_bin': 34, 'gamma': 3}. Best is trial 21 with value: -8.545447452500282.
[I 2024-03-05 23:00:06,571] Trial 35 finished with value: -8.129276107202934 and parameters: {'learning_rate': 0.06740534512960786, 'n_estimators': 123, 'max_depth': 2, 'max_leaves': 47, 'max_bin': 33, 'gamma': 3

XGB -8.129276107202934 10.655416532121972 309.0461921707954 157.2042476109096


[I 2024-03-05 23:00:12,414] Trial 1 finished with value: -9.007271369368588 and parameters: {'learning_rate': 0.021865067121023492, 'n_estimators': 155, 'max_depth': 9, 'num_leaves': 6, 'min_child_samples': 5, 'subsample': 0.9185931992055312, 'colsample_bytree': 0.4343921407762077}. Best is trial 1 with value: -9.007271369368588.
[I 2024-03-05 23:00:12,557] Trial 2 finished with value: -9.35837353834492 and parameters: {'learning_rate': 0.15416263806994593, 'n_estimators': 94, 'max_depth': 40, 'num_leaves': 9, 'min_child_samples': 5, 'subsample': 0.616246314184996, 'colsample_bytree': 0.9860193735807611}. Best is trial 1 with value: -9.007271369368588.
[I 2024-03-05 23:00:12,763] Trial 3 finished with value: -13.64590152382586 and parameters: {'learning_rate': 0.42331903358307293, 'n_estimators': 248, 'max_depth': 35, 'num_leaves': 6, 'min_child_samples': 14, 'subsample': 0.4445243533199511, 'colsample_bytree': 0.5316928920805017}. Best is trial 1 with value: -9.007271369368588.
[I 202

[I 2024-03-05 23:00:20,843] Trial 26 finished with value: -9.295438413104927 and parameters: {'learning_rate': 0.20497364193114287, 'n_estimators': 104, 'max_depth': 24, 'num_leaves': 2, 'min_child_samples': 2, 'subsample': 0.5906286238454056, 'colsample_bytree': 0.9198587028081391}. Best is trial 12 with value: -8.734837504496886.
[I 2024-03-05 23:00:21,184] Trial 27 finished with value: -13.761304112400108 and parameters: {'learning_rate': 0.3090709962420246, 'n_estimators': 142, 'max_depth': 15, 'num_leaves': 25, 'min_child_samples': 7, 'subsample': 0.7184036006244846, 'colsample_bytree': 0.22525490418353544}. Best is trial 12 with value: -8.734837504496886.
[I 2024-03-05 23:00:21,550] Trial 28 finished with value: -8.868710594581549 and parameters: {'learning_rate': 0.07123269540296198, 'n_estimators': 71, 'max_depth': 41, 'num_leaves': 49, 'min_child_samples': 4, 'subsample': 0.8251170180869597, 'colsample_bytree': 0.38307023541703467}. Best is trial 12 with value: -8.734837504496

LGBM -8.734837504496886 11.662476826703262 364.1744088589248 269.11295441207966


[I 2024-03-05 23:00:33,808] Trial 0 finished with value: -13.979547731304478 and parameters: {'max_features': 'log2', 'n_estimators': 63, 'max_depth': 18, 'min_samples_leaf': 20}. Best is trial 0 with value: -13.979547731304478.
[I 2024-03-05 23:00:35,396] Trial 1 finished with value: -10.647985240129241 and parameters: {'max_features': 'log2', 'n_estimators': 145, 'max_depth': 190, 'min_samples_leaf': 2}. Best is trial 1 with value: -10.647985240129241.
[I 2024-03-05 23:00:36,470] Trial 2 finished with value: -12.410210345156223 and parameters: {'max_features': 'sqrt', 'n_estimators': 131, 'max_depth': 83, 'min_samples_leaf': 18}. Best is trial 1 with value: -10.647985240129241.
[I 2024-03-05 23:00:38,909] Trial 3 finished with value: -10.597215488928832 and parameters: {'max_features': 'sqrt', 'n_estimators': 251, 'max_depth': 60, 'min_samples_leaf': 6}. Best is trial 3 with value: -10.597215488928832.
[I 2024-03-05 23:00:41,130] Trial 4 finished with value: -11.71128486164705 and pa

[I 2024-03-05 23:01:50,836] Trial 36 finished with value: -10.446044233470849 and parameters: {'max_features': 'sqrt', 'n_estimators': 260, 'max_depth': 51, 'min_samples_leaf': 5}. Best is trial 32 with value: -9.580251356051921.
[I 2024-03-05 23:01:52,143] Trial 37 finished with value: -10.980586655105272 and parameters: {'max_features': 'sqrt', 'n_estimators': 121, 'max_depth': 28, 'min_samples_leaf': 7}. Best is trial 32 with value: -9.580251356051921.
[I 2024-03-05 23:01:55,210] Trial 38 finished with value: -10.039586202849517 and parameters: {'max_features': 'log2', 'n_estimators': 227, 'max_depth': 14, 'min_samples_leaf': 1}. Best is trial 32 with value: -9.580251356051921.
[I 2024-03-05 23:01:57,153] Trial 39 finished with value: -9.871287930482094 and parameters: {'max_features': 'sqrt', 'n_estimators': 154, 'max_depth': 82, 'min_samples_leaf': 3}. Best is trial 32 with value: -9.580251356051921.
[I 2024-03-05 23:01:58,814] Trial 40 finished with value: -11.775991726557514 and

RF -9.577795851412322 13.900826495073362 396.99112940277314 154.83863149276195


[I 2024-03-05 23:02:19,945] Trial 0 finished with value: -6377.900666117326 and parameters: {'learning_rate': 0.6572198495289885, 'n_estimators': 222, 'max_depth': 152, 'min_samples_split': 6, 'min_samples_leaf': 5, 'subsample': 0.18295781112942514}. Best is trial 0 with value: -6377.900666117326.
[I 2024-03-05 23:02:26,393] Trial 1 finished with value: -9.529561248733849 and parameters: {'learning_rate': 0.2673997529518659, 'n_estimators': 54, 'max_depth': 71, 'min_samples_split': 8, 'min_samples_leaf': 9, 'subsample': 0.9547330282676173}. Best is trial 1 with value: -9.529561248733849.
[I 2024-03-05 23:02:34,961] Trial 2 finished with value: -8.954625193790577 and parameters: {'learning_rate': 0.2016258277911086, 'n_estimators': 151, 'max_depth': 189, 'min_samples_split': 4, 'min_samples_leaf': 4, 'subsample': 0.4414991645316042}. Best is trial 2 with value: -8.954625193790577.
[I 2024-03-05 23:02:39,538] Trial 3 finished with value: -20.76414254317176 and parameters: {'learning_rate

[I 2024-03-05 23:06:11,145] Trial 28 finished with value: -8.285955781202032 and parameters: {'learning_rate': 0.06715761255965912, 'n_estimators': 72, 'max_depth': 146, 'min_samples_split': 5, 'min_samples_leaf': 1, 'subsample': 0.5374387868959568}. Best is trial 15 with value: -7.914402518871215.
[I 2024-03-05 23:06:32,713] Trial 29 finished with value: -8.477036027383276 and parameters: {'learning_rate': 0.45048215412709913, 'n_estimators': 187, 'max_depth': 144, 'min_samples_split': 6, 'min_samples_leaf': 4, 'subsample': 0.6680782035195132}. Best is trial 15 with value: -7.914402518871215.
[I 2024-03-05 23:06:33,643] Trial 30 finished with value: -1111.3453124897708 and parameters: {'learning_rate': 0.8924912735516317, 'n_estimators': 51, 'max_depth': 87, 'min_samples_split': 4, 'min_samples_leaf': 3, 'subsample': 0.24108243827231868}. Best is trial 15 with value: -7.914402518871215.
[I 2024-03-05 23:06:37,793] Trial 31 finished with value: -7.659966616387766 and parameters: {'lear

GB -7.659966616387766 12.937248770504212 461.66067665337306 248.72842023075572


[W 2024-03-05 23:09:51,761] Trial 0 failed with parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'identity', 'solver': 'sgd', 'alpha': 15140.617681184025} because of the following error: The value nan is not acceptable.
[W 2024-03-05 23:09:51,763] Trial 0 failed with value nan.
[I 2024-03-05 23:09:52,036] Trial 1 finished with value: -6.610724621753541e+134 and parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'relu', 'solver': 'sgd', 'alpha': 1.4245200095066528e-05}. Best is trial 1 with value: -6.610724621753541e+134.
[I 2024-03-05 23:09:52,473] Trial 2 finished with value: -23.18445472715947 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 22669.649495305846}. Best is trial 2 with value: -23.18445472715947.
[I 2024-03-05 23:09:56,679] Trial 3 finished with value: -27.854511691472982 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'solver': 'adam', 'alpha': 60078.28092105243}. Best is trial 2 wi

[W 2024-03-05 23:10:54,126] Trial 31 failed with parameters: {'hidden_layer_sizes': (50,), 'activation': 'identity', 'solver': 'sgd', 'alpha': 99.10000104103007} because of the following error: The value nan is not acceptable.
[W 2024-03-05 23:10:54,127] Trial 31 failed with value nan.
[W 2024-03-05 23:10:55,504] Trial 32 failed with parameters: {'hidden_layer_sizes': (50,), 'activation': 'identity', 'solver': 'sgd', 'alpha': 57.28799162864752} because of the following error: The value nan is not acceptable.
[W 2024-03-05 23:10:55,506] Trial 32 failed with value nan.
[W 2024-03-05 23:10:56,813] Trial 33 failed with parameters: {'hidden_layer_sizes': (50,), 'activation': 'identity', 'solver': 'sgd', 'alpha': 69.19243184316122} because of the following error: The value nan is not acceptable.
[W 2024-03-05 23:10:56,814] Trial 33 failed with value nan.
[W 2024-03-05 23:10:58,148] Trial 34 failed with parameters: {'hidden_layer_sizes': (50,), 'activation': 'identity', 'solver': 'sgd', 'alph

ANN -9.996286448794029 17.045429339213317 575.6128432295866 155.79262680242772


[I 2024-03-05 23:11:19,631] Trial 0 finished with value: -29.011855675190283 and parameters: {'kernel': 0.5**2 * RationalQuadratic(alpha=1, length_scale=1), 'alpha': 5928647174.024457}. Best is trial 0 with value: -29.011855675190283.
[I 2024-03-05 23:11:19,943] Trial 1 finished with value: -29.01185567520459 and parameters: {'kernel': 0.1**2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.01), 'alpha': 378150093.4768778}. Best is trial 0 with value: -29.011855675190283.
[I 2024-03-05 23:11:20,217] Trial 2 finished with value: -29.011852104205747 and parameters: {'kernel': RBF(length_scale=1) + 1.41**2, 'alpha': 7625254438.795543}. Best is trial 2 with value: -29.011852104205747.
[I 2024-03-05 23:11:20,488] Trial 3 finished with value: -29.011851116912382 and parameters: {'kernel': RBF(length_scale=1) + 1.41**2, 'alpha': 5973678860.655819}. Best is trial 3 with value: -29.011851116912382.
[I 2024-03-05 23:11:22,451] Trial 4 finished with value: -29.01185567520459 and parameters: {'

[I 2024-03-05 23:11:32,967] Trial 36 finished with value: -26.11632787004309 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 3408731422.816139}. Best is trial 21 with value: -23.34320473375313.
[I 2024-03-05 23:11:33,170] Trial 37 finished with value: -23.953836814513082 and parameters: {'kernel': DotProduct(sigma_0=1) + WhiteKernel(noise_level=1), 'alpha': 15301163.1896168}. Best is trial 21 with value: -23.34320473375313.
[I 2024-03-05 23:11:35,105] Trial 38 finished with value: -29.01185567520459 and parameters: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5), 'alpha': 2283398396.0498924}. Best is trial 21 with value: -23.34320473375313.
[I 2024-03-05 23:11:35,404] Trial 39 finished with value: -29.011852791621692 and parameters: {'kernel': RBF(length_scale=1) + 1.41**2, 'alpha': 9443035416.225826}. Best is trial 21 with value: -23.34320473375313.
[I 2024-03-05 23:11:35,706] Trial 40 finished with value: -29.011840832998146 and parameters: {'

GPR -23.320734789384375 21.914804028579766 962.1333617828723 262.681917815002


In [None]:
with open('best_model_onlyreg.pickle', 'wb') as f:
    pickle.dump(best_model_onlyreg, f, pickle.HIGHEST_PROTOCOL)

In [None]:

# X_t = X_t_old[selected_features] 
# X_t['cluster'] = train_data_tr['remainder__clusters']
# X_t['remainder__link_counts'] = train_data_tr['remainder__link_counts']
# X_v = validate_data_tr[selected_features]
# X_v['cluster'] = validate_data_tr['remainder__clusters']
# X_v['remainder__link_counts'] = validate_data_tr['remainder__link_counts']
# X_te = test_data_tr[selected_features]
# X_te['cluster'] = test_data_tr['remainder__clusters']
# X_te['remainder__link_counts'] = test_data_tr['remainder__link_counts']

## old/temp record(not used anymore)

In [None]:
model = xgb.XGBRegressor(random_state=101)
opt = BayesSearchCV(
    model,
    param_space['XGB'],
    n_iter=50,  # Adjust the number of iterations based on your computational resources
    cv=ShuffleSplit(test_size=0.20, n_splits=3),  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

opt.fit(X_t, y_t)
best_params = opt.best_params_ 

model = xgb.XGBRegressor(**best_params, random_state=101)
model.fit(X_t, y_t)
te_predictions = model.predict(X_te)
te_mae = mean_absolute_error(y_te, te_predictions)
print(te_mae)

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(model.apply(X_t))
rf_lr = LogisticRegressionCV(cv=ShuffleSplit(test_size=0.20, n_splits=3))
rf_lr.fit(X_t_leaves, y_t)
X_te_leaves = onehot.transform(model.apply(X_te))
y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
te_rflr_mae = mean_absolute_error(y_te, y_te_pred_rf_lr)
print(te_rflr_mae)

In [None]:
v_mae_list = []
te_mae_list = []
v_rf_mae_list = []
te_rf_mae_list = []
# X_te['cluster'] = 1
for cluster_label in list(set(X_t['cluster'])):
    X_v_cluster, X_te_cluster = None, None
    v_mae, te_mae, v_rflr_mae, te_rflr_mae = None, None, None, None
    # Subset the training data for the current cluster
    X_t_cluster = X_t[X_t['cluster'] == cluster_label]
    y_t_cluster = X_t_cluster['remainder__link_counts']
    X_t_cluster = X_t_cluster.drop(columns=['remainder__link_counts','cluster'])  
    
    if cluster_label in list(set(X_v['cluster'])):
        X_v_cluster = X_v[X_v['cluster'] == cluster_label]
        y_v_cluster = X_v_cluster['remainder__link_counts']
        X_v_cluster = X_v_cluster.drop(columns=['remainder__link_counts','cluster'])  
    if cluster_label in list(set(X_te['cluster'])):
        X_te_cluster = X_te[X_te['cluster'] == cluster_label]
        y_te_cluster = X_te_cluster['remainder__link_counts']
        X_te_cluster = X_te_cluster.drop(columns=['remainder__link_counts','cluster'])  
    

    # Train a regression model for the current cluster
    # model = RidgeCV(scoring='neg_mean_absolute_error')
    # model = RandomForestRegressor(criterion='friedman_mse', n_estimators=300,
    #                   random_state=101)

    if not ((X_v_cluster is None) and (X_te_cluster is None)):

        model = xgb.XGBRegressor(random_state=101)
        opt = BayesSearchCV(
            model,
            param_space['XGB'],
            n_iter=60,  # Adjust the number of iterations based on your computational resources
            cv=ShuffleSplit(test_size=0.20, n_splits=1),  # Adjust the number of cross-validation folds
            scoring='neg_mean_absolute_error',  # Use a suitable regression metric
            n_jobs=-1,
        )
        
        opt.fit(X_t_cluster, y_t_cluster)
        best_params = opt.best_params_ 
        onehot = OneHotEncoder()
        model = xgb.XGBRegressor(**best_params, random_state=101)
        model.fit(X_t_cluster, y_t_cluster)

        X_t_leaves = onehot.fit_transform(model.apply(X_t_cluster))
        rf_lr = LogisticRegression()
        rf_lr.fit(X_t_leaves, y_t_cluster)
        if (X_v_cluster is not None):
            v_predictions = model.predict(X_v_cluster)
            v_mae = mean_absolute_error(y_v_cluster, v_predictions)
            row_count = len(y_v_cluster)
            v_mae_list.append(v_mae*row_count)

            X_v_leaves = onehot.transform(model.apply(X_v_cluster))
            y_v_pred_rf_lr = rf_lr.predict(X_v_leaves)
            row_count = len(y_v_pred_rf_lr)
            v_rflr_mae = mean_absolute_error(y_v_cluster, y_v_pred_rf_lr)
            v_rf_mae_list.append(v_rflr_mae*row_count)
        if (X_te_cluster is not None):
            te_predictions = model.predict(X_te_cluster)
            te_mae = mean_absolute_error(y_te_cluster, te_predictions)
            row_count = len(y_te_cluster)
            te_mae_list.append(te_mae*row_count)
      
            X_te_leaves = onehot.transform(model.apply(X_te_cluster))
            y_te_pred_rf_lr = rf_lr.predict(X_te_leaves)
            row_count = len(y_te_pred_rf_lr)
            te_rflr_mae = mean_absolute_error(y_te_cluster, y_te_pred_rf_lr)
            te_rf_mae_list.append(te_rflr_mae*row_count)
    print(cluster_label, v_mae, te_mae, v_rflr_mae, te_rflr_mae)
v_mae_list = list(filter(lambda item: item is not None, v_mae_list))    
te_mae_list = list(filter(lambda item: item is not None, te_mae_list))    
v_mean_mae = sum(v_mae_list)/len(X_v)
te_mean_mae = sum(te_mae_list)/len(X_te)
print(v_mean_mae)
print(te_mean_mae)    

In [None]:
best_params = None
best_mae = 0.0

# Generate all combinations of hyperparameters
param_combinations = product(*param_grid.values())
# Loop through hyperparameter combinations
for params in param_combinations:
    # Create and train the model with the current hyperparameters
    model = lgb.LGBMRegressor(**dict(zip(param_grid.keys(), params)), random_state=101)
    model.fit(X_t[X_v.columns], y_t)

    # Evaluate the model on the validation dataset
    y_val_pred = model.predict(X_v)
    mae = mean_absolute_error(y_v, y_val_pred)

    # Check if current hyperparameters are the best
    if mae > best_mae:
        best_mae = mae
        best_params = dict(zip(param_grid.keys(), params))
print(best_mae)
print(best_params)
# Train the final model using the entire training dataset with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Define the LGBMRegressor model
lgbm_model = lgb.LGBMRegressor(random_state=101, force_col_wise=True)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    lgbm_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model with the best hyperparameters
final_model = lgb.LGBMRegressor(**best_params, random_state=101, force_col_wise=True)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

print("With LGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))


In [None]:
print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.predict(X_t, pred_leaf=True))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With LGB + LR Pred")
X_t_leaves = onehot.transform(final_model.predict(X_t, pred_leaf=True))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.predict(X_te, pred_leaf=True))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 20),
    'max_leaves': np.arange(2, 50),
    'max_bin': np.arange(2, 50),
    'gamma': np.arange(1, 20),
    # 'min_child_weight': np.arange(0, 20),
    # 'subsample': np.arange(0.1, 1.0, 0.1),
    # 'colsample_bytree': np.arange(0.1, 1.0, 0.1),
    # 'reg_alpha': np.arange(0, 100),
    # 'reg_lambda': np.arange(0, 10, 0.01),
}

# Define the LGBMRegressor model
xgb_model = xgb.XGBRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    xgb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t[X_v.columns], y_t, eval_set = [(X_v, y_v)])
# 
# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)

# Train the final model using the entire training dataset with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t, eval_set = [(X_te, y_te)])

# Evaluate the final model on the test dataset
y_test_pred = final_model.predict(X_te)
test_mae = mean_absolute_error(y_te, y_test_pred)
print(test_mae)

In [None]:
# Train the final model with the best hyperparameters
final_model = xgb.XGBRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With XGB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))


# # Make predictions on the validation set
# y_pred = final_model.predict(X_te)

# # Evaluate the model on the validation set
# mae = mean_absolute_error(y_te, y_pred)
# print(f'Mean Absolute Error on Validation Set: {mae}')

In [None]:
param_space =  {
    'max_features': Categorical(['sqrt', 'log2']),
    'n_estimators': np.arange(50, 2001, 50),
    'bootstrap': Categorical([True, False]),
    'max_depth': np.arange(1, 20),
    'min_samples_leaf': np.arange(1, 20),
    'criterion': Categorical(['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
}
# Define the LGBMRegressor model
rf_model = RandomForestRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    rf_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
# Train the final model with the best hyperparameters
final_model = RandomForestRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With RF Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
param_space = {
    'learning_rate': np.arange(0.01, 1.0, 0.01),
    'n_estimators': np.arange(50, 2001, 50),
    'max_depth': np.arange(1, 200),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 10),
    'subsample': np.arange(0.1, 1.0, 0.1),
}
# Define the LGBMRegressor model
gb_model = GradientBoostingRegressor(random_state=101)

# Create a Bayesian optimization object
opt = BayesSearchCV(
    gb_model,
    param_space,
    n_iter=100,  # Adjust the number of iterations based on your computational resources
    cv=3,  # Adjust the number of cross-validation folds
    scoring='neg_mean_absolute_error',  # Use a suitable regression metric
    n_jobs=-1,
)

# Perform Bayesian optimization
opt.fit(X_t, y_t)

# Get the best hyperparameters
best_params = opt.best_params_
print(opt.best_estimator_)


In [None]:
final_model = GradientBoostingRegressor(**best_params, random_state=101)
final_model.fit(X_t, y_t)

print("With GB Pred")
y_t_pred = final_model.predict(X_t)
print(mean_absolute_error(y_t, y_t_pred))

y_te_pred = final_model.predict(X_te)
print(mean_absolute_error(y_te, y_te_pred))

print("---------------")

onehot = OneHotEncoder()
X_t_leaves = onehot.fit_transform(final_model.apply(X_t))
xgb_lr = LogisticRegression()
xgb_lr.fit(X_t_leaves, y_t)

print("With XGB + LR Pred")
X_t_leaves = onehot.transform(final_model.apply(X_t))
y_t_pred_xgb_lr = xgb_lr.predict(X_t_leaves)
print(mean_absolute_error(y_t, y_t_pred_xgb_lr))

X_te_leaves = onehot.transform(final_model.apply(X_te))
y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
print(mean_absolute_error(y_te, y_te_pred_xgb_lr))

In [None]:
def set_randomstate(model):
    current_params = model.get_params()

    # Update the random_state if it exists, otherwise add it to the parameters
    current_params['random_state'] = 101

    # Set the modified parameters back to the model
    model.set_params(**current_params)
    return model

kf = KFold(n_splits=5, shuffle=True, random_state=101)
models = {
    # 'Linear Regression': LinearRegression(),
    # 'Lasso': LassoCV(cv=kf, random_state=42, max_iter=200000),
    # 'Logistic': LogisticRegression(),
    # 'KNN': pd.read_pickle("result_KNN.pkl")['estimator'],
    'XGB': set_randomstate(pd.read_pickle("result_XGB.pkl")['estimator']),
    'LGBM': set_randomstate(pd.read_pickle("result_LGBM.pkl")['estimator']),
    # 'Ridge': RidgeCV(cv=kf),
    # 'SVR': pd.read_pickle("result_SVR.pkl")['estimator'],
    # 'Random Forest': RandomForestRegressor(bootstrap=False, max_depth=15, max_features=0.7,
    #                   min_samples_leaf=9, n_estimators=200, random_state=101),
    # 'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.01, max_depth=14, min_samples_leaf=3,
    #                       min_samples_split=4, n_estimators=950, random_state=101, subsample=0.8),
    # 'Artificial Neural Network': MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=(50, 50),
    #          max_iter=4000, random_state=101),
    # 'Gaussian Process Regression': GaussianProcessRegressor(0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5)))
}

In [None]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
        scoring=['neg_mean_absolute_error','neg_mean_squared_error']
        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=101)

        # Perform k-fold cross-validation and calculate MSE and MAE
        scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, n_jobs=-1)

        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()

        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Train and evaluate
results = evaluate_models(models, X_t, y_t)


In [None]:
def model_lr_evaluation(models, X_train, y_train, X_test, y_test):
    results = {}
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']

    for name, model in models.items():
        final_model = model
        
        scores = cross_validate(model, X_train, y_train, cv=3, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        
        final_model.fit(X_train, y_train)
        y_te_pred = final_model.predict(X_test)
        print(mean_absolute_error(y_test, y_te_pred), mean_squared_error(y_test, y_te_pred))
        
        print("---------------")
        
        onehot = OneHotEncoder()
        if name == 'LGBM':
            X_t_leaves = onehot.fit_transform(final_model.predict(X_train, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.fit_transform(final_model.apply(X_train))
        xgb_lr = LogisticRegression()
        xgb_lr.fit(X_t_leaves, y_train)

        if name == 'LGBM':
            X_t_leaves = onehot.transform(final_model.predict(X_train, pred_leaf=True))
            X_te_leaves = onehot.transform(final_model.predict(X_test, pred_leaf=True))
        elif name == 'XGB':
            X_t_leaves = onehot.transform(final_model.apply(X_train))
            X_te_leaves = onehot.transform(final_model.apply(X_test))

        scores = cross_validate(model, X_t_leaves, y_train, cv=kf, scoring=scoring, n_jobs=-1)
        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()
        print(mean_mae, mean_mse, std_mse) 
        
        y_te_pred_xgb_lr = xgb_lr.predict(X_te_leaves)
        print(mean_absolute_error(y_test, y_te_pred_xgb_lr), mean_squared_error(y_test, y_te_pred_xgb_lr))

model_lr_evaluation(models, X_t, y_t, X_te, y_te)

In [None]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [None]:
import pickle

result_final_with_test = {}
for name, model in models.items():
    result_final_with_test[name] = evaluate_models_with_test(model, X_t, y_t, X_te, y_te)

result_final_with_test

In [None]:
train_data_0 = train_data[train_data['clusters']==0]
train_data_1 = train_data[train_data['clusters']==1]
validate_data_0 = validate_data[validate_data['clusters']==0]
validate_data_1 = validate_data[validate_data['clusters']==1]
test_data_0 = test_data[test_data['clusters']==0]
test_data_1 = test_data[test_data['clusters']==1]
test_data_2 = test_data[test_data['clusters']==2]
test_data_1 = pd.concat([test_data_1, test_data_2], ignore_index=True)

import matplotlib.pyplot as plt

# Assuming you have three DataFrames: df1, df2, and df3

# Plotting DataFrame 1 in red
plt.plot(train_data_0['start_node_x'], train_data_0['start_node_y'], 'ro-', label='DF1 Start')
plt.plot(train_data_0['end_node_x'], train_data_0['end_node_y'], 'ro-', label='DF1 End')

# Plotting DataFrame 2 in green
plt.plot(train_data_1['start_node_x'], train_data_1['start_node_y'], 'go-', label='DF2 Start')
plt.plot(train_data_1['end_node_x'], train_data_1['end_node_y'], 'go-', label='DF2 End')

# Plotting DataFrame 3 in blue
plt.plot(validate_data_0['start_node_x'], validate_data_0['start_node_y'], 'bo-', label='DF3 Start')
plt.plot(validate_data_0['end_node_x'], validate_data_0['end_node_y'], 'bo-', label='DF3 End')
plt.plot(validate_data_1['start_node_x'], validate_data_1['start_node_y'], 'yo-', label='DF4 Start')
plt.plot(validate_data_1['end_node_x'], validate_data_1['end_node_y'], 'yo-', label='DF4 End')

plt.plot(test_data_0['start_node_x'], test_data_0['start_node_y'], 'co-', label='DF5 Start')
plt.plot(test_data_0['end_node_x'], test_data_0['end_node_y'], 'co-', label='DF5 End')
plt.plot(test_data_1['start_node_x'], test_data_1['start_node_y'], 'mo-', label='DF6 Start')
plt.plot(test_data_1['end_node_x'], test_data_1['end_node_y'], 'mo-', label='DF6 End')
# plt.plot(test_data_2['start_node_x'], test_data_2['start_node_y'], 'ko-', label='DF7 Start')
# plt.plot(test_data_2['end_node_x'], test_data_2['end_node_y'], 'ko-', label='DF7 End')
# Add labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Show the plot
plt.show()