In [40]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

In [41]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'link_length', 'link_freespeed', 
                      'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 
                      'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes',
                      'income', 'score', 'income_avg', 'score_avg'
                     ]
category_feature = ['type', 'home-activity-zone']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
#     'KNN': KNeighborsRegressor(),
    # 'XGB': xgb.XGBRegressor(random_state=101),
#     'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
#     'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
#     'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
    'Linear': LinearRegression(),
    'Lasso': LassoCV(random_state=42, max_iter=100000),
    'Ridge': RidgeCV(),
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'Linear': {  
},
'Lasso': {
},
'Ridge': {  
},
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
# 'GRNN':{
#     'sigma' : np.arange(0.1, 4, 0.01)
# }
}

In [8]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

In [9]:
# Find the indices where 'link_id' is 0
indices = Big_data.index[Big_data['link_id'] == 0].tolist()

# Add the end of the DataFrame to the indices list
indices.append(len(Big_data))

# Split the DataFrame using the indices
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
list_od = []
list_nodes = []
all_files = train_files + validate_files + test_files
for i in all_files:
    with open(i) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

In [10]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['planxml'] = cluster2.fit_predict(Big_data_new[['income', 'score', 'rush_hour',
                                                               'max_dur', 'cemdapStopDuration_s']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [11]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

# Classification and regression

In [12]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [13]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-03-14 16:44:23,916] A new study created in memory with name: no-name-54add620-3906-48df-afe7-9b7c096cd59c
[I 2024-03-14 16:44:25,955] Trial 0 finished with value: 0.9191844755588308 and parameters: {'selector__k': 32, 'model__n_neighbors': 24, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 0.9191844755588308.
[I 2024-03-14 16:44:28,223] Trial 1 finished with value: 0.9026037828543355 and parameters: {'selector__k': 11, 'model__n_neighbors': 24, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 0 with value: 0.9191844755588308.
[I 2024-03-14 16:44:30,477] Trial 2 finished with value: 0.8967084254482928 and parameters: {'selector__k': 17, 'model__n_neighbors': 10, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: 0.9191844755588308.
[I 2024-03-14 16:44:32,163] Trial 3 finished with value: 0.8160157209530828 and parameters: {'selector__k': 42, 'model__n_neighbors': 5, 'model__we

[I 2024-03-14 16:45:53,601] Trial 33 finished with value: 0.902849422746254 and parameters: {'selector__k': 18, 'model__n_neighbors': 26, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 27 with value: 0.919921395234586.
[I 2024-03-14 16:45:55,490] Trial 34 finished with value: 0.9134119380987472 and parameters: {'selector__k': 24, 'model__n_neighbors': 12, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 27 with value: 0.919921395234586.
[I 2024-03-14 16:46:02,456] Trial 35 finished with value: 0.9180790960451978 and parameters: {'selector__k': 35, 'model__n_neighbors': 23, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 27 with value: 0.919921395234586.
[I 2024-03-14 16:46:05,281] Trial 36 finished with value: 0.9161139769098502 and parameters: {'selector__k': 31, 'model__n_neighbors': 8, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 27 with value: 0.919921395234586.
[I 2

KNN 0.919921395234586 0.9324376695517375


[I 2024-03-14 16:47:00,815] Trial 0 finished with value: 0.9234831736674036 and parameters: {'selector__k': 39, 'model__learning_rate': 0.49291402738964324, 'model__n_estimators': 1135, 'model__max_depth': 12, 'model__num_leaves': 41, 'model__min_child_samples': 11, 'model__subsample': 0.649303682637739, 'model__colsample_bytree': 0.46420961323084364}. Best is trial 0 with value: 0.9234831736674036.
[I 2024-03-14 16:47:14,263] Trial 1 finished with value: 0.9196757553426677 and parameters: {'selector__k': 29, 'model__learning_rate': 0.33367311608935507, 'model__n_estimators': 908, 'model__max_depth': 13, 'model__num_leaves': 48, 'model__min_child_samples': 7, 'model__subsample': 0.769048336275233, 'model__colsample_bytree': 0.17079529976027527}. Best is trial 0 with value: 0.9234831736674036.
[I 2024-03-14 16:47:17,226] Trial 2 finished with value: 0.8188405797101449 and parameters: {'selector__k': 44, 'model__learning_rate': 0.8274232718556764, 'model__n_estimators': 177, 'model__max_

[I 2024-03-14 16:49:40,938] Trial 21 finished with value: 0.9250798329648735 and parameters: {'selector__k': 48, 'model__learning_rate': 0.26500358748756103, 'model__n_estimators': 1730, 'model__max_depth': 28, 'model__num_leaves': 20, 'model__min_child_samples': 14, 'model__subsample': 0.5008980668517066, 'model__colsample_bytree': 0.9495982370332877}. Best is trial 17 with value: 0.9272905919921395.
[I 2024-03-14 16:50:23,674] Trial 22 finished with value: 0.9265536723163842 and parameters: {'selector__k': 45, 'model__learning_rate': 0.2561320105268826, 'model__n_estimators': 2941, 'model__max_depth': 21, 'model__num_leaves': 33, 'model__min_child_samples': 12, 'model__subsample': 0.5327396561966159, 'model__colsample_bytree': 0.8693508900820248}. Best is trial 17 with value: 0.9272905919921395.
[I 2024-03-14 16:50:43,021] Trial 23 finished with value: 0.9258167526406288 and parameters: {'selector__k': 44, 'model__learning_rate': 0.400204571704346, 'model__n_estimators': 1882, 'model

[I 2024-03-14 16:55:05,597] Trial 42 finished with value: 0.9260623925325473 and parameters: {'selector__k': 43, 'model__learning_rate': 0.22045874008859584, 'model__n_estimators': 1613, 'model__max_depth': 19, 'model__num_leaves': 36, 'model__min_child_samples': 8, 'model__subsample': 0.1458100017467679, 'model__colsample_bytree': 0.834102147417585}. Best is trial 24 with value: 0.9281503316138541.
[I 2024-03-14 16:55:39,903] Trial 43 finished with value: 0.9300540407762221 and parameters: {'selector__k': 38, 'model__learning_rate': 0.13461059513573087, 'model__n_estimators': 2142, 'model__max_depth': 15, 'model__num_leaves': 40, 'model__min_child_samples': 13, 'model__subsample': 0.35369728444776016, 'model__colsample_bytree': 0.9063086157661319}. Best is trial 43 with value: 0.9300540407762221.
[I 2024-03-14 16:56:21,216] Trial 44 finished with value: 0.9225006140997298 and parameters: {'selector__k': 34, 'model__learning_rate': 0.14552206084178665, 'model__n_estimators': 2506, 'mod

LGBM 0.9300540407762221 0.9435473453042242


[I 2024-03-14 16:59:41,292] Trial 0 finished with value: 0.9147629575042987 and parameters: {'selector__k': 35, 'model__max_features': 'log2', 'model__n_estimators': 347, 'model__max_depth': 163, 'model__min_samples_leaf': 3}. Best is trial 0 with value: 0.9147629575042987.
[I 2024-03-14 17:00:11,543] Trial 1 finished with value: 0.9080078604765414 and parameters: {'selector__k': 37, 'model__max_features': 'sqrt', 'model__n_estimators': 266, 'model__max_depth': 134, 'model__min_samples_leaf': 18}. Best is trial 0 with value: 0.9147629575042987.
[I 2024-03-14 17:01:43,346] Trial 2 finished with value: 0.9073937607467453 and parameters: {'selector__k': 26, 'model__max_features': 'log2', 'model__n_estimators': 1165, 'model__max_depth': 177, 'model__min_samples_leaf': 8}. Best is trial 0 with value: 0.9147629575042987.
[I 2024-03-14 17:03:07,844] Trial 3 finished with value: 0.9042004421518055 and parameters: {'selector__k': 10, 'model__max_features': 'log2', 'model__n_estimators': 2717, '

[I 2024-03-14 17:18:50,739] Trial 30 finished with value: 0.9179562760992385 and parameters: {'selector__k': 45, 'model__max_features': 'log2', 'model__n_estimators': 388, 'model__max_depth': 141, 'model__min_samples_leaf': 2}. Best is trial 15 with value: 0.9263080324244657.
[I 2024-03-14 17:19:00,063] Trial 31 finished with value: 0.9236059936133628 and parameters: {'selector__k': 33, 'model__max_features': 'log2', 'model__n_estimators': 74, 'model__max_depth': 43, 'model__min_samples_leaf': 1}. Best is trial 15 with value: 0.9263080324244657.
[I 2024-03-14 17:19:08,410] Trial 32 finished with value: 0.9127978383689511 and parameters: {'selector__k': 35, 'model__max_features': 'log2', 'model__n_estimators': 73, 'model__max_depth': 77, 'model__min_samples_leaf': 3}. Best is trial 15 with value: 0.9263080324244657.
[I 2024-03-14 17:19:19,701] Trial 33 finished with value: 0.9166052566936871 and parameters: {'selector__k': 36, 'model__max_features': 'log2', 'model__n_estimators': 93, 'm

RF 0.9267993122083026 0.9452267148947164


In [None]:
import pickle
with open('best_model_clf.pickle', 'wb') as f:
    pickle.dump(best_model_clf, f, pickle.HIGHEST_PROTOCOL)
best_model_clf = pd.read_pickle("best_model_clf.pickle")

In [42]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [43]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [44]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-15 00:21:15,531] A new study created in memory with name: no-name-9a1865e2-40ad-453c-8cfc-97dd0a87f4dd
[I 2024-03-15 00:21:15,821] Trial 0 finished with value: -12.670366663780099 and parameters: {'selector__k': 14}. Best is trial 0 with value: -12.670366663780099.
[I 2024-03-15 00:21:16,157] Trial 1 finished with value: -16.854833525805756 and parameters: {'selector__k': 36}. Best is trial 0 with value: -12.670366663780099.
[I 2024-03-15 00:21:16,383] Trial 2 finished with value: -16.523989669643218 and parameters: {'selector__k': 22}. Best is trial 0 with value: -12.670366663780099.
[I 2024-03-15 00:21:16,615] Trial 3 finished with value: -16.494224716755845 and parameters: {'selector__k': 30}. Best is trial 0 with value: -12.670366663780099.
[I 2024-03-15 00:21:16,818] Trial 4 finished with value: -12.56146462893152 and parameters: {'selector__k': 19}. Best is trial 4 with value: -12.56146462893152.
[I 2024-03-15 00:21:17,181] Trial 5 finished with value: -16.226136913104

[I 2024-03-15 00:21:27,231] A new study created in memory with name: no-name-06461b51-ef89-4715-817f-fad669257be9


Linear -12.56146462893152 11.633064134096394 479.9708574243267 203.78731152804988


[I 2024-03-15 00:21:28,833] Trial 0 finished with value: -13.018343679690156 and parameters: {'selector__k': 48}. Best is trial 0 with value: -13.018343679690156.
[I 2024-03-15 00:21:30,283] Trial 1 finished with value: -13.014962629884124 and parameters: {'selector__k': 45}. Best is trial 1 with value: -13.014962629884124.
[I 2024-03-15 00:21:31,561] Trial 2 finished with value: -13.013351156335819 and parameters: {'selector__k': 44}. Best is trial 2 with value: -13.013351156335819.
[I 2024-03-15 00:21:32,477] Trial 3 finished with value: -12.612629806386462 and parameters: {'selector__k': 14}. Best is trial 3 with value: -12.612629806386462.
[I 2024-03-15 00:21:33,726] Trial 4 finished with value: -13.014962629884124 and parameters: {'selector__k': 45}. Best is trial 3 with value: -12.612629806386462.
[I 2024-03-15 00:21:35,064] Trial 5 finished with value: -12.671015976500088 and parameters: {'selector__k': 29}. Best is trial 3 with value: -12.612629806386462.
[I 2024-03-15 00:21:36

[I 2024-03-15 00:22:20,372] A new study created in memory with name: no-name-782a7ba2-b026-4c3a-8fc0-56b62edc02d3


Lasso -12.612111341145178 11.090910113036244 410.8523566256865 210.73379700048747


[I 2024-03-15 00:22:20,708] Trial 0 finished with value: -16.57866925903431 and parameters: {'selector__k': 33}. Best is trial 0 with value: -16.57866925903431.
[I 2024-03-15 00:22:20,987] Trial 1 finished with value: -12.730652403906063 and parameters: {'selector__k': 21}. Best is trial 1 with value: -12.730652403906063.
[I 2024-03-15 00:22:21,253] Trial 2 finished with value: -16.535338199280666 and parameters: {'selector__k': 23}. Best is trial 1 with value: -12.730652403906063.
[I 2024-03-15 00:22:21,709] Trial 3 finished with value: -16.8659571386577 and parameters: {'selector__k': 35}. Best is trial 1 with value: -12.730652403906063.
[I 2024-03-15 00:22:21,982] Trial 4 finished with value: -16.535338199280666 and parameters: {'selector__k': 23}. Best is trial 1 with value: -12.730652403906063.
[I 2024-03-15 00:22:22,315] Trial 5 finished with value: -16.845906678297016 and parameters: {'selector__k': 40}. Best is trial 1 with value: -12.730652403906063.
[I 2024-03-15 00:22:22,580

Ridge -12.560868873984644 11.605446685599029 477.2304373719873 204.10476971179108


# Only Regression

In [45]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

In [46]:
best_model_onlyreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(10, 48)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)



[I 2024-03-15 00:23:05,008] A new study created in memory with name: no-name-1f88a31f-1b0d-406d-aa4b-d36c4b58cdf1
[I 2024-03-15 00:23:05,583] Trial 0 finished with value: -14.146715418859506 and parameters: {'selector__k': 47}. Best is trial 0 with value: -14.146715418859506.
[I 2024-03-15 00:23:05,796] Trial 1 finished with value: -11.108574073953662 and parameters: {'selector__k': 12}. Best is trial 1 with value: -11.108574073953662.
[I 2024-03-15 00:23:06,275] Trial 2 finished with value: -14.141154998203142 and parameters: {'selector__k': 42}. Best is trial 1 with value: -11.108574073953662.
[I 2024-03-15 00:23:06,542] Trial 3 finished with value: -10.946256507279456 and parameters: {'selector__k': 17}. Best is trial 3 with value: -10.946256507279456.
[I 2024-03-15 00:23:07,154] Trial 4 finished with value: -14.13829726215299 and parameters: {'selector__k': 46}. Best is trial 3 with value: -10.946256507279456.
[I 2024-03-15 00:23:07,457] Trial 5 finished with value: -11.01356641381

[I 2024-03-15 00:23:21,190] A new study created in memory with name: no-name-d756e19d-a60a-4182-9633-37ab81436fc8


Linear -10.932716882326256 12.02956112614181 484.350622872108 204.4534845259334


[I 2024-03-15 00:23:22,735] Trial 0 finished with value: -11.355253410081092 and parameters: {'selector__k': 47}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:23:24,214] Trial 1 finished with value: -11.359393498041378 and parameters: {'selector__k': 43}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:23:25,631] Trial 2 finished with value: -11.359393498041378 and parameters: {'selector__k': 40}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:23:27,013] Trial 3 finished with value: -11.09027366368712 and parameters: {'selector__k': 37}. Best is trial 3 with value: -11.09027366368712.
[I 2024-03-15 00:23:28,371] Trial 4 finished with value: -11.03826102300287 and parameters: {'selector__k': 28}. Best is trial 4 with value: -11.03826102300287.
[I 2024-03-15 00:23:29,713] Trial 5 finished with value: -11.03826102300287 and parameters: {'selector__k': 26}. Best is trial 4 with value: -11.03826102300287.
[I 2024-03-15 00:23:31,297] 

Lasso -11.038258059547191 12.037470151500209 466.89108267403185 223.1110291610708


[I 2024-03-15 00:24:26,544] Trial 0 finished with value: -13.714841984652512 and parameters: {'selector__k': 33}. Best is trial 0 with value: -13.714841984652512.
[I 2024-03-15 00:24:26,828] Trial 1 finished with value: -11.00372583769677 and parameters: {'selector__k': 21}. Best is trial 1 with value: -11.00372583769677.
[I 2024-03-15 00:24:27,171] Trial 2 finished with value: -13.774736457162733 and parameters: {'selector__k': 36}. Best is trial 1 with value: -11.00372583769677.
[I 2024-03-15 00:24:27,621] Trial 3 finished with value: -13.91509759901855 and parameters: {'selector__k': 39}. Best is trial 1 with value: -11.00372583769677.
[I 2024-03-15 00:24:28,199] Trial 4 finished with value: -13.91509759901855 and parameters: {'selector__k': 39}. Best is trial 1 with value: -11.00372583769677.
[I 2024-03-15 00:24:28,552] Trial 5 finished with value: -11.007534112546068 and parameters: {'selector__k': 23}. Best is trial 1 with value: -11.00372583769677.
[I 2024-03-15 00:24:28,849] Tr

Ridge -10.932193326058163 12.001669653784276 481.6004396041761 204.74302954197816


# Regression w/o feature selection

In [47]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-15 00:24:55,082] A new study created in memory with name: no-name-0879fc64-a27a-479a-9e3b-861af0cff655
[I 2024-03-15 00:24:55,457] Trial 0 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03-15 00:24:55,787] Trial 1 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03-15 00:24:56,088] Trial 2 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03-15 00:24:56,360] Trial 3 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03-15 00:24:56,601] Trial 4 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03-15 00:24:56,902] Trial 5 finished with value: -14.206013599883764 and parameters: {}. Best is trial 0 with value: -14.206013599883764.
[I 2024-03

Linear -14.206013599883764 25.18371710012428 1416.9691949380042 223.68591915614297


[I 2024-03-15 00:25:14,015] Trial 0 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:15,484] Trial 1 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:16,933] Trial 2 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:18,310] Trial 3 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:19,998] Trial 4 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:21,867] Trial 5 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value: -11.355253410081092.
[I 2024-03-15 00:25:23,441] Trial 6 finished with value: -11.355253410081092 and parameters: {}. Best is trial 0 with value:

Lasso -11.355253410081092 12.0757732726152 467.95199317548077 221.66753955039087


[I 2024-03-15 00:26:34,673] Trial 0 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:35,002] Trial 1 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:35,338] Trial 2 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:35,653] Trial 3 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:35,979] Trial 4 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:36,417] Trial 5 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value: -14.197455069338432.
[I 2024-03-15 00:26:36,743] Trial 6 finished with value: -14.197455069338432 and parameters: {}. Best is trial 0 with value:

Ridge -14.197455069338432 25.166675012846074 1415.2334945390903 223.1453497903005


# GNN for classification & regression

In [16]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F
class GATNet(torch.nn.Module):
    def __init__(self, num_features, num_classes,
                hid, in_head, out_head, dor, extra_layer):
        super(GATNet, self).__init__()
        self.hid = hid
        self.in_head = in_head
        self.out_head = out_head
        self.dor = dor
        self.extra_layer = extra_layer
        self.gat1 = GATConv(num_features, self.hid, heads=self.in_head, dropout=self.dor)
        if self.extra_layer:
            self.gat2 = GATConv(self.hid*self.in_head, self.hid, heads=self.in_head, dropout=self.dor)
            self.gat3 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)
        else:
            self.gat2 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dor, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dor, training=self.training)
        if self.extra_layer:
            x = F.elu(self.gat2(x, edge_index))  # Add non-linearity after the second layer
            x = F.dropout(x, p=self.dor, training=self.training)
            x = self.gat3(x, edge_index) 
        else:
            x = self.gat2(x, edge_index)
        return x

In [17]:
all_features = list(temp_2.columns)
nodes_features = ['remainder__link_from', 'remainder__link_to']
drop_featrues = ['remainder__dataset', 'remainder__link_counts', 'used_link']
temp_features = list(set(all_features) - set(nodes_features))
other_features = list(set(temp_features) - set(drop_featrues))

In [21]:
import optuna
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

best_k = None
best_performance = float('inf')
performance_history = []

def objective(trial):
    # Hyperparameters to tune
    k = trial.suggest_int('k', 2, len(other_features))
    hid = trial.suggest_categorical('hid', [16, 32, 64, 128, 256, 512])
    in_head = trial.suggest_categorical('in_head', [1, 2, 4, 8, 16, 32])
    out_head = trial.suggest_categorical('out_head', [1, 2])
    dor = trial.suggest_categorical('dor', [0, 0.05, 0.1])
    extra_layer = trial.suggest_categorical('extra_layer', [False])
    
    # Create a tensor of your labels/targets
    y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    
    # Feature selection for the current k
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(temp_2[other_features], y)
    selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])
    
    edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
    x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, y=y)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_data = data.to(device)
    model = GATNet(k, 1, hid=hid, in_head=in_head, out_head=out_head, dor=dor, extra_layer=extra_layer).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion_MAE = torch.nn.L1Loss()
    def train():
        model.train()
        optimizer.zero_grad()
        out = model(train_data.x, train_data.edge_index)
        loss = criterion_MAE(out, train_data.y)
        loss.backward()
        optimizer.step()
        return loss
    for epoch in range(50):
        loss = train()

    return loss.item()

#     # Store the performance for each k
#     performance_history.append((k, test_loss))

#     # Update the best k if the current performance is better
#     if performance < best_performance:
#         best_performance = performance
#         best_k = k
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2024-03-14 17:38:38,710] A new study created in memory with name: no-name-1fe40b3d-3bc0-4447-88a4-6d6912475971
[I 2024-03-14 17:39:13,279] Trial 0 finished with value: 20.554716110229492 and parameters: {'k': 33, 'hid': 128, 'in_head': 2, 'out_head': 1, 'dor': 0.05, 'extra_layer': False}. Best is trial 0 with value: 20.554716110229492.
[I 2024-03-14 17:39:26,450] Trial 1 finished with value: 16.9612979888916 and parameters: {'k': 26, 'hid': 32, 'in_head': 1, 'out_head': 2, 'dor': 0, 'extra_layer': False}. Best is trial 1 with value: 16.9612979888916.
[I 2024-03-14 17:41:35,385] Trial 2 finished with value: 13.339067459106445 and parameters: {'k': 9, 'hid': 512, 'in_head': 4, 'out_head': 1, 'dor': 0.1, 'extra_layer': False}. Best is trial 2 with value: 13.339067459106445.
[I 2024-03-14 17:43:03,688] Trial 3 finished with value: 12.462443351745605 and parameters: {'k': 25, 'hid': 128, 'in_head': 8, 'out_head': 2, 'dor': 0, 'extra_layer': False}. Best is trial 3 with value: 12.46244335

[I 2024-03-14 18:27:44,498] Trial 36 finished with value: 12.145896911621094 and parameters: {'k': 24, 'hid': 64, 'in_head': 8, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 35 with value: 10.565139770507812.
[I 2024-03-14 18:27:55,595] Trial 37 finished with value: 14.622413635253906 and parameters: {'k': 17, 'hid': 128, 'in_head': 1, 'out_head': 2, 'dor': 0.1, 'extra_layer': False}. Best is trial 35 with value: 10.565139770507812.
[I 2024-03-14 18:29:53,256] Trial 38 finished with value: 12.2727632522583 and parameters: {'k': 27, 'hid': 64, 'in_head': 32, 'out_head': 2, 'dor': 0, 'extra_layer': False}. Best is trial 35 with value: 10.565139770507812.
[I 2024-03-14 18:30:15,552] Trial 39 finished with value: 11.274246215820312 and parameters: {'k': 18, 'hid': 256, 'in_head': 2, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 35 with value: 10.565139770507812.
[I 2024-03-14 18:31:54,459] Trial 40 finished with value: 13.575441360473633 and parameters: {'k'

In [22]:
best_params = study.best_params
best_k = best_params['k']
best_hid = best_params['hid']
best_in_head = best_params['in_head']
best_out_head = best_params['out_head']
best_dor = best_params['dor']
best_extra_layer = best_params['extra_layer']

In [23]:
# Feature selection for the current k
selector = SelectKBest(score_func=f_regression, k=best_k)
y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
X_new = selector.fit_transform(temp_2[other_features], y)
selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])

edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.to(device)

best_model = GATNet(best_k, 1, hid=best_hid, in_head=best_in_head,
                    out_head=best_out_head, dor=best_dor, extra_layer=best_extra_layer).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion_MAE = torch.nn.L1Loss()
def train():
    best_model.train()
    optimizer.zero_grad()
    out = best_model(train_data.x, train_data.edge_index)
    loss = criterion_MAE(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss
for epoch in range(250):
    loss = train()


In [39]:
test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
test_x = torch.tensor(X_te[selected_columns].values, dtype=torch.float)
test_y = torch.tensor(y_te.values, dtype=torch.float).unsqueeze(1)
test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)
test_data = test_data.to(device)

criterion_MSE = torch.nn.MSELoss()
def test(test_data):
    best_model.eval()
    with torch.no_grad():
        pred = best_model(test_data.x, test_data.edge_index)
        y_pred_all = np.concatenate([np.array(pd.DataFrame(pred).astype("float")[0]), np.array(X_te_0['y_pred'])])
        loss_MAE = criterion_MAE(pred, test_data.y)
        loss_MSE = criterion_MSE(pred, test_data.y)
    return loss_MAE.item(), loss_MSE.item()

test_loss = test(test_data)
test_loss

0       11.022389
1       11.858150
2        8.795033
3       10.348769
4       14.365703
          ...    
6605    20.636030
6606    24.730162
6607    24.730162
6608    49.950474
6609    49.937408
Name: 0, Length: 6610, dtype: float64


(11.724844932556152, 526.4126586914062)