In [23]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

In [24]:
def load_data_small(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)

            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']
        
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [75]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'length_per_capacity_ratio', 'speed_capacity_ratio', 'length_times_lanes', 'speed_times_capacity', 
                      'length_times', 'capacity_divided_by_lanes'
                     ]
scaler = StandardScaler()
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
#     'KNN': KNeighborsRegressor(),
#     'XGB': xgb.XGBRegressor(random_state=101),
#     'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
#     'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
#     'ANN': MLPRegressor(random_state=101),
#     'GPR': GaussianProcessRegressor(copy_X_train=False, random_state=101),
#     'SVR': SVR(),
    'Linear': LinearRegression(),
    'Lasso': LassoCV(random_state=42, max_iter=100000),
    'Ridge': RidgeCV(),
}
model_space_feature = {
    'SVR': RandomForestRegressor(random_state=101),
    'KNN': RandomForestRegressor(random_state=101),
    'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
    'GB': GradientBoostingRegressor(random_state=101),
    'ANN': RandomForestRegressor(random_state=101),
    # 'GRNN': RandomForestRegressor(random_state=101)
}
param_space = {
'Linear': {  
},
'Lasso': {
},
'Ridge': {  
},
'SVR': {
    "C": optuna.distributions.FloatDistribution(1e-5, 1e5),
    'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
    'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
},
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
'GB':{
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_split': optuna.distributions.IntDistribution(2, 11),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'identity', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['sgd', 'adam']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
'XGB': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 20),
    'max_leaves': optuna.distributions.IntDistribution(2, 50),
    'max_bin': optuna.distributions.IntDistribution(2, 50),
    'gamma': optuna.distributions.IntDistribution(1, 20),
},
'GPR':{
    'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
                                    WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
                                    0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
                                    50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
                                    1.0 * Matern(length_scale=1.0, nu=1.5),
                                    RBF() + ConstantKernel(constant_value=2)
                                                           ]),
    'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
}
}

In [76]:
df_train = []
list_od = []
list_nodes = []
for i in range(0, 10):
    small_train_files = f'Data/smallWorlds/Train/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Train/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Train/s/df_links_network_{i}.pkl")
    small_train_data = load_data_small([small_train_files], small_df_activities, small_df_links_network)
    df_train.append(small_train_data)
    with open(small_train_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
train_data = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    small_validate_files = f'Data/smallWorlds/Validate/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_links_network_{i}.pkl")
    small_validate_data = load_data_small([small_validate_files], small_df_activities, small_df_links_network)
    df_validate.append(small_validate_data)
    with open(small_validate_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
validate_data = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    small_test_files = f'Data/smallWorlds/Test/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Test/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Test/s/df_links_network_{i}.pkl")
    small_test_data = load_data_small([small_test_files], small_df_activities, small_df_links_network)
    df_test.append(small_test_data)
    with open(small_test_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
test_data = pd.concat(df_test, ignore_index=True)

train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)


indices = Big_data.index[Big_data['link_id'] == 0].tolist()
indices.append(len(Big_data))
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

In [77]:
cluster = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64'})

In [78]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

In [79]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

In [80]:
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 22)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)]
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf]
    print(model_name, opt.best_score_, accuracy_score(y_te_clf, y_pred_clf))

[I 2024-03-14 20:50:16,004] A new study created in memory with name: no-name-160f51de-877b-4147-a124-25567209294e
[I 2024-03-14 20:50:16,427] Trial 0 finished with value: 0.9589403973509933 and parameters: {'selector__k': 17, 'model__n_neighbors': 43, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:16,548] Trial 1 finished with value: 0.9509933774834437 and parameters: {'selector__k': 4, 'model__n_neighbors': 10, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:16,719] Trial 2 finished with value: 0.9589403973509933 and parameters: {'selector__k': 8, 'model__n_neighbors': 15, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:16,969] Trial 3 finished with value: 0.9589403973509933 and parameters: {'selector__k': 17, 'model__n_neighbors': 35, 'model__wei

[I 2024-03-14 20:50:22,156] Trial 33 finished with value: 0.9589403973509933 and parameters: {'selector__k': 5, 'model__n_neighbors': 12, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 31 with value: 0.9635761589403974.
[I 2024-03-14 20:50:22,399] Trial 34 finished with value: 0.9589403973509933 and parameters: {'selector__k': 7, 'model__n_neighbors': 25, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 31 with value: 0.9635761589403974.
[I 2024-03-14 20:50:22,523] Trial 35 finished with value: 0.9496688741721855 and parameters: {'selector__k': 4, 'model__n_neighbors': 7, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 31 with value: 0.9635761589403974.
[I 2024-03-14 20:50:22,664] Trial 36 finished with value: 0.9589403973509933 and parameters: {'selector__k': 8, 'model__n_neighbors': 18, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 31 with value: 0.9635761589403974.
[I 2024-03-14 20:50

KNN 0.9635761589403974 0.9648074369189907


[I 2024-03-14 20:50:25,459] Trial 0 finished with value: 0.9490066225165563 and parameters: {'selector__k': 18, 'model__learning_rate': 0.42350751156206606, 'model__n_estimators': 233, 'model__max_depth': 15, 'model__num_leaves': 39, 'model__min_child_samples': 1, 'model__subsample': 0.207342751440155, 'model__colsample_bytree': 0.18514999813597915}. Best is trial 0 with value: 0.9490066225165563.
[I 2024-03-14 20:50:25,664] Trial 1 finished with value: 0.9516556291390729 and parameters: {'selector__k': 5, 'model__learning_rate': 0.7984265723473858, 'model__n_estimators': 59, 'model__max_depth': 11, 'model__num_leaves': 48, 'model__min_child_samples': 17, 'model__subsample': 0.17928528299750757, 'model__colsample_bytree': 0.7837425776607473}. Best is trial 1 with value: 0.9516556291390729.
[I 2024-03-14 20:50:25,842] Trial 2 finished with value: 0.9503311258278145 and parameters: {'selector__k': 9, 'model__learning_rate': 0.5113527890201344, 'model__n_estimators': 57, 'model__max_depth

[I 2024-03-14 20:50:31,626] Trial 21 finished with value: 0.9622516556291391 and parameters: {'selector__k': 3, 'model__learning_rate': 0.16250620929016868, 'model__n_estimators': 130, 'model__max_depth': 31, 'model__num_leaves': 33, 'model__min_child_samples': 5, 'model__subsample': 0.7997786777164607, 'model__colsample_bytree': 0.3178958431549861}. Best is trial 21 with value: 0.9622516556291391.
[I 2024-03-14 20:50:31,877] Trial 22 finished with value: 0.9615894039735099 and parameters: {'selector__k': 4, 'model__learning_rate': 0.16558842975473526, 'model__n_estimators': 77, 'model__max_depth': 39, 'model__num_leaves': 31, 'model__min_child_samples': 6, 'model__subsample': 0.897492086002224, 'model__colsample_bytree': 0.34633211307028494}. Best is trial 21 with value: 0.9622516556291391.
[I 2024-03-14 20:50:32,085] Trial 23 finished with value: 0.9609271523178808 and parameters: {'selector__k': 4, 'model__learning_rate': 0.1616376055971257, 'model__n_estimators': 75, 'model__max_de

[I 2024-03-14 20:50:38,121] Trial 42 finished with value: 0.9622516556291391 and parameters: {'selector__k': 3, 'model__learning_rate': 0.38843637180372315, 'model__n_estimators': 133, 'model__max_depth': 43, 'model__num_leaves': 40, 'model__min_child_samples': 4, 'model__subsample': 0.9324705262030833, 'model__colsample_bytree': 0.5899833882723293}. Best is trial 21 with value: 0.9622516556291391.
[I 2024-03-14 20:50:38,349] Trial 43 finished with value: 0.9602649006622517 and parameters: {'selector__k': 5, 'model__learning_rate': 0.5213893501352639, 'model__n_estimators': 144, 'model__max_depth': 2, 'model__num_leaves': 28, 'model__min_child_samples': 5, 'model__subsample': 0.8190166315415471, 'model__colsample_bytree': 0.7874408426666534}. Best is trial 21 with value: 0.9622516556291391.
[I 2024-03-14 20:50:38,675] Trial 44 finished with value: 0.9622516556291391 and parameters: {'selector__k': 3, 'model__learning_rate': 0.36775591154474463, 'model__n_estimators': 109, 'model__max_d

LGBM 0.9622516556291391 0.9594953519256308


[I 2024-03-14 20:50:43,116] Trial 0 finished with value: 0.9589403973509933 and parameters: {'selector__k': 13, 'model__max_features': 'sqrt', 'model__n_estimators': 118, 'model__max_depth': 137, 'model__min_samples_leaf': 9}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:47,914] Trial 1 finished with value: 0.9589403973509933 and parameters: {'selector__k': 16, 'model__max_features': 'log2', 'model__n_estimators': 309, 'model__max_depth': 136, 'model__min_samples_leaf': 18}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:50,109] Trial 2 finished with value: 0.9589403973509933 and parameters: {'selector__k': 6, 'model__max_features': 'log2', 'model__n_estimators': 322, 'model__max_depth': 6, 'model__min_samples_leaf': 18}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-03-14 20:50:52,012] Trial 3 finished with value: 0.9589403973509933 and parameters: {'selector__k': 21, 'model__max_features': 'log2', 'model__n_estimators': 132, 'mode

[I 2024-03-14 20:51:28,456] Trial 30 finished with value: 0.9615894039735099 and parameters: {'selector__k': 14, 'model__max_features': 'log2', 'model__n_estimators': 58, 'model__max_depth': 82, 'model__min_samples_leaf': 5}. Best is trial 21 with value: 0.9649006622516556.
[I 2024-03-14 20:51:29,218] Trial 31 finished with value: 0.9655629139072848 and parameters: {'selector__k': 11, 'model__max_features': 'log2', 'model__n_estimators': 50, 'model__max_depth': 141, 'model__min_samples_leaf': 7}. Best is trial 31 with value: 0.9655629139072848.
[I 2024-03-14 20:51:30,040] Trial 32 finished with value: 0.9635761589403974 and parameters: {'selector__k': 12, 'model__max_features': 'log2', 'model__n_estimators': 54, 'model__max_depth': 134, 'model__min_samples_leaf': 8}. Best is trial 31 with value: 0.9655629139072848.
[I 2024-03-14 20:51:30,826] Trial 33 finished with value: 0.9622516556291391 and parameters: {'selector__k': 10, 'model__max_features': 'log2', 'model__n_estimators': 51, 'm

RF 0.9655629139072848 0.9594953519256308


In [81]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [82]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [50]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 22)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me)
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-14 15:17:56,595] A new study created in memory with name: no-name-700d490c-7a28-4418-a990-29f22c6c5adf
[I 2024-03-14 15:17:56,678] Trial 0 finished with value: -1.208931659780151 and parameters: {'selector__k': 18}. Best is trial 0 with value: -1.208931659780151.
[I 2024-03-14 15:17:56,736] Trial 1 finished with value: -1.2059053904883545 and parameters: {'selector__k': 4}. Best is trial 1 with value: -1.2059053904883545.
[I 2024-03-14 15:17:56,791] Trial 2 finished with value: -1.2080616313076031 and parameters: {'selector__k': 9}. Best is trial 1 with value: -1.2059053904883545.
[I 2024-03-14 15:17:56,836] Trial 3 finished with value: -1.2063422611052002 and parameters: {'selector__k': 5}. Best is trial 1 with value: -1.2059053904883545.
[I 2024-03-14 15:17:56,939] Trial 4 finished with value: -1.208931659780151 and parameters: {'selector__k': 19}. Best is trial 1 with value: -1.2059053904883545.
[I 2024-03-14 15:17:56,992] Trial 5 finished with value: -1.2073874551611805 

[I 2024-03-14 15:17:59,779] A new study created in memory with name: no-name-cf0c03c0-6635-4897-9dec-a5ab3d5a594e


Linear -1.2059053904883545 1.2394810264318108 2.4919038211321483 6.88448261893557


[I 2024-03-14 15:17:59,988] Trial 0 finished with value: -1.2072580910532877 and parameters: {'selector__k': 12}. Best is trial 0 with value: -1.2072580910532877.
[I 2024-03-14 15:18:00,193] Trial 1 finished with value: -1.2067265310611157 and parameters: {'selector__k': 3}. Best is trial 1 with value: -1.2067265310611157.
[I 2024-03-14 15:18:00,403] Trial 2 finished with value: -1.2067656957716013 and parameters: {'selector__k': 14}. Best is trial 1 with value: -1.2067265310611157.
[I 2024-03-14 15:18:00,617] Trial 3 finished with value: -1.2083964470058306 and parameters: {'selector__k': 21}. Best is trial 1 with value: -1.2067265310611157.
[I 2024-03-14 15:18:00,852] Trial 4 finished with value: -1.2072580910532877 and parameters: {'selector__k': 12}. Best is trial 1 with value: -1.2067265310611157.
[I 2024-03-14 15:18:01,068] Trial 5 finished with value: -1.2072580910532877 and parameters: {'selector__k': 12}. Best is trial 1 with value: -1.2067265310611157.
[I 2024-03-14 15:18:01,

[I 2024-03-14 15:18:10,333] Trial 0 finished with value: -1.2090036118562535 and parameters: {'selector__k': 10}. Best is trial 0 with value: -1.2090036118562535.


Lasso -1.2055266709742498 1.239192969494044 2.4909007695974394 6.887163549040238


[I 2024-03-14 15:18:10,491] Trial 1 finished with value: -1.2089196159335656 and parameters: {'selector__k': 22}. Best is trial 1 with value: -1.2089196159335656.
[I 2024-03-14 15:18:10,603] Trial 2 finished with value: -1.2089196159308557 and parameters: {'selector__k': 20}. Best is trial 2 with value: -1.2089196159308557.
[I 2024-03-14 15:18:10,683] Trial 3 finished with value: -1.2080286187209943 and parameters: {'selector__k': 8}. Best is trial 3 with value: -1.2080286187209943.
[I 2024-03-14 15:18:10,762] Trial 4 finished with value: -1.207161831626635 and parameters: {'selector__k': 15}. Best is trial 4 with value: -1.207161831626635.
[I 2024-03-14 15:18:10,833] Trial 5 finished with value: -1.2084381909167707 and parameters: {'selector__k': 12}. Best is trial 4 with value: -1.207161831626635.
[I 2024-03-14 15:18:10,891] Trial 6 finished with value: -1.2063141419174108 and parameters: {'selector__k': 5}. Best is trial 6 with value: -1.2063141419174108.
[I 2024-03-14 15:18:10,945]

Ridge -1.2058718213482411 1.2394865693754724 2.4919004833319094 6.884834459430021


In [51]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_boreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, 21)
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_boreg[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-14 15:20:21,455] A new study created in memory with name: no-name-4340ae72-b810-4bb1-8dc1-b0b74c886c58
[I 2024-03-14 15:20:21,514] Trial 0 finished with value: -1.2211466243909481 and parameters: {'selector__k': 5}. Best is trial 0 with value: -1.2211466243909481.
[I 2024-03-14 15:20:21,574] Trial 1 finished with value: -1.2216305976336708 and parameters: {'selector__k': 7}. Best is trial 0 with value: -1.2211466243909481.
[I 2024-03-14 15:20:21,631] Trial 2 finished with value: -1.225056678765997 and parameters: {'selector__k': 17}. Best is trial 0 with value: -1.2211466243909481.
[I 2024-03-14 15:20:21,795] Trial 3 finished with value: -1.225056678765997 and parameters: {'selector__k': 18}. Best is trial 0 with value: -1.2211466243909481.
[I 2024-03-14 15:20:21,885] Trial 4 finished with value: -1.2253399443655724 and parameters: {'selector__k': 15}. Best is trial 0 with value: -1.2211466243909481.
[I 2024-03-14 15:20:21,940] Trial 5 finished with value: -1.222761381008133

[I 2024-03-14 15:20:24,735] A new study created in memory with name: no-name-cf23d8d7-82a6-4b34-b4b8-f8b6c7d02a7d
[I 2024-03-14 15:20:24,914] Trial 0 finished with value: -1.2211374424767913 and parameters: {'selector__k': 4}. Best is trial 0 with value: -1.2211374424767913.


Linear -1.2211466243909481 1.2285862853852223 2.4760624207015574 6.8862533859797646


[I 2024-03-14 15:20:25,142] Trial 1 finished with value: -1.2242188664539926 and parameters: {'selector__k': 18}. Best is trial 0 with value: -1.2211374424767913.
[I 2024-03-14 15:20:25,336] Trial 2 finished with value: -1.2210753621744292 and parameters: {'selector__k': 6}. Best is trial 2 with value: -1.2210753621744292.
[I 2024-03-14 15:20:25,523] Trial 3 finished with value: -1.22107536195206 and parameters: {'selector__k': 8}. Best is trial 3 with value: -1.22107536195206.
[I 2024-03-14 15:20:25,748] Trial 4 finished with value: -1.2242188664539926 and parameters: {'selector__k': 18}. Best is trial 3 with value: -1.22107536195206.
[I 2024-03-14 15:20:25,954] Trial 5 finished with value: -1.22107536195206 and parameters: {'selector__k': 7}. Best is trial 3 with value: -1.22107536195206.
[I 2024-03-14 15:20:26,205] Trial 6 finished with value: -1.2242188664539926 and parameters: {'selector__k': 18}. Best is trial 3 with value: -1.22107536195206.
[I 2024-03-14 15:20:26,420] Trial 7 f

[I 2024-03-14 15:20:35,823] Trial 1 finished with value: -1.2322464004967688 and parameters: {'selector__k': 2}. Best is trial 0 with value: -1.2232519537679265.
[I 2024-03-14 15:20:35,873] Trial 2 finished with value: -1.2322464004967688 and parameters: {'selector__k': 2}. Best is trial 0 with value: -1.2232519537679265.


Lasso -1.2208784552858187 1.2282113873268732 2.4739762826986187 6.886906251309284


[I 2024-03-14 15:20:35,926] Trial 3 finished with value: -1.2216078573443627 and parameters: {'selector__k': 7}. Best is trial 3 with value: -1.2216078573443627.
[I 2024-03-14 15:20:35,975] Trial 4 finished with value: -1.2322464004967688 and parameters: {'selector__k': 2}. Best is trial 3 with value: -1.2216078573443627.
[I 2024-03-14 15:20:36,032] Trial 5 finished with value: -1.2227410934695282 and parameters: {'selector__k': 10}. Best is trial 3 with value: -1.2216078573443627.
[I 2024-03-14 15:20:36,082] Trial 6 finished with value: -1.2214513654192312 and parameters: {'selector__k': 4}. Best is trial 6 with value: -1.2214513654192312.
[I 2024-03-14 15:20:36,240] Trial 7 finished with value: -1.2250525915954733 and parameters: {'selector__k': 20}. Best is trial 6 with value: -1.2214513654192312.
[I 2024-03-14 15:20:36,375] Trial 8 finished with value: -1.2250525915946282 and parameters: {'selector__k': 18}. Best is trial 6 with value: -1.2214513654192312.
[I 2024-03-14 15:20:36,45

Ridge -1.221124502311403 1.2285911750752267 2.4760067882063175 6.8866158781182385


In [52]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error'
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt, mae, mse, me]
    print(model_name, opt.best_score_, mae, mse, me)

[I 2024-03-14 15:22:24,642] A new study created in memory with name: no-name-7e7c8759-4b1e-41c9-9625-bd19c46662b0
[I 2024-03-14 15:22:24,742] Trial 0 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:24,824] Trial 1 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:24,906] Trial 2 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:25,043] Trial 3 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:25,112] Trial 4 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:25,229] Trial 5 finished with value: -1.225056678765997 and parameters: {}. Best is trial 0 with value: -1.225056678765997.
[I 2024-03-14 15:22:25

Linear -1.225056678765997 1.233188385703093 2.4956422538932106 6.907610890780822


[I 2024-03-14 15:22:29,862] Trial 0 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:30,061] Trial 1 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:30,274] Trial 2 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:30,473] Trial 3 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:30,687] Trial 4 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:30,879] Trial 5 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value: -1.2242188664539926.
[I 2024-03-14 15:22:31,072] Trial 6 finished with value: -1.2242188664539926 and parameters: {}. Best is trial 0 with value:

Lasso -1.2242188664539926 1.231282751656711 2.488265392265941 6.905616797708196


[I 2024-03-14 15:22:40,531] Trial 1 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:40,637] Trial 2 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:40,711] Trial 3 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:40,850] Trial 4 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:40,955] Trial 5 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:41,057] Trial 6 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value: -1.2250525915957675.
[I 2024-03-14 15:22:41,181] Trial 7 finished with value: -1.2250525915957675 and parameters: {}. Best is trial 0 with value:

Ridge -1.2250525915957675 1.2331567173115723 2.4954676246116927 6.907660494386366


In [86]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F
class GATNet(torch.nn.Module):
    def __init__(self, num_features, num_classes,
                hid, in_head, out_head, dor, extra_layer):
        super(GATNet, self).__init__()
        self.hid = hid
        self.in_head = in_head
        self.out_head = out_head
        self.dor = dor
        self.extra_layer = extra_layer
        self.gat1 = GATConv(num_features, self.hid, heads=self.in_head, dropout=self.dor)
        if self.extra_layer:
            self.gat2 = GATConv(self.hid*self.in_head, self.hid, heads=self.in_head, dropout=self.dor)
            self.gat3 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)
        else:
            self.gat2 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dor, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dor, training=self.training)
        if self.extra_layer:
            x = F.elu(self.gat2(x, edge_index))  # Add non-linearity after the second layer
            x = F.dropout(x, p=self.dor, training=self.training)
            x = self.gat3(x, edge_index) 
        else:
            x = self.gat2(x, edge_index)
        return x

In [87]:
all_features = list(temp_2.columns)
nodes_features = ['remainder__link_from', 'remainder__link_to']
drop_featrues = ['remainder__dataset', 'remainder__link_counts', 'used_link']
temp_features = list(set(all_features) - set(nodes_features))
other_features = list(set(temp_features) - set(drop_featrues))

In [88]:
import optuna
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

best_k = None
best_performance = float('inf')
performance_history = []

def objective(trial):
    # Hyperparameters to tune
    k = trial.suggest_int('k', 2, len(other_features))
    hid = trial.suggest_categorical('hid', [16, 32, 64, 128, 256, 512])
    in_head = trial.suggest_categorical('in_head', [1, 2, 4, 8, 16, 32])
    out_head = trial.suggest_categorical('out_head', [1, 2])
    dor = trial.suggest_categorical('dor', [0, 0.05, 0.1])
    extra_layer = trial.suggest_categorical('extra_layer', [False])
    
    # Create a tensor of your labels/targets
    y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    
    # Feature selection for the current k
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(temp_2[other_features], y)
    selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])
    
    edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
    x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, y=y)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_data = data.to(device)
    model = GATNet(k, 1, hid=hid, in_head=in_head, out_head=out_head, dor=dor, extra_layer=extra_layer).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion_MAE = torch.nn.L1Loss()
    def train():
        model.train()
        optimizer.zero_grad()
        out = model(train_data.x, train_data.edge_index)
        loss = criterion_MAE(out, train_data.y)
        loss.backward()
        optimizer.step()
        return loss
    for epoch in range(50):
        loss = train()

    return loss.item()

#     # Store the performance for each k
#     performance_history.append((k, test_loss))

#     # Update the best k if the current performance is better
#     if performance < best_performance:
#         best_performance = performance
#         best_k = k
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2024-03-14 21:29:24,592] A new study created in memory with name: no-name-4c2c06cb-a29b-47ba-b3c5-c204f88e62da
[I 2024-03-14 21:30:39,110] Trial 0 finished with value: 1.6713389158248901 and parameters: {'k': 15, 'hid': 512, 'in_head': 16, 'out_head': 1, 'dor': 0.05, 'extra_layer': False}. Best is trial 0 with value: 1.6713389158248901.
[I 2024-03-14 21:31:20,443] Trial 1 finished with value: 1.6148873567581177 and parameters: {'k': 18, 'hid': 128, 'in_head': 32, 'out_head': 1, 'dor': 0.05, 'extra_layer': False}. Best is trial 1 with value: 1.6148873567581177.
[I 2024-03-14 21:33:33,047] Trial 2 finished with value: 1.6569758653640747 and parameters: {'k': 18, 'hid': 512, 'in_head': 32, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 1 with value: 1.6148873567581177.
[I 2024-03-14 21:33:37,786] Trial 3 finished with value: 2.6431338787078857 and parameters: {'k': 16, 'hid': 256, 'in_head': 1, 'out_head': 1, 'dor': 0.05, 'extra_layer': False}. Best is trial 1 with value

[I 2024-03-14 21:45:33,935] Trial 36 finished with value: 1.1843080520629883 and parameters: {'k': 7, 'hid': 128, 'in_head': 32, 'out_head': 2, 'dor': 0, 'extra_layer': False}. Best is trial 31 with value: 1.182878017425537.
[I 2024-03-14 21:48:24,548] Trial 37 finished with value: 1.3278802633285522 and parameters: {'k': 9, 'hid': 512, 'in_head': 32, 'out_head': 2, 'dor': 0.05, 'extra_layer': False}. Best is trial 31 with value: 1.182878017425537.
[I 2024-03-14 21:49:05,605] Trial 38 finished with value: 1.1861892938613892 and parameters: {'k': 6, 'hid': 128, 'in_head': 32, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 31 with value: 1.182878017425537.
[I 2024-03-14 21:49:14,107] Trial 39 finished with value: 1.3687376976013184 and parameters: {'k': 9, 'hid': 16, 'in_head': 32, 'out_head': 2, 'dor': 0.05, 'extra_layer': False}. Best is trial 31 with value: 1.182878017425537.
[I 2024-03-14 21:49:17,417] Trial 40 finished with value: 1.1856321096420288 and parameters: {'

In [89]:
best_params = study.best_params
best_k = best_params['k']
best_hid = best_params['hid']
best_in_head = best_params['in_head']
best_out_head = best_params['out_head']
best_dor = best_params['dor']
best_extra_layer = best_params['extra_layer']

In [90]:
# Feature selection for the current k
selector = SelectKBest(score_func=f_regression, k=best_k)
y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
X_new = selector.fit_transform(temp_2[other_features], y)
selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])

edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.to(device)

best_model = GATNet(best_k, 1, hid=best_hid, in_head=best_in_head,
                    out_head=best_out_head, dor=best_dor, extra_layer=best_extra_layer).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion_MAE = torch.nn.L1Loss()
def train():
    best_model.train()
    optimizer.zero_grad()
    out = best_model(train_data.x, train_data.edge_index)
    loss = criterion_MAE(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss
for epoch in range(250):
    loss = train()

test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
test_x = torch.tensor(X_te[selected_columns].values, dtype=torch.float)
test_y = torch.tensor(y_te.values, dtype=torch.float).unsqueeze(1)
test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)
test_data = test_data.to(device)

criterion_MSE = torch.nn.MSELoss()
def test(test_data):
    best_model.eval()
    with torch.no_grad():
        pred = best_model(test_data.x, test_data.edge_index)
        y_pred_all = np.concatenate([np.array(pd.DataFrame(pred).astype("float")[0]), np.array(X_te_0['y_pred'])])
        loss_MAE = criterion_MAE(pred, test_data.y)
        loss_MSE = criterion_MSE(pred, test_data.y)
    return loss_MAE.item(), loss_MSE.item()

test_loss = test(test_data)
test_loss

(1.2467106580734253, 2.549755096435547)