In [5]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

## data loader

In [6]:
def load_data_small(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)

            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']
        
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


## Define parameter space

In [7]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'length_per_capacity_ratio', 'speed_capacity_ratio', 'length_times_lanes', 'speed_times_capacity', 
                      'length_times', 'capacity_divided_by_lanes'
                     ]
scaler = StandardScaler()
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
#     'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101)
}

model_space = {
    'KNN': KNeighborsRegressor(),
#     'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
#     'GPR': GaussianProcessRegressor(copy_X_train=False, random_state=101),
#     'SVR': SVR(),
    'Linear': LinearRegression(),
    'Lasso': LassoCV(random_state=42, max_iter=100000),
    'Ridge': RidgeCV(),
}
# model_space_feature = {
#     'SVR': RandomForestRegressor(random_state=101),
#     'KNN': RandomForestRegressor(random_state=101),
#     'XGB': xgb.XGBRegressor(random_state=101),
#     'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
#     'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
#     'ANN': RandomForestRegressor(random_state=101),
#     'GRNN': RandomForestRegressor(random_state=101)
# }
param_space = {
'Linear': {  
},
'Lasso': {
},
'Ridge': {  
},
# 'SVR': {
#     "C": optuna.distributions.FloatDistribution(1e-5, 1e5),
#     'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
#     'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
    # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
# },
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
# 'GB':{
#     'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
#     'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
#     'max_depth': optuna.distributions.IntDistribution(1, 200),
#     'min_samples_split': optuna.distributions.IntDistribution(2, 11),
#     'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
#     'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
# },
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['adam', 'lbfgs']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
# 'XGB': {
#     'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
#     'n_estimators': optuna.distributions.IntDistribution(50, 501, 50),
#     'max_depth': optuna.distributions.IntDistribution(1, 20),
#     'max_leaves': optuna.distributions.IntDistribution(2, 50),
#     'max_bin': optuna.distributions.IntDistribution(2, 50),
#     'gamma': optuna.distributions.IntDistribution(1, 20),
# },
# 'GPR':{
#     'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
#                                     WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
#                                     0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
#                                     50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
#                                     1.0 * Matern(length_scale=1.0, nu=1.5),
#                                     RBF() + ConstantKernel(constant_value=2)
#                                                            ]),
#     'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
# }
}

## Load data and create feature from shortest path

In [8]:
df_train = []
list_od = []
list_nodes = []
for i in range(0, 10):
    small_train_files = f'Data/smallWorlds/Train/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Train/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Train/s/df_links_network_{i}.pkl")
    small_train_data = load_data_small([small_train_files], small_df_activities, small_df_links_network)
    df_train.append(small_train_data)
    with open(small_train_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
train_data = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    small_validate_files = f'Data/smallWorlds/Validate/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_links_network_{i}.pkl")
    small_validate_data = load_data_small([small_validate_files], small_df_activities, small_df_links_network)
    df_validate.append(small_validate_data)
    with open(small_validate_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
validate_data = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    small_test_files = f'Data/smallWorlds/Test/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Test/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Test/s/df_links_network_{i}.pkl")
    small_test_data = load_data_small([small_test_files], small_df_activities, small_df_links_network)
    df_test.append(small_test_data)
    with open(small_test_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
test_data = pd.concat(df_test, ignore_index=True)

train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)


indices = Big_data.index[Big_data['link_id'] == 0].tolist()
indices.append(len(Big_data))
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

## Create feature from clustering

In [9]:
cluster = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=100, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64'})

## Dataset numerification and standardization

In [10]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

## Classification task

In [11]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']
best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t_clf.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        random_state=101
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf, accuracy_score(y_te_clf, y_pred_clf)]

[I 2024-05-24 02:00:08,181] A new study created in memory with name: no-name-9037240d-bfd9-46c9-9861-c7761564fc75
[I 2024-05-24 02:00:08,288] Trial 0 finished with value: 0.9589403973509933 and parameters: {'selector__k': 10, 'model__n_neighbors': 20, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-05-24 02:00:08,381] Trial 1 finished with value: 0.9589403973509933 and parameters: {'selector__k': 9, 'model__n_neighbors': 19, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-05-24 02:00:08,539] Trial 2 finished with value: 0.9589403973509933 and parameters: {'selector__k': 12, 'model__n_neighbors': 34, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 0 with value: 0.9589403973509933.
[I 2024-05-24 02:00:08,707] Trial 3 finished with value: 0.9582781456953643 and parameters: {'selector__k': 3, 'model__n_neighbors': 21, 'model_

In [12]:
for i in best_model_clf.keys():
    print(i, best_model_clf[i][1],best_model_clf[i][3])

KNN 0.9642384105960264 0.9621513944223108
LGBM 0.9622516556291391 0.9594953519256308
RF 0.9642384105960264 0.9568393094289509


### Check the best result and put the predict results into test dataset

In [13]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]

In [19]:
import pickle
with open('CLF_small.pickle', 'wb') as handle:
    pickle.dump(temp_tr, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Group the used link from training and validation dataset and later group the predicted used link in test dataset

In [14]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [15]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt.best_score_, mae, mse, me, y_te_all, y_pred_all)

[I 2024-05-24 02:03:32,074] A new study created in memory with name: no-name-e39743d2-23ad-4ddb-a675-972ec3161f67
[I 2024-05-24 02:03:32,153] Trial 0 finished with value: -1.2842949189131139 and parameters: {'selector__k': 7, 'model__n_neighbors': 7, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 0 with value: -1.2842949189131139.
[I 2024-05-24 02:03:32,253] Trial 1 finished with value: -1.2193476413089674 and parameters: {'selector__k': 7, 'model__n_neighbors': 26, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 1 with value: -1.2193476413089674.
[I 2024-05-24 02:03:32,319] Trial 2 finished with value: -1.222530724997181 and parameters: {'selector__k': 2, 'model__n_neighbors': 49, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -1.2193476413089674.
[I 2024-05-24 02:03:32,423] Trial 3 finished with value: -1.2206203959484345 and parameters: {'selector__k': 6, 'model__n_neighbors': 24, 'model__we

Collecting torch
  Using cached torch-2.3.0-cp38-cp38-win_amd64.whl.metadata (26 kB)
Collecting torchvision
  Using cached torchvision-0.18.0-cp38-cp38-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Using cached torchaudio-2.3.0-cp38-cp38-win_amd64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Using cached filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Using cached sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Using cached mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached tbb-2021.12.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Collecting mpmath>=0.19 (fro

[I 2024-05-24 02:04:33,249] Trial 5 finished with value: -1.301956106826687 and parameters: {'selector__k': 22, 'model__max_features': 'log2', 'model__n_estimators': 139, 'model__max_depth': 187, 'model__min_samples_leaf': 5}. Best is trial 0 with value: -1.2394267610716496.
[I 2024-05-24 02:04:34,321] Trial 6 finished with value: -1.262237638848736 and parameters: {'selector__k': 4, 'model__max_features': 'sqrt', 'model__n_estimators': 80, 'model__max_depth': 78, 'model__min_samples_leaf': 2}. Best is trial 0 with value: -1.2394267610716496.
[I 2024-05-24 02:04:43,790] Trial 7 finished with value: -1.2583750496016606 and parameters: {'selector__k': 7, 'model__max_features': 'log2', 'model__n_estimators': 362, 'model__max_depth': 174, 'model__min_samples_leaf': 1}. Best is trial 0 with value: -1.2394267610716496.
[I 2024-05-24 02:04:44,019] Trial 8 finished with value: -1.2315753910184564 and parameters: {'selector__k': 2, 'model__max_features': 'log2', 'model__n_estimators': 63, 'mode

## Feature Selection Regression task

In [16]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

best_model_fsreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t_onlyreg.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_fsreg[model_name] = [opt.best_score_, mae, mse, me, y_te_onlyreg, y_pred]


[I 2024-05-24 02:12:26,563] A new study created in memory with name: no-name-fbad98a2-821f-474d-a8a4-244394ceabd7
[I 2024-05-24 02:12:26,689] Trial 0 finished with value: -1.367682119205298 and parameters: {'selector__k': 11, 'model__n_neighbors': 45, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 0 with value: -1.367682119205298.
[I 2024-05-24 02:12:26,805] Trial 1 finished with value: -1.7193396478247505 and parameters: {'selector__k': 14, 'model__n_neighbors': 2, 'model__weights': 'distance', 'model__algorithm': 'kd_tree'}. Best is trial 0 with value: -1.367682119205298.
[I 2024-05-24 02:12:27,033] Trial 2 finished with value: -1.2372898526368012 and parameters: {'selector__k': 7, 'model__n_neighbors': 35, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 2 with value: -1.2372898526368012.
[I 2024-05-24 02:12:27,242] Trial 3 finished with value: -1.256196783349101 and parameters: {'selector__k': 9, 'model__n_neighbors': 14, 'model

## Regression task w/o Feature Selection 

In [60]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt.best_score_, mae, mse, me, y_te_onlyreg, y_pred]

[I 2024-05-24 11:16:03,060] A new study created in memory with name: no-name-3f0fa873-1d6f-49e3-b6d4-3c89164750a3
[I 2024-05-24 11:16:03,182] Trial 0 finished with value: -1.8427814569536423 and parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 0 with value: -1.8427814569536423.
[I 2024-05-24 11:16:03,299] Trial 1 finished with value: -1.7839127911532906 and parameters: {'n_neighbors': 38, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 1 with value: -1.7839127911532906.
[I 2024-05-24 11:16:03,407] Trial 2 finished with value: -1.7774284756531247 and parameters: {'n_neighbors': 31, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 2 with value: -1.7774284756531247.
[I 2024-05-24 11:16:03,570] Trial 3 finished with value: -1.7966887417218542 and parameters: {'n_neighbors': 39, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 2 with value: -1.7774284756531247.
[I 2024-05-24 11:16:03,654] Trial 4 finished with value:

In [None]:
# pip install torch==2.0.1
# pip install torch_geometric

## GNN

In [4]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F
class GATNet(torch.nn.Module):
    def __init__(self, num_features, num_classes,
                hid, in_head, out_head, dor, extra_layer):
        super(GATNet, self).__init__()
        self.hid = hid
        self.in_head = in_head
        self.out_head = out_head
        self.dor = dor
        self.extra_layer = extra_layer
        self.gat1 = GATConv(num_features, self.hid, heads=self.in_head, dropout=self.dor)
        if self.extra_layer:
            self.gat2 = GATConv(self.hid*self.in_head, self.hid, heads=self.in_head, dropout=self.dor)
            self.gat3 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)
        else:
            self.gat2 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dor, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dor, training=self.training)
        if self.extra_layer:
            x = F.elu(self.gat2(x, edge_index))  # Add non-linearity after the second layer
            x = F.dropout(x, p=self.dor, training=self.training)
            x = self.gat3(x, edge_index) 
        else:
            x = self.gat2(x, edge_index)
        return x

In [39]:
# all_features = list(temp_2.columns) #CLF
all_features = list(temp.columns) #REST
nodes_features = ['remainder__link_from', 'remainder__link_to']
drop_featrues = ['remainder__dataset', 'remainder__link_counts', 'used_link']
temp_features = list(set(all_features) - set(nodes_features))
other_features = list(set(temp_features) - set(drop_featrues))

In [75]:
import random
def seed_everything(seed: int) -> None:
    r"""Sets the seed for generating random numbers in :pytorch:`PyTorch`,
    :obj:`numpy` and :python:`Python`.

    Args:
        seed (int): The desired seed.
    """
    random.seed(101)
    np.random.seed(101)
    torch.manual_seed(101)
    torch.cuda.manual_seed_all(101)

Exception in thread Control:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\site-packages\ipykernel\control.py", line 25, in run
    self.io_loop.close()
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\site-packages\tornado\platform\asyncio.py", line 339, in close
    super().close(all_fds=all_fds)
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\site-packages\tornado\platform\asyncio.py", line 152, in close
    self.selector_loop.close()
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\site-packages\tornado\platform\asyncio.py", line 702, in close
    self._real_loop.close()
  File "C:\Users\user\AppData\Local\Programs\Python\Python38\lib\asyncio\proactor_events.py", line 679, in close
    signal.set_wakeup_fd(-1)
ValueError: set_wakeup_fd only works in main thr

In [51]:
from torch_geometric.data import Data

best_k = None
best_performance = float('inf')
performance_history = []

def objective(trial):
    # Hyperparameters to tune
    # k = trial.suggest_int('k', 2, len(other_features))
    k = trial.suggest_int('k', len(other_features), len(other_features)) #REG
    hid = trial.suggest_categorical('hid', [16, 32, 64, 128, 256, 512])
    in_head = trial.suggest_categorical('in_head', [1, 2, 4, 8, 16, 32])
    out_head = trial.suggest_categorical('out_head', [1, 2])
    dor = trial.suggest_categorical('dor', [0, 0.05, 0.1])
    extra_layer = trial.suggest_categorical('extra_layer', [False])
    
    # Create a tensor of your labels/targets
    y = torch.tensor(temp['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    
    # Feature selection for the current k
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(temp[other_features], y)
    selected_columns = list(temp[other_features].columns[selector.get_support(indices=True)])
    
    edge_index = torch.tensor(temp[nodes_features].values.T, dtype=torch.long)
    x = torch.tensor(temp[selected_columns].values, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, y=y)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_data = data.to(device)
    model = GATNet(k, 1, hid=hid, in_head=in_head, out_head=out_head, dor=dor, extra_layer=extra_layer).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion_MAE = torch.nn.L1Loss()
    def train():
        model.train()
        optimizer.zero_grad()
        out = model(train_data.x, train_data.edge_index)
        loss = criterion_MAE(out, train_data.y)
        loss.backward()
        optimizer.step()
        return loss
    for epoch in range(50):
        loss = train()

    return loss.item()

#     # Store the performance for each k
#     performance_history.append((k, test_loss))

#     # Update the best k if the current performance is better
#     if performance < best_performance:
#         best_performance = performance
#         best_k = k
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2024-05-24 10:46:37,224] A new study created in memory with name: no-name-bf46e45b-a037-4643-82d1-41530bfe49b3
[I 2024-05-24 10:48:22,281] Trial 0 finished with value: 2.100295066833496 and parameters: {'k': 20, 'hid': 256, 'in_head': 32, 'out_head': 1, 'dor': 0.05, 'extra_layer': False}. Best is trial 0 with value: 2.100295066833496.
[I 2024-05-24 10:48:28,894] Trial 1 finished with value: 1.2251050472259521 and parameters: {'k': 20, 'hid': 256, 'in_head': 2, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 1 with value: 1.2251050472259521.
[I 2024-05-24 10:48:35,363] Trial 2 finished with value: 1.4662245512008667 and parameters: {'k': 20, 'hid': 64, 'in_head': 8, 'out_head': 1, 'dor': 0, 'extra_layer': False}. Best is trial 1 with value: 1.2251050472259521.
[I 2024-05-24 10:48:37,662] Trial 3 finished with value: 1.2343264818191528 and parameters: {'k': 20, 'hid': 32, 'in_head': 4, 'out_head': 2, 'dor': 0, 'extra_layer': False}. Best is trial 1 with value: 1.22510504

In [52]:
best_params = study.best_params
best_k = best_params['k']
best_hid = best_params['hid']
best_in_head = best_params['in_head']
best_out_head = best_params['out_head']
best_dor = best_params['dor']
best_extra_layer = best_params['extra_layer']

In [53]:
# Feature selection for the current k
selector = SelectKBest(score_func=f_regression, k=best_k)
y = torch.tensor(temp['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
X_new = selector.fit_transform(temp[other_features], y)
selected_columns = list(temp[other_features].columns[selector.get_support(indices=True)])

edge_index = torch.tensor(temp[nodes_features].values.T, dtype=torch.long)
x = torch.tensor(temp[selected_columns].values, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.to(device)

best_model = GATNet(best_k, 1, hid=best_hid, in_head=best_in_head,
                    out_head=best_out_head, dor=best_dor, extra_layer=best_extra_layer).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion_MAE = torch.nn.L1Loss()
def train():
    best_model.train()
    optimizer.zero_grad()
    out = best_model(train_data.x, train_data.edge_index)
    loss = criterion_MAE(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss
for epoch in range(250):
    loss = train()



In [54]:
test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
# test_x = torch.tensor(X_te[selected_columns].values, dtype=torch.float)#CLF
# test_y = torch.tensor(y_te.values, dtype=torch.float).unsqueeze(1)#CLF
test_x = torch.tensor(test_data_tr[selected_columns].values, dtype=torch.float)
test_y = torch.tensor(test_data_tr['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)
test_data = test_data.to(device)

best_model.eval()
with torch.no_grad():
    pred = best_model(test_data.x, test_data.edge_index)
    # y_pred_all = np.concatenate([np.array(pd.DataFrame(pred).astype("float")[0]), np.array(X_te_0['y_pred'])]) #CLF
    # loss_MAE = criterion_MAE(y_pred_all, test_data.y)
    # loss_MSE = criterion_MSE(y_pred_all, test_data.y)
    # error = torch.abs(y_pred_all - test_data.y)
    # max_error = torch.max(error)
    # mae = mean_absolute_error(y_te_all, y_pred_all)#CLF
    # mse = mean_squared_error(y_te_all, y_pred_all)#CLF
    # me = max_error(y_te_all, y_pred_all)#CLF
    y_pred = np.array(pd.DataFrame(pred).astype("float")[0])
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
gnn_result = [study.best_value, mae, mse, me, y_te_onlyreg, y_pred]

# gnn_result = [study.best_value, mae, mse, me, y_te_all, y_pred_all]#CLF


In [55]:
gnn_result

[1.1777287721633911,
 1.3011008252755827,
 2.894200211638598,
 7.261719226837158,
 4588    5
 4589    4
 4590    4
 4591    3
 4592    8
        ..
 6089    5
 6090    5
 6091    3
 6092    5
 6093    4
 Name: remainder__link_counts, Length: 1506, dtype: int64,
 array([3.48553514, 3.67521882, 3.66329193, ..., 2.68099642, 1.46715772,
        2.62913561])]

In [61]:
best_model_onlyreg_wofeatureselect['GNN']=gnn_result

In [33]:
def load_json(file_list):
    index_list = []
    n=test_data_tr.index[0]
    m=test_data_tr.index[0]
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
            })
        n += len(df_links)
        index_list.append(list(range(m, n)))
        m += len(df_links)
    return index_list

In [34]:
test_files = [f'Data/smallWorlds/Test/s/s-{i}.json'for i in range(15, 20)]
small_test_index = load_json(test_files)

In [35]:
def split_five_instance(original_result):
    split5test = {}
    for i in original_result.keys():
        test=pd.DataFrame({
            'true_y':original_result[i][4],
            'pred_y':original_result[i][5]
        })
        split5test[i]={'all':{
            'MAE': original_result[i][1],
            'MSE': original_result[i][2],
            'ME': original_result[i][3],
        }}
        for j in range(1, len(small_test_index)+1):
            split5test[i][f'instance_{j}']={}
            test_df = test.loc[small_test_index[j-1]]
            split5test[i][f'instance_{j}']['MAE'] = mean_absolute_error(test_df['true_y'], test_df['pred_y'])
            split5test[i][f'instance_{j}']['MSE'] = mean_squared_error(test_df['true_y'], test_df['pred_y'])
            split5test[i][f'instance_{j}']['ME'] = max_error(test_df['true_y'], test_df['pred_y'])
    return split5test

In [65]:
REG_small = split_five_instance(best_model_onlyreg_wofeatureselect)

In [36]:
CLFFSREG_small = split_five_instance(best_model_reg)

In [47]:
FSREG_small = split_five_instance(best_model_fsreg)

In [68]:
import pickle
with open('all_small.pickle', 'wb') as handle:
    pickle.dump(all_small, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [67]:
all_small={
    'CLF-FS-REG':CLFFSREG_small,
    'FS-REG':FSREG_small,
    'REG':REG_small
          }

In [69]:
df_small_result = pd.DataFrame.from_dict({(i, j, k): all_small[i][j][k]
                             for i in all_small.keys()
                             for j in all_small[i].keys()
                             for k in all_small[i][j].keys()},
                            orient='index')

In [74]:
# Reset the index to separate the keys into individual columns
df_small_result = df_small_result.reset_index().rename(columns={"level_0": "Approaches", "level_1": "Algorithms", "level_2": "Instances"})


# Create an Excel writer object
with pd.ExcelWriter('Model result_20240523.xlsx', engine='xlsxwriter') as writer:
    # Write the DataFrame to the Excel file
    df_small_result.to_excel(writer, sheet_name='Sheet2', index=False)
    
    # Get the workbook and worksheet objects
    workbook = writer.book
    worksheet = writer.sheets['Sheet2']
    
    # Apply formatting to the worksheet
    header_format = workbook.add_format({'bold': True, 'bg_color': '#FFD700'})
    worksheet.set_column('A:E', 15)
    worksheet.set_column('F:G', 10)
    worksheet.conditional_format('F2:G9', {'type': '3_color_scale'})
    
    # Write the header with the specified format
    for col_num, value in enumerate(df_small_result.columns.values):
        worksheet.write(0, col_num, value, header_format)

print("Excel file created successfully.")

Excel file created successfully.
