In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

## data loader

In [3]:
def load_data(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)
            
            # Calculate time of go_to_work and go_to_sum
            df_act_work = df_activities[df_activities['activity_type_main']=='work'].drop(['end_time'], axis=1)
            df_act_work = df_act_work.merge(df_work, how='left', left_on=['x','y'], right_on=['work_x','work_y'])
            df_act_work.drop(['x','y'], axis=1, inplace=True)
            df_act_work_agg = df_act_work.groupby(by="link")['go_to_work'].sum().reset_index(drop=False)
            df_act_home = df_activities[df_activities['activity_type_main']=='home'].drop(['end_time'], axis=1)
            df_act_home = df_act_home.merge(df_home, how='left', left_on=['x','y'], right_on=['home_x','home_y'])
            df_act_home.drop(['x','y'], axis=1, inplace=True)
            df_act_home_agg = df_act_home.groupby(by="link")['go_to_home'].sum().reset_index(drop=False)
            df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link')
            df_act_agg.fillna(0, inplace=True)
            df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

            df_rushhr = df_activities[df_activities['end_time']!=-1]
            df_rushhr.loc[:, 'rush_hour'] = 0
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
            df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
            
            df_maxduragg = df_activities[df_activities['max_dur']!=-1].groupby(by='link')['max_dur'].sum().reset_index(drop=False)
            
            df_activities['cemdapStopDuration_s'] = df_activities['cemdapStopDuration_s'].astype(float)
            df_cemagg = df_activities[df_activities['cemdapStopDuration_s']!=-1].groupby(by='link')['cemdapStopDuration_s'].sum().reset_index(drop=False)
            
            df_temp = df_links.merge(df_links_network, how='left', on=['start_node_x','start_node_y','end_node_x','end_node_y'])
            df_temp = df_temp[['link_id_x','link_from','link_to','link_id_y','from', 'to', 'type']]
            df_temp = df_temp.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_rushhragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_maxduragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_cemagg, how='left', left_on='link_id_y', right_on='link')
            df_temp.fillna({'cemdapStopDuration_s':-1, 'max_dur':-1, 'rush_hour': -1, 'go_to_sum': -1}, inplace=True)
            df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'type']]
            
            df_links = df_links.merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
            df_links.drop('link_id_x', axis=1, inplace=True)
            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


## Define parameter space

In [4]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes'
                     ]
category_feature = ['type']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
clf = {
    'KNN': KNeighborsClassifier(),
    # 'XGB': xgb.XGBClassifier(random_state=101),
    'LGBM': lgb.LGBMClassifier(random_state=101, verbose=-1),
    'RF': RandomForestClassifier(random_state=101),
#     'GB': GradientBoostingClassifier(random_state=101),
#     'ANN': MLPClassifier(random_state=101),
    # 'SVR': SVC(),
}

model_space = {
    'KNN': KNeighborsRegressor(),
#     'XGB': xgb.XGBRegressor(random_state=101),
    'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
    'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
    'ANN': MLPRegressor(random_state=101),
    # 'SVR': SVR(),
    'Linear': LinearRegression(),
    'Lasso': LassoCV(random_state=42, max_iter=100000),
    'Ridge': RidgeCV(),
}
# model_space_feature = {
#     'SVR': RandomForestRegressor(random_state=101),
#     'KNN': RandomForestRegressor(random_state=101),
#     'XGB': xgb.XGBRegressor(random_state=101),
#     'LGBM': lgb.LGBMRegressor(random_state=101, verbose=-1),
#     'RF': RandomForestRegressor(random_state=101),
#     'GB': GradientBoostingRegressor(random_state=101),
#     'ANN': RandomForestRegressor(random_state=101),
#     # 'GRNN': RandomForestRegressor(random_state=101)
# }
param_space = {
'Linear': {  
},
'Lasso': {
},
'Ridge': {  
},
# 'SVR': {
#     "C": optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
#     'gamma': optuna.distributions.CategoricalDistribution(['scale', 'auto']), 
#     'kernel': optuna.distributions.CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),  
#     # 'epsilon': optuna.distributions.FloatDistribution(0.01, 1),  
# },
'RF':  {
    'max_features': optuna.distributions.CategoricalDistribution(['sqrt', 'log2']),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 200),
    'min_samples_leaf': optuna.distributions.IntDistribution(1, 20),
    # 'criterion': Categorical(['absolute_error', 'friedman_mse'])
},
# 'GB':{
#     'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
#     'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
#     'max_depth': optuna.distributions.IntDistribution(1, 200),
#     'min_samples_split': optuna.distributions.IntDistribution(2, 11),
#     'min_samples_leaf': optuna.distributions.IntDistribution(1, 10),
#     'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
# },
'ANN': {
    'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(100,), (50,), (50, 50), (100, 100), (30, 30, 30)]),
    'activation': optuna.distributions.CategoricalDistribution(['tanh', 'relu', 'logistic']),
    'solver': optuna.distributions.CategoricalDistribution(['adam', 'lbfgs']),
    'alpha': optuna.distributions.FloatDistribution(1e-5, 1e5, log=True),
},
'KNN':{
    'n_neighbors': optuna.distributions.IntDistribution(1, 50),
    'weights': optuna.distributions.CategoricalDistribution(['uniform', 'distance']),
    'algorithm': optuna.distributions.CategoricalDistribution(['auto', 'ball_tree', 'kd_tree', 'brute'])
},    
'LGBM': {
    'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
    'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
    'max_depth': optuna.distributions.IntDistribution(1, 50),
    'num_leaves': optuna.distributions.IntDistribution(2, 50),
    'min_child_samples': optuna.distributions.IntDistribution(1, 20),
    'subsample': optuna.distributions.FloatDistribution(0.1, 1.0),
    'colsample_bytree': optuna.distributions.FloatDistribution(0.1, 1.0),
},
# 'XGB': {
#     'learning_rate': optuna.distributions.FloatDistribution(0.01, 1.0),
#     'n_estimators': optuna.distributions.IntDistribution(50, 3001, 50),
#     'max_depth': optuna.distributions.IntDistribution(1, 20),
#     'max_leaves': optuna.distributions.IntDistribution(2, 50),
#     'max_bin': optuna.distributions.IntDistribution(2, 50),
#     'gamma': optuna.distributions.IntDistribution(1, 20),
# },
# 'GPR':{
#     'kernel': optuna.distributions.CategoricalDistribution([0.1**2 * RBF(length_scale=0.1) + 
#                                     WhiteKernel(noise_level=0.1**2, noise_level_bounds=(1e-5, 1e5)), 
#                                     0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0),
#                                     50.0**2 * RBF(length_scale=50.0), DotProduct() + WhiteKernel(), 
#                                     1.0 * Matern(length_scale=1.0, nu=1.5),
#                                     RBF() + ConstantKernel(constant_value=2)
#                                                            ]),
#     'alpha':  optuna.distributions.FloatDistribution(1e-15, 1e10)
# }
}

## Load data and create feature from shortest path

In [5]:
df_train = []
list_od = []
list_nodes = []
for i in range(0, 10):
    train_files = f'Data/sparseWorlds/Train/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_links_network_{i}.pkl')
    train_data = load_data([train_files], df_activities, df_links_network)
    df_train.append(train_data)
    with open(train_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
train_data = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    validate_files = f'Data/sparseWorlds/Validate/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_links_network_{i}.pkl')
    validate_data = load_data([validate_files], df_activities, df_links_network)
    df_validate.append(validate_data)
    with open(validate_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
validate_data = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    test_files = f'Data/sparseWorlds/Test/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_links_network_{i}.pkl')
    test_data = load_data([test_files], df_activities, df_links_network)
    df_test.append(test_data)
    with open(test_files) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
test_data = pd.concat(df_test, ignore_index=True)

train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

indices = Big_data.index[Big_data['link_id'] == 0].tolist()
indices.append(len(Big_data))
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]
import networkx as nx

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data_new = pd.concat(dfs)

## Create feature from clustering

In [6]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['x_y_coor'] = cluster.fit_predict(Big_data_new[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['similar_link'] = cluster1.fit_predict(Big_data_new[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_new['planxml'] = cluster2.fit_predict(Big_data_new[['rush_hour', 'max_dur', 'cemdapStopDuration_s']])

Big_data_new = Big_data_new.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

## Dataset numerification and standardization

In [7]:
Big_data_tr = ct.fit_transform(Big_data_new)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)

## Classification task

In [7]:
X_t_clf = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_clf = temp['used_link']

X_te_clf = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_clf = test_data_tr['used_link']

best_model_clf = {}
for model_name in clf.keys():   
    model = clf[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t_clf.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        random_state=101
    )
    opt.fit(X_t_clf, y_t_clf)
    y_pred_clf = opt.predict(X_te_clf)
    best_model_clf[model_name] = [opt, opt.best_score_, y_pred_clf, accuracy_score(y_te_clf, y_pred_clf)]

[I 2024-05-24 11:21:43,435] A new study created in memory with name: no-name-55978207-6719-4547-97f0-a344e7295ee3
[I 2024-05-24 11:21:45,502] Trial 0 finished with value: 0.5590232039720336 and parameters: {'selector__k': 31, 'model__n_neighbors': 46, 'model__weights': 'distance', 'model__algorithm': 'brute'}. Best is trial 0 with value: 0.5590232039720336.
[I 2024-05-24 11:21:46,955] Trial 1 finished with value: 0.6989563278954302 and parameters: {'selector__k': 5, 'model__n_neighbors': 21, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: 0.6989563278954302.
[I 2024-05-24 11:21:49,338] Trial 2 finished with value: 0.5336913567737359 and parameters: {'selector__k': 29, 'model__n_neighbors': 17, 'model__weights': 'uniform', 'model__algorithm': 'auto'}. Best is trial 1 with value: 0.6989563278954302.
[I 2024-05-24 11:21:55,947] Trial 3 finished with value: 0.6874050055730064 and parameters: {'selector__k': 21, 'model__n_neighbors': 49, 'model__weig

RF 0.712736852771304 0.6893630053082891


In [8]:
for i in best_model_clf.keys():
    print(i, best_model_clf[i][1],best_model_clf[i][3])

KNN 0.7117235788833721 0.6845651286239282
LGBM 0.7162833113790658 0.6855859534503879
RF 0.7161819839902726 0.6881380155165374


### Check the best result and put the predict results into test dataset

In [9]:
best_md_from_clf = sorted(best_model_clf.items(), key=lambda t: t[1][1])[-1]
temp_tr = test_data_tr.copy(deep=True)
temp_tr['y_pred_clf'] = best_md_from_clf[1][2]
import pickle
with open('CLF_sparse.pickle', 'wb') as handle:
    pickle.dump(temp_tr, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
import pickle
with open('CLF_sparse.pickle', 'rb') as handle:
    temp_tr = pickle.load(handle)

### Group the used link from training and validation dataset and later group the predicted used link in test dataset

In [9]:
used_link_1 = temp[temp['used_link']==1]
used_link_1_train = used_link_1[used_link_1['remainder__dataset']=='train']
used_link_1_validate = used_link_1[used_link_1['remainder__dataset']=='validate']
temp_2 = pd.concat([used_link_1_train, used_link_1_validate], ignore_index=True)
X_t = temp_2.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t = temp_2['remainder__link_counts']

train_index = list(temp_2[temp_2['remainder__dataset']=='train'].index)
validate_index = list(temp_2[temp_2['remainder__dataset']=='validate'].index)

X_te = temp_tr[temp_tr['y_pred_clf']==1].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
y_te = temp_tr[temp_tr['y_pred_clf']==1]['remainder__link_counts']

X_te_0 = temp_tr[temp_tr['y_pred_clf']==0].drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link', 'y_pred_clf'])
X_te_0['y_pred'] = 0
y_te_0 = temp_tr[temp_tr['y_pred_clf']==0]['remainder__link_counts']
y_te_all = pd.concat([y_te, y_te_0])

In [13]:
best_model_reg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index, validate_index), (train_index, validate_index)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t, y_t)
    y_pred = opt.predict(X_te)
    y_pred_all = np.concatenate([y_pred, np.array(X_te_0['y_pred'])])
    mae = mean_absolute_error(y_te_all, y_pred_all)
    mse = mean_squared_error(y_te_all, y_pred_all)
    me = max_error(y_te_all, y_pred_all)
    best_model_reg[model_name] = (opt, mae, mse, me, y_te_all, y_pred_all)
with open('CLFFSREG_sparse_woGNN.pickle', 'wb') as handle:
    pickle.dump(best_model_reg, handle, protocol=pickle.HIGHEST_PROTOCOL)

[I 2024-05-24 12:12:49,315] A new study created in memory with name: no-name-01025609-869d-438b-96a2-64e2814112d8
[I 2024-05-24 12:12:49,811] Trial 0 finished with value: -6.806323250785443 and parameters: {'selector__k': 19, 'model__n_neighbors': 32, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -6.806323250785443.
[I 2024-05-24 12:12:50,649] Trial 1 finished with value: -6.957094527363185 and parameters: {'selector__k': 29, 'model__n_neighbors': 10, 'model__weights': 'uniform', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -6.806323250785443.
[I 2024-05-24 12:12:51,674] Trial 2 finished with value: -6.877825156271765 and parameters: {'selector__k': 37, 'model__n_neighbors': 4, 'model__weights': 'distance', 'model__algorithm': 'ball_tree'}. Best is trial 0 with value: -6.806323250785443.
[I 2024-05-24 12:12:52,474] Trial 3 finished with value: -5.429016619032498 and parameters: {'selector__k': 11, 'model__n_neighbors': 47, 

In [18]:
X_t_onlyreg = temp.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_t_onlyreg = temp['remainder__link_counts']

X_te_onlyreg = test_data_tr.drop(columns=['remainder__dataset', 'remainder__link_counts', 'used_link'])
y_te_onlyreg = test_data_tr['remainder__link_counts']

train_index_onlyreg = list(train_data_tr.index)
validate_index_onlyreg = list(validate_data_tr.index)

## Feature Selection Regression task

In [17]:
best_model_fsreg = {}
for model_name in model_space.keys():   
    model = model_space[model_name]
    pipeline  = Pipeline([('selector', SelectKBest(f_regression)),
                  ('model', model)])
    param_grid = {}
    param_grid['selector__k']=optuna.distributions.IntDistribution(2, len(X_t_onlyreg.columns))
    for key in param_space[model_name].keys():
        param_grid[f'model__{key}']=param_space[model_name][key]
    
    # BayesSearchCV
    opt = OptunaSearchCV(
        pipeline,
        param_grid,
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_fsreg[model_name] = [opt.best_score_, mae, mse, me, y_te_onlyreg, y_pred]
with open('FSREG_sparse_woGNN.pickle', 'wb') as handle:
    pickle.dump(best_model_fsreg, handle, protocol=pickle.HIGHEST_PROTOCOL)

[I 2024-05-24 20:18:18,483] A new study created in memory with name: no-name-398fca9d-8595-4320-b8b0-7c82c612d37c
[I 2024-05-24 20:18:19,693] Trial 0 finished with value: -4.622880568784409 and parameters: {'selector__k': 32, 'model__n_neighbors': 24, 'model__weights': 'uniform', 'model__algorithm': 'brute'}. Best is trial 0 with value: -4.622880568784409.
[I 2024-05-24 20:18:20,942] Trial 1 finished with value: -3.6215640236816915 and parameters: {'selector__k': 12, 'model__n_neighbors': 36, 'model__weights': 'distance', 'model__algorithm': 'auto'}. Best is trial 1 with value: -3.6215640236816915.
[I 2024-05-24 20:18:22,483] Trial 2 finished with value: -3.771040632282906 and parameters: {'selector__k': 17, 'model__n_neighbors': 5, 'model__weights': 'uniform', 'model__algorithm': 'kd_tree'}. Best is trial 1 with value: -3.6215640236816915.
[I 2024-05-24 20:18:23,660] Trial 3 finished with value: -3.7577081996374733 and parameters: {'selector__k': 21, 'model__n_neighbors': 45, 'model__

## Regression task w/o Feature Selection 

In [18]:
best_model_onlyreg_wofeatureselect = {}
for model_name in model_space.keys():   
    # BayesSearchCV
    opt = OptunaSearchCV(
        model_space[model_name],
        param_space[model_name],
        n_trials=50,
        cv=[(train_index_onlyreg, validate_index_onlyreg), (train_index_onlyreg, validate_index_onlyreg)],
        scoring='neg_mean_absolute_error',
        random_state=101
    )
    opt.fit(X_t_onlyreg, y_t_onlyreg)
    y_pred = opt.predict(X_te_onlyreg)
    mae = mean_absolute_error(y_te_onlyreg, y_pred)
    mse = mean_squared_error(y_te_onlyreg, y_pred)
    me = max_error(y_te_onlyreg, y_pred)
    best_model_onlyreg_wofeatureselect[model_name] = [opt.best_score_, mae, mse, me, y_te_onlyreg, y_pred]
with open('REG_sparse_woGNN.pickle', 'wb') as handle:
    pickle.dump(best_model_onlyreg_wofeatureselect, handle, protocol=pickle.HIGHEST_PROTOCOL)

[I 2024-05-24 21:46:14,103] A new study created in memory with name: no-name-a40375c3-dab9-4dca-893b-6e371f38b22e
[I 2024-05-24 21:46:15,683] Trial 0 finished with value: -4.4856489534834685 and parameters: {'n_neighbors': 43, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 0 with value: -4.4856489534834685.
[I 2024-05-24 21:46:17,985] Trial 1 finished with value: -4.476850434348437 and parameters: {'n_neighbors': 35, 'weights': 'distance', 'algorithm': 'kd_tree'}. Best is trial 1 with value: -4.476850434348437.
[I 2024-05-24 21:46:21,569] Trial 2 finished with value: -4.593566037674177 and parameters: {'n_neighbors': 31, 'weights': 'uniform', 'algorithm': 'ball_tree'}. Best is trial 1 with value: -4.476850434348437.
[I 2024-05-24 21:46:23,154] Trial 3 finished with value: -4.517918059918263 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'kd_tree'}. Best is trial 1 with value: -4.476850434348437.
[I 2024-05-24 21:46:24,308] Trial 4 finished with value

In [12]:
pip install torch_geometric

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## GNN

In [36]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F
class GATNet(torch.nn.Module):
    def __init__(self, num_features, num_classes,
                hid, in_head, out_head, dor):
        super(GATNet, self).__init__()
        self.hid = hid
        self.in_head = in_head
        self.out_head = out_head
        self.dor = dor
        # self.extra_layer = extra_layer
        self.gat1 = GATConv(num_features, self.hid, heads=self.in_head, dropout=self.dor)
        # if self.extra_layer:
        #     self.gat2 = GATConv(self.hid*self.in_head, self.hid, heads=self.in_head, dropout=self.dor)
        #     self.gat3 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)
        # else:
        self.gat2 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dor, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dor, training=self.training)
        # if self.extra_layer:
        #     x = F.elu(self.gat2(x, edge_index))  # Add non-linearity after the second layer
        #     x = F.dropout(x, p=self.dor, training=self.training)
        #     x = self.gat3(x, edge_index) 
        # else:
        x = self.gat2(x, edge_index)
        return x

In [57]:
# all_features = list(temp.columns)
all_features = list(temp_2.columns) #CLF

nodes_features = ['remainder__link_from', 'remainder__link_to']
drop_featrues = ['remainder__dataset', 'remainder__link_counts', 'used_link']
temp_features = list(set(all_features) - set(nodes_features))
other_features = list(set(temp_features) - set(drop_featrues))

In [31]:
import random
def seed_everything(seed: int) -> None:
    r"""Sets the seed for generating random numbers in :pytorch:`PyTorch`,
    :obj:`numpy` and :python:`Python`.

    Args:
        seed (int): The desired seed.
    """
    random.seed(101)
    np.random.seed(101)
    torch.manual_seed(101)
    torch.cuda.manual_seed_all(101)

In [58]:
from torch_geometric.data import Data

best_k = None
best_performance = float('inf')
performance_history = []

def objective(trial):
    # Hyperparameters to tune
    k = trial.suggest_int('k', 2, len(other_features))
    hid = trial.suggest_categorical('hid', [16, 32, 64, 128])
    in_head = trial.suggest_categorical('in_head', [1, 2, 4, 8])
    out_head = trial.suggest_categorical('out_head', [1, 2])
    dor = trial.suggest_categorical('dor', [0, 0.05])
    # extra_layer = trial.suggest_categorical('extra_layer', [False])
    
    # Create a tensor of your labels/targets
    y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    
    # Feature selection for the current k
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(temp_2[other_features], y)
    selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])
    
    edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
    x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, y=y)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_data = data.to(device)
    model = GATNet(k, 1, hid=hid, in_head=in_head, out_head=out_head, dor=dor).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion_MAE = torch.nn.L1Loss()
    def train():
        model.train()
        optimizer.zero_grad()
        out = model(train_data.x, train_data.edge_index)
        loss = criterion_MAE(out, train_data.y)
        loss.backward()
        optimizer.step()
        return loss
    for epoch in range(50):
        loss = train()

    return loss.item()

#     # Store the performance for each k
#     performance_history.append((k, test_loss))

#     # Update the best k if the current performance is better
#     if performance < best_performance:
#         best_performance = performance
#         best_k = k
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2024-05-25 18:31:37,871] A new study created in memory with name: no-name-84f8439f-50d1-4cbe-bfd7-0ebdfd55c33e
[I 2024-05-25 18:31:40,347] Trial 0 finished with value: 6.315135955810547 and parameters: {'k': 30, 'hid': 32, 'in_head': 1, 'out_head': 2, 'dor': 0}. Best is trial 0 with value: 6.315135955810547.
[I 2024-05-25 18:31:42,931] Trial 1 finished with value: 4.9482951164245605 and parameters: {'k': 15, 'hid': 64, 'in_head': 1, 'out_head': 2, 'dor': 0}. Best is trial 1 with value: 4.9482951164245605.
[I 2024-05-25 18:31:46,830] Trial 2 finished with value: 9.753515243530273 and parameters: {'k': 27, 'hid': 64, 'in_head': 1, 'out_head': 2, 'dor': 0.05}. Best is trial 1 with value: 4.9482951164245605.
[I 2024-05-25 18:31:53,451] Trial 3 finished with value: 5.389454364776611 and parameters: {'k': 28, 'hid': 16, 'in_head': 8, 'out_head': 2, 'dor': 0}. Best is trial 1 with value: 4.9482951164245605.
[I 2024-05-25 18:32:16,456] Trial 4 finished with value: 9.241643905639648 and para

In [59]:
best_params = study.best_params
best_k = best_params['k']
best_hid = best_params['hid']
best_in_head = best_params['in_head']
best_out_head = best_params['out_head']
best_dor = best_params['dor']
# best_extra_layer = best_params['extra_layer']

In [60]:
# Feature selection for the current k
selector = SelectKBest(score_func=f_regression, k=best_k)
y = torch.tensor(temp_2['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
X_new = selector.fit_transform(temp_2[other_features], y)
selected_columns = list(temp_2[other_features].columns[selector.get_support(indices=True)])

edge_index = torch.tensor(temp_2[nodes_features].values.T, dtype=torch.long)
x = torch.tensor(temp_2[selected_columns].values, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.to(device)

best_model = GATNet(best_k, 1, hid=best_hid, in_head=best_in_head,
                    out_head=best_out_head, dor=best_dor).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion_MAE = torch.nn.L1Loss()
def train():
    best_model.train()
    optimizer.zero_grad()
    out = best_model(train_data.x, train_data.edge_index)
    loss = criterion_MAE(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss
for epoch in range(250):
    loss = train()




In [61]:
test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
test_x = torch.tensor(X_te[selected_columns].values, dtype=torch.float)#CLF
test_y = torch.tensor(y_te.values, dtype=torch.float).unsqueeze(1)#CLF
# test_x = torch.tensor(test_data_tr[selected_columns].values, dtype=torch.float)
# test_y = torch.tensor(test_data_tr['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)
test_data = test_data.to(device)

best_model.eval()
with torch.no_grad():
    pred = best_model(test_data.x, test_data.edge_index)
    y_pred_all = np.concatenate([np.array(pd.DataFrame(pred).astype("float")[0]), np.array(X_te_0['y_pred'])]) #CLF
    mae = mean_absolute_error(y_te_all, y_pred_all)#CLF
    mse = mean_squared_error(y_te_all, y_pred_all)#CLF
    me = max_error(y_te_all, y_pred_all)#CLF
    # y_pred = np.array(pd.DataFrame(pred).astype("float")[0])
    # mae = mean_absolute_error(y_te_onlyreg, y_pred)
    # mse = mean_squared_error(y_te_onlyreg, y_pred)
    # me = max_error(y_te_onlyreg, y_pred)
# gnn_result = [study.best_value, mae, mse, me, y_te_onlyreg, y_pred]
gnn_result = [study.best_value, mae, mse, me, y_te_all, y_pred_all]#CLF

In [62]:
gnn_result

[4.821235179901123,
 3.0022214982474567,
 27.03843206173479,
 29.200674057006836,
 29531     7.0
 29532     2.0
 29534     6.0
 29539     0.0
 29540    17.0
          ... 
 39314     0.0
 39315     0.0
 39316     0.0
 39318     1.0
 39320     0.0
 Name: remainder__link_counts, Length: 9796, dtype: float64,
 array([3.51270318, 4.37963724, 3.98916078, ..., 0.        , 0.        ,
        0.        ])]

In [63]:
with open('CLFFSREG_sparse_woGNN.pickle', 'rb') as handle:
    best_model_reg = pickle.load(handle)

In [64]:
best_model_reg['GNN']=gnn_result

In [65]:
best_model_reg

{'KNN': (OptunaSearchCV(cv=[([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, ...],
                      [11526, 11527, 11528, 11529, 11530, 11531, 11532, 11533,
                       11534, 11535, 11536, 11537, 11538, 11539, 11540, 11541,
                       11542, 11543, 11544, 11545, 11546, 11547, 11548, 11549,
                       11550, 11551, 11552, 11553, 11554, 11555, ...]),
                     ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                       17, 18, 19, 20, 21...
                 param_distributions={'model__algorithm': CategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute')),
                                      'model__n_neighbors': IntDistribution(high=50, log=False, low=1, step=1),
                                      'model__weights': CategoricalDistribution(choices=('uniform', 'distance')),
                                    

In [23]:
def load_json(file_list):
    index_list = []
    n=test_data_tr.index[0]
    m=test_data_tr.index[0]
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
            })
        n += len(df_links)
        index_list.append(list(range(m, n)))
        m += len(df_links)
    return index_list

test_files = [f'Data/sparseWorlds/Test/po-1/s-{i}.json'for i in range(15, 20)]
sparse_test_index = load_json(test_files)

def split_five_instance(original_result):
    split5test = {}
    for i in original_result.keys():
        test=pd.DataFrame({
            'true_y':original_result[i][4],
            'pred_y':original_result[i][5]
        })
        split5test[i]={'all':{
            'MAE': original_result[i][1],
            'MSE': original_result[i][2],
            'ME': original_result[i][3],
        }}
        for j in range(1, len(sparse_test_index)+1):
            split5test[i][f'instance_{j}']={}
            test_df = test.loc[sparse_test_index[j-1]]
            split5test[i][f'instance_{j}']['MAE'] = mean_absolute_error(test_df['true_y'], test_df['pred_y'])
            split5test[i][f'instance_{j}']['MSE'] = mean_squared_error(test_df['true_y'], test_df['pred_y'])
            split5test[i][f'instance_{j}']['ME'] = max_error(test_df['true_y'], test_df['pred_y'])
    return split5test

In [66]:
CLFFSREG_sparse = split_five_instance(best_model_reg)
with open('CLFFSREG_sparse.pickle', 'wb') as handle:
    pickle.dump(CLFFSREG_sparse, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [67]:
CLFFSREG_sparse

{'KNN': {'all': {'MAE': 3.2885393951504307,
   'MSE': 29.044212343763892,
   'ME': 30.02127659574468},
  'instance_1': {'MAE': 3.9705876088854812,
   'MSE': 34.36243622527009,
   'ME': 21.0},
  'instance_2': {'MAE': 4.153052469236367,
   'MSE': 45.925549361734475,
   'ME': 19.74468085106383},
  'instance_3': {'MAE': 2.8896356864591803,
   'MSE': 21.1753677337907,
   'ME': 16.4468085106383},
  'instance_4': {'MAE': 2.912101412855308,
   'MSE': 26.783119640601736,
   'ME': 30.02127659574468},
  'instance_5': {'MAE': 2.493997507161444,
   'MSE': 16.841896902189816,
   'ME': 17.95744680851064}},
 'LGBM': {'all': {'MAE': 3.284863567051556,
   'MSE': 28.037389721912962,
   'ME': 27.37691128428893},
  'instance_1': {'MAE': 3.781521796075562,
   'MSE': 31.280270192947274,
   'ME': 21.0},
  'instance_2': {'MAE': 4.187024107268574,
   'MSE': 45.57342098855067,
   'ME': 19.0},
  'instance_3': {'MAE': 3.0212591665572557,
   'MSE': 22.124181366444414,
   'ME': 15.0},
  'instance_4': {'MAE': 2.89880

In [68]:
all_sparse={
    'CLF-FS-REG':CLFFSREG_sparse,
    'FS-REG':FSREG_sparse,
    'REG':REG_sparse
          }

In [69]:
df_sparse_result = pd.DataFrame.from_dict({(i, j, k): all_sparse[i][j][k]
                             for i in all_sparse.keys()
                             for j in all_sparse[i].keys()
                             for k in all_sparse[i][j].keys()},
                            orient='index')

In [70]:
with open('all_sparse.pickle', 'wb') as handle:
    pickle.dump(all_sparse, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [71]:
# Reset the index to separate the keys into individual columns
df_sparse_result = df_sparse_result.reset_index().rename(columns={"level_0": "Approaches", "level_1": "Algorithms", "level_2": "Instances"})


# Create an Excel writer object
with pd.ExcelWriter('Model result_sparse_0525.xlsx', engine='xlsxwriter') as writer:
    # Write the DataFrame to the Excel file
    df_sparse_result.to_excel(writer, sheet_name='Sheet1', index=False)
    
    # Get the workbook and worksheet objects
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    
    # Apply formatting to the worksheet
    header_format = workbook.add_format({'bold': True, 'bg_color': '#FFD700'})
    worksheet.set_column('A:E', 15)
    worksheet.set_column('F:G', 10)
    # worksheet.conditional_format('F2:G9', {'type': '3_color_scale'})
    
    # Write the header with the specified format
    for col_num, value in enumerate(df_sparse_result.columns.values):
        worksheet.write(0, col_num, value, header_format)

print("Excel file created successfully.")

Excel file created successfully.
