In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, make_scorer, max_error, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, ShuffleSplit, cross_validate, train_test_split
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic, ConstantKernel, Matern
from sklearn.feature_selection import RFE, SelectFromModel, RFECV, SelectKBest, chi2, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from mango import Tuner, scheduler
import xgboost as xgb
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.cluster import OPTICS, MiniBatchKMeans
from pyGRNN import GRNN
from skopt.space import Categorical, Space, Dimension, Integer
from sklearn.inspection import permutation_importance
from optuna.integration import OptunaSearchCV
import optuna
import matplotlib.pyplot as plt
from loading import load_data

  from pandas.core import (


In [15]:
numerical_features = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'link_length', 'link_freespeed', 
                      'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 
                      'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes',
                      'income', 'score', 'income_avg', 'score_avg'
                     ]
category_feature = ['type', 'home-activity-zone']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")

In [3]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
train_data['dataset'] = 'train'
validate_data['dataset'] = 'validate'
test_data['dataset'] = 'test'
Big_data = pd.concat([train_data, validate_data, test_data], ignore_index=True)

In [4]:
# Find the indices where 'link_id' is 0
indices = Big_data.index[Big_data['link_id'] == 0].tolist()

# Add the end of the DataFrame to the indices list
indices.append(len(Big_data))

# Split the DataFrame using the indices
dfs = [Big_data.iloc[indices[n]:indices[n+1]] for n in range(len(indices)-1)]

In [5]:
list_od = []
list_nodes = []
all_files = train_files + validate_files + test_files
for i in all_files:
    with open(i) as f:
        d = json.load(f)
        list_od.append(d['o_d_pairs'])
        list_nodes.append(d['nodes_id'])
tuples_links = [ list(zip(dfs[i]['link_from'], dfs[i]['link_to'], dfs[i]['link_length'])) for i in range(20)]
list_od_tuples = [[(origin, destination) for origin, destination in list_od[i]]for i in range(20)]

In [6]:
import networkx as nx

# Assume `nodes` is a list of all node IDs and `edges` is a list of tuples (start_node, end_node, weight)
# For example:
# nodes = [1, 2, 3, 4, ...]
# edges = [(1, 2, 1.0), (2, 3, 2.5), (1, 3, 1.5), ...]
# And `o_d_pairs` is a list of tuples representing the O-D pairs:
# o_d_pairs = [(origin_1, destination_1), (origin_2, destination_2), ...]

shortest_paths_list = []
for i in range(20):
    G = nx.Graph()
    G.add_nodes_from(list_nodes[i])
    G.add_weighted_edges_from(tuples_links[i])
    shortest_paths = {}
    for origin, destination in list_od_tuples[i]:
        # This will find the shortest path by weight
        try:
            shortest_path = nx.shortest_path(G, source=origin, target=destination, weight='weight')
        except:
            shortest_path = []
        shortest_paths[(origin, destination)] = shortest_path
    shortest_paths_list.append(shortest_paths)

In [7]:
from collections import defaultdict
for i in range(20):
    link_usage_counts = defaultdict(int)

    # Iterate over each path and each link in the path
    for path in shortest_paths_list[i].values():
        for start_node, end_node in zip(path, path[1:]):
            # Order the nodes to avoid counting (node1, node2) and (node2, node1) separately
            ordered_link = tuple(sorted((start_node, end_node)))
            link_usage_counts[ordered_link] += 1

    # Now you have a dictionary with the count of usage for each link

    # Assume you have a DataFrame 'links_df' with columns ['node_start', 'node_end']
    # links_df = ...

    # Add a 'used_count' column to your links data
    dfs[i]['used_count'] = dfs[i].apply(
        lambda row: link_usage_counts[tuple(sorted((row['link_from'], row['link_to'])))],
        axis=1
    )
Big_data = pd.concat(dfs)

In [8]:
nodes_data = Big_data[['link_id', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y']]
grouped = nodes_data.groupby(['start_node_x', 'start_node_y'])
filtered_df = grouped.filter(lambda x: len(x) == 1)
filtered_df = filtered_df.drop_duplicates()
node_mapping = filtered_df.set_index(['start_node_x', 'start_node_y']).apply(
    lambda row: (row['end_node_x'], row['end_node_y']), axis=1).to_dict()

all_nodes = set(node_mapping.keys()) | set(node_mapping.values())
end_nodes = set(node_mapping.values())

start_nodes = list(all_nodes - end_nodes)

paths = []
for start_node in start_nodes:
    path = [start_node]
    while path[-1] in node_mapping:
        next_node = node_mapping[path[-1]]
        path.append(next_node)
    paths.append(path)
    
new_paths = [x for x in paths if len(x) >2]
def map_path_to_links(df, path):
    path_links = pd.DataFrame()
    for i in range(len(path) - 1):
        start_node = path[i]
        end_node = path[i+1]
        link_row = df[(df['start_node_x'] == start_node[0]) & 
                      (df['start_node_y'] == start_node[1]) & 
                      (df['end_node_x'] == end_node[0]) & 
                      (df['end_node_y'] == end_node[1])]
        if not link_row.empty:
            path_links = pd.concat([path_links, link_row])
    return path_links

# Step 3: Create separate DataFrames for each path
path_dfs = []
for path in new_paths:
    link_df = map_path_to_links(Big_data, path)
    path_dfs.append(link_df)

Big_data_drop = Big_data.copy(deep=True)
for path_df in path_dfs:
    numeric_df = path_df.select_dtypes(include=[ 'float64', 'int64'])
    column_means = numeric_df.mean()
    mean_df = pd.DataFrame([column_means])
    zone = path_df['home-activity-zone'].mode()
    type_value = path_df['type'].mode()
    dataset = path_df['dataset'].mode()
    mean_df['home-activity-zone'] = zone
    mean_df['type'] = type_value
    mean_df['dataset'] = dataset
    Big_data_drop = pd.concat([Big_data_drop, mean_df])

    try:
        Big_data_drop.drop(path_df.index, inplace=True)
    except:
        pass

In [9]:
cluster = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['x_y_coor'] = cluster.fit_predict(Big_data_drop[['start_node_x', 'start_node_y',
                                                           'end_node_x', 'end_node_y']])
cluster1 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['similar_link'] = cluster1.fit_predict(Big_data_drop[['link_length', 'link_freespeed',
                                                           'link_capacity', 'link_permlanes']])
cluster2 = MiniBatchKMeans(n_clusters=500, random_state=101)
Big_data_drop['planxml'] = cluster2.fit_predict(Big_data_drop[['income', 'score', 'rush_hour',
                                                               'max_dur', 'cemdapStopDuration_s']])

Big_data_drop = Big_data_drop.astype({'x_y_coor':'int64','similar_link':'int64', 'planxml':'int64'})

In [10]:
Big_data_tr = ct.fit_transform(Big_data_drop)
Big_data_tr['used_link'] = 1
Big_data_tr['used_link'][Big_data_tr['remainder__link_counts']==0] = 0
Big_data_tr = Big_data_tr.reset_index(drop=True)
train_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='train']
validate_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='validate']
test_data_tr = Big_data_tr[Big_data_tr['remainder__dataset']=='test']

train_index = list(train_data_tr.index)
validate_index = list(validate_data_tr.index)

temp = pd.concat([train_data_tr, validate_data_tr], ignore_index=True)
temp = temp.astype({"remainder__link_from": int, "remainder__link_to": int})

In [12]:
all_features = list(temp.columns)
nodes_features = ['remainder__link_from', 'remainder__link_to']
drop_featrues = ['remainder__dataset', 'remainder__link_counts', 'used_link']
temp_features = list(set(all_features) - set(nodes_features))
other_features = list(set(temp_features) - set(drop_featrues))

In [17]:
import torch
from torch_geometric.nn import GATConv
import torch.nn.functional as F
class GATNet(torch.nn.Module):
    def __init__(self, num_features, num_classes,
                hid, in_head, out_head, dor, extra_layer):
        super(GATNet, self).__init__()
        self.hid = hid
        self.in_head = in_head
        self.out_head = out_head
        self.dor = dor
        self.extra_layer = extra_layer
        self.gat1 = GATConv(num_features, self.hid, heads=self.in_head, dropout=self.dor)
        if self.extra_layer:
            self.gat2 = GATConv(self.hid*self.in_head, self.hid, heads=self.in_head, dropout=self.dor)
            self.gat3 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)
        else:
            self.gat2 = GATConv(self.hid*self.in_head, num_classes, concat=False, heads=self.out_head, dropout=self.dor)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=self.dor, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dor, training=self.training)
        if self.extra_layer:
            x = F.elu(self.gat2(x, edge_index))  # Add non-linearity after the second layer
            x = F.dropout(x, p=self.dor, training=self.training)
            x = self.gat3(x, edge_index) 
        else:
            x = self.gat2(x, edge_index)
        return x

In [18]:
import optuna
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

best_k = None
best_performance = float('inf')
performance_history = []

def objective(trial):
    # Hyperparameters to tune
    k = trial.suggest_int('k', 2, len(other_features))
    hid = trial.suggest_categorical('hid', [16, 32, 64, 128, 256, 512])
    in_head = trial.suggest_categorical('in_head', [1, 2, 4, 8, 16, 32])
    out_head = trial.suggest_categorical('out_head', [1, 2])
    dor = trial.suggest_categorical('dor', [0, 0.05, 0.1])
    extra_layer = trial.suggest_categorical('extra_layer', [True, False])
    
    # Create a tensor of your labels/targets
    y = torch.tensor(temp['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    
    # Feature selection for the current k
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(temp[other_features], y)
    selected_columns = list(temp[other_features].columns[selector.get_support(indices=True)])
    
    edge_index = torch.tensor(temp[nodes_features].values.T, dtype=torch.long)
    x = torch.tensor(temp[selected_columns].values, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, y=y)

    test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
    test_x = torch.tensor(test_data_tr[selected_columns].values, dtype=torch.float)
    test_y = torch.tensor(test_data_tr['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
    test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_data = data.to(device)
    test_data = test_data.to(device)
    model = GATNet(k, 1, hid=hid, in_head=in_head, out_head=out_head, dor=dor, extra_layer=extra_layer).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion_MAE = torch.nn.L1Loss()
    def train():
        model.train()
        optimizer.zero_grad()
        out = model(train_data.x, train_data.edge_index)
        loss = criterion_MAE(out, train_data.y)
        loss.backward()
        optimizer.step()
        return loss
    for epoch in range(20):
        loss = train()

    return loss.item()

#     # Store the performance for each k
#     performance_history.append((k, test_loss))

#     # Update the best k if the current performance is better
#     if performance < best_performance:
#         best_performance = performance
#         best_k = k
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-03-13 23:09:44,471] A new study created in memory with name: no-name-7a495de2-8f58-456b-bad4-54a5885d76af
[I 2024-03-13 23:10:07,880] Trial 0 finished with value: 24.233083724975586 and parameters: {'k': 39, 'hid': 64, 'in_head': 2, 'out_head': 2, 'dor': 0.1, 'extra_layer': True}. Best is trial 0 with value: 24.233083724975586.
[I 2024-03-13 23:10:11,864] Trial 1 finished with value: 11.616863250732422 and parameters: {'k': 13, 'hid': 16, 'in_head': 2, 'out_head': 2, 'dor': 0.1, 'extra_layer': False}. Best is trial 1 with value: 11.616863250732422.
[I 2024-03-13 23:19:47,091] Trial 2 finished with value: 16.875606536865234 and parameters: {'k': 29, 'hid': 256, 'in_head': 8, 'out_head': 2, 'dor': 0, 'extra_layer': True}. Best is trial 1 with value: 11.616863250732422.
[I 2024-03-13 23:20:13,800] Trial 3 finished with value: 10.104442596435547 and parameters: {'k': 5, 'hid': 128, 'in_head': 4, 'out_head': 2, 'dor': 0.05, 'extra_layer': True}. Best is trial 3 with value: 10.104442

KeyboardInterrupt: 

In [29]:
best_params = study.best_params
best_k = best_params['k']
best_hid = best_params['hid']
best_in_head = best_params['in_head']
best_out_head = best_params['out_head']
best_dor = best_params['dor']

In [30]:
# Feature selection for the current k
selector = SelectKBest(score_func=f_regression, k=best_k)
y = torch.tensor(temp['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
X_new = selector.fit_transform(temp[other_features], y)
selected_columns = list(temp[other_features].columns[selector.get_support(indices=True)])

edge_index = torch.tensor(temp[nodes_features].values.T, dtype=torch.long)
x = torch.tensor(temp[selected_columns].values, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=y)

test_edge_index = torch.tensor(test_data_tr[nodes_features].values.T, dtype=torch.long)
test_x = torch.tensor(test_data_tr[selected_columns].values, dtype=torch.float)
test_y = torch.tensor(test_data_tr['remainder__link_counts'].values, dtype=torch.float).unsqueeze(1)
test_data = Data(x=test_x, edge_index=test_edge_index, y=test_y)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.to(device)
test_data = test_data.to(device)

best_model = GATNet(best_k, 1, hid=best_hid, in_head=best_in_head,
                    out_head=best_out_head, dor=best_dor).to(device)
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion_MAE = torch.nn.L1Loss()
def train():
    best_model.train()
    optimizer.zero_grad()
    out = best_model(train_data.x, train_data.edge_index)
    loss = criterion_MAE(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss
for epoch in range(250):
    loss = train()

In [31]:
criterion_MSE = torch.nn.MSELoss()
def test(test_data):
    best_model.eval()
    with torch.no_grad():
        pred = best_model(test_data.x, test_data.edge_index)
        loss_MAE = criterion_MAE(pred, test_data.y)
        loss_MSE = criterion_MSE(pred, test_data.y)
    return loss_MAE.item(), loss_MSE.item()

test_loss = test(test_data)
test_loss

(11.932363510131836, 429.4022216796875)

## Neo4j

In [71]:
from neo4j import GraphDatabase, basic_auth

In [72]:
driver = GraphDatabase.driver(
  "bolt://100.24.59.192:7687",
  auth=basic_auth("neo4j", "radiuses-vision-seasons"))

In [73]:
def import_data(driver, df):
    # Convert DataFrame to a list of dictionaries
    rows = df.to_dict('records')
    
    with driver.session() as session:
        # Use UNWIND to process each row as a separate Cypher operation
        session.run("""
            UNWIND $rows AS row
            MERGE (start:Location {x: row.num_preprocess__start_node_x, y: row.num_preprocess__start_node_y})
            MERGE (end:Location {x: row.num_preprocess__end_node_x, y: row.num_preprocess__end_node_y})
            MERGE (start)-[r:LINK]->(end)
            SET r += {
                length: row.num_preprocess__link_length,
                freespeed: row.num_preprocess__link_freespeed,
                capacity: row.num_preprocess__link_capacity,
                permlanes: row.num_preprocess__link_permlanes,
                start_count: row.num_preprocess__start_count,
                end_count: row.num_preprocess__end_count,
                link_id: row.remainder__link_id,
                link_counts: row.remainder__link_counts,
                dataset: row.remainder__dataset,
                used_count: row.remainder__used_count,
                x_y_coor: row.remainder__x_y_coor,
                similar_link: row.remainder__similar_link,
                planxml: row.remainder__planxml
            }
        """, parameters={'rows': rows})
df2 = temp.head(10)
import_data(driver, df2)



In [74]:
def create_gds_graph(driver):
    with driver.session() as session:
        session.run("""
            CALL gds.graph.project(
                'roadGraph',
                'Location',
                'LINK',
                {
                    nodeProperties: ['x', 'y'],
                    relationshipProperties: [
                        'length', 'freespeed', 'capacity', 'permlanes',
                        'start_count', 'end_count', 'link_id', 'dataset',
                        'used_count', 'x_y_coor', 'similar_link', 'planxml',
                        'link_counts'
                    ]
                }
            )
        """)

create_gds_graph(driver)


In [76]:
pipe, _ = driver.beta.pipeline.nodeClassification.create("link-pipe")
pipe.configureSplit(testFraction=0.2, validationFolds=5)

AttributeError: 'BoltDriver' object has no attribute 'beta'

In [62]:
def train_gnn_model(driver):
    with driver.session() as session:
        session.run("""
                CALL gds.beta.graphSage.train(
                  'roadGraph', 
                  {
                    modelName: 'graphSageModel',
                    featureProperties: [
                      'length', 'freespeed', 'capacity', 'permlanes'
                    ],
                    targetProperty: 'link_counts', 
                    hiddenLayerSizes: [256, 128],
                    sampleSizes: [25, 10],
                    batchSize: 32,
                    epochs: 10,
                    learningRate: 0.01
                  }
                )
                YIELD modelInfo
        """)

train_gnn_model(driver)


In [70]:
def predict_link_counts(driver):
    with driver.session() as session:
        result = session.run("""
                CALL gds.beta.graphSage.write(
                  'roadGraph',
                  {
                    modelName: 'graphSageModel',
                    writeProperty: 'predictedLinkCounts'
                  }
                )
                YIELD nodePropertiesWritten
        """)
        for row in result:
            print(row['nodeId'], row['predictedClass'])

predict_link_counts(driver)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.beta.graphSage.write`: Caused by: java.util.NoSuchElementException: Model with name `graphSageModel` does not exist.}