# Import Modules

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
import pickle as pkl

In [2]:
# Build one global node -> integer id mapping that covers every daily contact graph.
# The graphs are stored one GEXF file per day for 2020-07-01 .. 2020-08-31 (62 days);
# generating the dates with pandas avoids relying on both months having 31 days.
all_nodes = set()
for date_str in tqdm(pd.date_range('2020-07-01', periods=62, freq='D').strftime('%Y-%m-%d')):
    file_name = f'{date_str}.gexf'
    file_path = f'contact_graphs/{file_name}'

    G = nx.read_gexf(file_path)
    # In-place update: avoids rebuilding the accumulated set on every iteration.
    all_nodes.update(G.nodes())

# Enumerate the nodes in sorted order. Iterating the raw set would assign ids in
# hash order, which changes between kernel sessions for string node ids, so a
# rerun would silently produce a mapping that disagrees with files written earlier.
node_mapping = {node: i for i, node in enumerate(sorted(all_nodes))}
# pickle the node mapping
with open('node_mapping.pkl', 'wb') as f:
    pkl.dump(node_mapping, f)

  0%|          | 0/62 [00:00<?, ?it/s]

100%|██████████| 62/62 [00:08<00:00,  7.63it/s]


In [4]:
# Flatten the daily contact graphs into a single temporal edge list and write it
# as a csv file with the columns: source,destination,timestamp,state_label,w
# (state_label and w are written as constant 0; timestamp is the 0-based day index).
dfs = []
for i in tqdm(range(0, 62)):
    # Day index -> calendar date: 31 days of month 7 followed by 31 days of month 8.
    month_index, day_index = divmod(i, 31)
    month, day = month_index + 7, day_index + 1
    file_name = f'2020-{month:02d}-{day:02d}.gexf'
    file_path = 'contact_graphs/' + file_name

    G = nx.read_gexf(file_path)

    # One record per edge of this day's graph, with endpoints translated to integer ids.
    day_rows = []
    for src_node, dst_node in G.edges():
        day_rows.append({
            'source': node_mapping[src_node],
            'destination': node_mapping[dst_node],
            'timestamp': i,
            'state_label': 0,
            'w': 0,
        })
    dfs.append(pd.DataFrame(day_rows))

df = pd.concat(dfs)

csv_file_path = 'tgn_data/data.csv'
df.to_csv(csv_file_path, index=False)



100%|██████████| 62/62 [00:07<00:00,  8.04it/s]


In [12]:
# Build the node feature matrix: one row of two min-max scaled travel features per
# contact-graph node, ordered by integer node id, preceded by one all-zero row.

# import the mappings as well as the node features
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open('node_mapping.pkl', 'rb') as f:
    node_mapping = pkl.load(f)
with open('feature_selection_method1/node_travel_data.pkl', 'rb') as f:
    node_features = pkl.load(f)

# Collect every node that appears in any of the 62 daily graphs (2020-07-01 .. 2020-08-31).
contact_graph_nodes = set()
for date_str in tqdm(pd.date_range('2020-07-01', periods=62, freq='D').strftime('%Y-%m-%d')):
    file_name = f'{date_str}.gexf'
    file_path = f'contact_graphs/{file_name}'

    G = nx.read_gexf(file_path)
    contact_graph_nodes.update(G.nodes())

# Fail loudly if the pickled mapping does not cover the graphs. The previous check
# asserted a non-empty string, which is always true and therefore never fired.
unmapped_nodes = contact_graph_nodes.difference(node_mapping)
if unmapped_nodes:
    raise KeyError(
        f"ERROR: should not happen - {len(unmapped_nodes)} contact graph node(s) "
        f"missing from node_mapping, e.g. {sorted(unmapped_nodes)[:5]}"
    )

# (integer id, original node id) pairs in ascending integer-id order.
mapped_nodes = sorted(
    ((node_mapping[node_id], node_id) for node_id in contact_graph_nodes),
    key=lambda x: x[0],
)

feature1 = np.array([node_features[node_id]["avg_locations_per_day"] for _, node_id in mapped_nodes])
feature2 = np.array([node_features[node_id]["avg_distance_per_day"] for _, node_id in mapped_nodes])


def _scale_to_unit_range(values):
    """Min-max scale a 1-D array to [-1, 1].

    A constant array has zero range; it is mapped to all zeros instead of
    dividing by zero and filling the saved feature matrix with NaN.
    """
    low, high = values.min(), values.max()
    if high == low:
        return np.zeros(len(values), dtype=float)
    return (2 * (values - low) / (high - low)) - 1


# normalize feature1 and feature2
feature1_norm = _scale_to_unit_range(feature1)
feature2_norm = _scale_to_unit_range(feature2)

# Row 0 is an all-zero row; the per-node rows follow in ascending integer-id order.
# NOTE(review): presumably row 0 is a padding entry so that integer id k lands on
# row k + 1 (this also requires the ids to be contiguous from 0) - confirm against
# the code that consumes this file.
final_features = np.vstack([
    np.zeros((1, 2)),
    np.column_stack([feature1_norm, feature2_norm]),
])
# save the final features to an npy file
np.save('tgn_data/ml_data_node_official.npy', final_features)


100%|██████████| 62/62 [00:07<00:00,  8.06it/s]


In [10]:
# Report how many distinct nodes were collected in contact_graph_nodes.
num_contact_graph_nodes = len(contact_graph_nodes)
print(num_contact_graph_nodes)

45391


In [11]:
# Sanity-check a saved node feature file: print its shape and its first two rows.
# NOTE(review): this reads 'ml_data_node.npy', not the 'ml_data_node_official.npy'
# name used elsewhere in this notebook - confirm this is the intended file.
npy_data = np.load('tgn_data/ml_data_node.npy')
print(npy_data.shape)
for row_index in (0, 1):
    print(npy_data[row_index])

(45392, 2)
[0. 0.]
[-0.9933222  -0.98407579]
