# Data to Temporal Relational Graph

Preprocessing code to put the traffic data intothe proposed temporal relational graph data structure.

In [108]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.animation import FuncAnimation, PillowWriter

import networkx as nx
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import torch

from models import RelationalSTAE
from datautils import make_graph_sequence, plot_traffic_graph, get_day_data, normalize_data, generate_relational_edges, generate_edges

from tqdm import tqdm
import optuna

In [92]:
def get_full_data():
    valid_days = [10, 11, 15, 16, 25] # chosen because they contain accidents of interest we want to detect (note index 10 is Oct 11 and so on)
    data = pd.read_csv('../data/data_with_label.csv')
    melted = pd.melt(data, id_vars=['day', 'milemarker', 'human_label', 'crash_record', 'unix_time'], value_vars=['lane1_speed', 'lane2_speed', 'lane3_speed', 'lane4_speed'], value_name='speed').sort_values(['unix_time', 'milemarker']).drop('variable', axis=1)
    melted2 = pd.melt(data, id_vars=['day', 'milemarker', 'human_label', 'crash_record', 'unix_time'], value_vars=['lane1_occ', 'lane2_occ', 'lane3_occ', 'lane4_occ'], value_name='occ').sort_values(['unix_time', 'milemarker']).drop('variable', axis=1)
    melted3 = pd.melt(data, id_vars=['day', 'milemarker', 'human_label', 'crash_record', 'unix_time'], value_vars=['lane1_volume', 'lane2_volume', 'lane3_volume', 'lane4_volume'], value_name='volume').sort_values(['unix_time', 'milemarker']).drop('variable', axis=1)
    melted['occ'] = melted2['occ']
    melted['volume'] = melted3['volume']

    melted = melted[melted['day'] != 17]

    train_data = melted[(melted['day'] != valid_days[0]) & (melted['day'] != valid_days[1]) & (melted['day'] != valid_days[2]) & (melted['day'] != valid_days[3]) & (melted['day'] != valid_days[4])]
    test_data = melted[(melted['day'] == valid_days[0]) | (melted['day'] == valid_days[1]) | (melted['day'] == valid_days[2]) | (melted['day'] == valid_days[3]) | (melted['day'] == valid_days[4])]
    return train_data, test_data

In [97]:
def label_anomalies(data):
    human_label_times = np.unique(data[data['human_label']==1]['unix_time'])
    for human_label_time in human_label_times:
        data.loc[(data['unix_time'] - human_label_time <= 7200) & (data['unix_time'] - human_label_time >= 0), 'anomaly'] = 1

    crash_label_times = np.unique(data[data['crash_record']==1]['unix_time'])
    for crash_label_time in crash_label_times:
        data.loc[(data['unix_time'] - crash_label_time <= 7200) & (data['unix_time'] - crash_label_time >= -1800), 'anomaly'] = 1

    data.fillna(0, inplace=True)

    return data

In [93]:
train_data, test_data = get_full_data()
train_data = normalize_data(train_data)

In [98]:
train_data = label_anomalies(train_data)

In [142]:
def sequence_rstae(data, timesteps):
    sequence = []
    relational_edges, relations = generate_relational_edges(milemarkers=list(range(49)), timesteps=timesteps)
    static_edges = generate_edges(milemarkers=list(range(49)))
    days = data['day']
    anomalies = data['anomaly']
    data_vals = data[['occ', 'speed', 'volume']]
    unix = data['unix_time']
    unique_times = np.unique(data['unix_time'])

    for t in tqdm(unique_times[timesteps:]): # skip first 'timesteps'
        data_t = []
        backward_index = range(timesteps-1, -1, -1)
        backward_times = [unique_times[i] for i in backward_index]
        curr_day = np.unique(data[data['unix_time']==backward_times[-1]]['day'])[0]
        contains_anomaly = np.any([np.unique(data[data['unix_time']==i]['anomaly'])[0] for i in backward_times])
        is_curr_day = np.any([np.unique(data[data['unix_time']==i]['day'])[0]==curr_day for i in backward_times])
        # is_curr_day = np.all([days[t-i]==curr_day for i in backward_index])

        if contains_anomaly or not is_curr_day:
            continue

        for i in backward_times:
            data_t.append(data[data['unix_time']==i][['occ', 'speed', 'volume']].to_numpy()) # assumes time indices come sequentially, with full data it may not

        node_data = np.concatenate(data_t)
        pyg_data = Data(x=torch.tensor(node_data, dtype=torch.float32), edge_index=relational_edges, edge_attr=torch.tensor(relations, dtype=torch.long))
        
        curr_data = data_t[-1]
        curr_graph = Data(x=torch.tensor(curr_data, dtype=torch.float32), edge_index=static_edges)
        sequence.append([pyg_data, curr_graph])

    return sequence

In [143]:
sequence = sequence_rstae(full_normalized, 5)

  0%|          | 0/13435 [00:00<?, ?it/s]

 25%|██▌       | 3411/13435 [01:31<04:29, 37.17it/s]


KeyboardInterrupt: 

In [113]:
len(sequence)

0

In [6]:
sequence[0]

[Data(x=[1960, 3], edge_index=[2, 38336], edge_attr=[38336]),
 Data(x=[196, 3], edge_index=[2, 1832])]

In [4]:
data_t = []
for i in range(10):
    data_t.append(day1_normalized[day1_normalized['time_index']==i])

In [5]:
data_t[9]

Unnamed: 0,time_index,milemarker,speed,occ,volume
9,9.0,53.3,1.024298,0.01,0.04
9,9.0,53.3,0.910628,0.00,0.00
9,9.0,53.3,0.892749,0.02,0.16
9,9.0,53.3,0.875764,0.01,0.04
2888,9.0,53.6,0.998993,0.01,0.08
...,...,...,...,...,...
135322,9.0,69.8,0.825243,0.00,0.00
138201,9.0,70.1,1.054311,0.00,0.00
138201,9.0,70.1,0.947011,0.01,0.04
138201,9.0,70.1,0.861294,0.01,0.00


In [13]:
edge_connections, relations = generate_relational_edges(list(range(49)), 10)
print(len(relations))

38336


In [15]:
max(edge_connections[0])

tensor(1959)

In [16]:
combined = pd.concat(data_t)
node_data = combined[['speed', 'occ', 'volume']].to_numpy()
node_data.shape

(1960, 3)

In [17]:
# data = Data(x=node_data, edge_index=edge_connections, edge_attr=np.eye(6)[np.array(relations)])
data = Data(x=torch.tensor(node_data, dtype=torch.float32), edge_index=torch.tensor(edge_connections), edge_attr=torch.tensor(relations, dtype=torch.long))


  data = Data(x=torch.tensor(node_data, dtype=torch.float32), edge_index=torch.tensor(edge_connections), edge_attr=torch.tensor(relations, dtype=torch.long))


In [18]:
data

Data(x=[1960, 3], edge_index=[2, 38336], edge_attr=[38336])

In [19]:
max(edge_connections[0])

tensor(1959)

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
import torch_geometric as pyg

class RelationalGraphEncoder(nn.Module):
    def __init__(self, num_features, hidden_dim, latent_dim, dropout_percentage=0.1):
        super().__init__()
        self.conv1 = RGCNConv(num_features, hidden_dim, num_relations=6)
        self.conv2 = RGCNConv(hidden_dim, hidden_dim, num_relations=6)
        self.fc = nn.Linear(hidden_dim, latent_dim)

        self.dropout_percentage = dropout_percentage
    
    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)

        x = F.dropout(x, p=self.dropout_percentage, training=self.training)
        x = pyg.nn.global_mean_pool(x, data.batch)
        z = self.fc(x)
        return z

In [24]:
from datautils import generate_edges
reconstructed_index = generate_edges(list(range(49)))

In [26]:
enc = RelationalGraphEncoder(3, 64, 128, 0)
z = enc(data)

torch.Size([1960, 3])
<class 'torch.Tensor'>
torch.Size([38336])


In [9]:
from models import GraphDecoder

dec = GraphDecoder(num_features=3, hidden_dim=64, latent_dim=128)
dec(z, reconstructed_index)

NameError: name 'z' is not defined

In [6]:
rstae = RelationalSTAE(num_features=3, hidden_dim=64, latent_dim=128)
rstae(sequence[0])

torch.Size([196, 3])