# STGCN

This folder mirrors the structure of the [main](https://github.com/VeritasYin/STGCN_IJCAI-18) GitHub repository. Many directories not directly relevant to STGCN training have been omitted. The STGCN code is hosted on Google Drive and can be trained [here](https://drive.google.com/file/d/17spvTDAU4Le-15dqlv6hhmARRofuVJO1/view?usp=sharing) to make use of Colab’s GPUs.

# Load Data

In [None]:
# Colab-only helper: attaches the user's Google Drive to the runtime's filesystem.
from google.colab import drive
# Mount point used by the absolute '/content/drive/...' paths later in the notebook.
drive.mount('/content/drive')

In order to run this notebook, you will have to request access to [this](https://drive.google.com/drive/folders/1OrNCjHFZ71M5OiIQtu9G7C61kFl8aVGT?usp=sharing) Google Drive folder, copy it into your own Drive, and change the path in the cell below to point at your copy.

In [None]:
import os

# Root of the copied Capstone "Models" folder on the mounted Drive -- edit this
# line to point at your own copy.  The path is absolute (Drive is mounted at
# /content/drive) so the cell can be re-run after the working directory has
# already changed; a relative 'drive/...' path only resolves from /content.
aman_path_to_capstone_dir = '/content/drive/MyDrive/School/Undergrad/Spring 2022/Capstone/Models/'

# Relative paths used by the following cells are resolved from this folder.
os.chdir(f'{aman_path_to_capstone_dir}STGCN Training/models')

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
!pip3 install prophet
import sys
sys.path.append('/content/drive/MyDrive/School/Undergrad/Spring 2022/Capstone/Models/STGCN Training/utils/')
from utils import load_processed_data

### Format

In [None]:
# Folder holding the pre-processed dataset, relative to the working directory.
processed_dir = '../data/processed/fwy_405_n_ds'

# The project helper returns three objects.  Later cells wrap `adj_mat` in a
# DataFrame and use `speeds` as a DataFrame whose index exposes `.month`.
# NOTE(review): `ind_station_mapper` is presumably an index -> station-ID
# lookup; confirm against utils.load_processed_data.
adj_mat, ind_station_mapper, speeds = load_processed_data(processed_dir)

In [None]:
import yaml

# Read the notebook-wide settings; later cells look up ENV['cv_folds'].
# NOTE(review): yaml.Loader can construct arbitrary Python objects, so this is
# only appropriate for a config file you control.
with open('../models/env.yaml') as env_file:
    ENV = yaml.load(env_file, Loader=yaml.Loader)

In [None]:
# Keep only the rows whose index month is May, June or July.
in_may_to_july = speeds.index.month.isin([5, 6, 7])
speeds = speeds[in_may_to_july]

# Write both tables as bare CSVs: no index column and no header row.
for frame, outfile in (
    (speeds, '../data/processed/fwy_405_n_ds/speeds_form.csv'),
    (pd.DataFrame(adj_mat), '../data/processed/fwy_405_n_ds/adj_mat_form.csv'),
):
    frame.to_csv(outfile, index=False, header=False)

# Build Model

In [None]:
# Switch the working directory to the STGCN folder under utils; the following
# cells invoke main.py relative to the working directory.
%cd ../utils/STGCN

In [None]:
# Run main.py in a subshell with --epoch=25 and --save=1.
# NOTE(review): the meaning of both flags is defined in main.py, which is not shown here.
!python main.py --epoch=25 --save=1

Tune:

In [None]:
from utils import STGCN_grid_search
import os

In [4]:
# Candidate values per hyper-parameter; handed to STGCN_grid_search below.
param_grid = dict(
    k=list(range(2, 13, 2)),        # 2, 4, ..., 12
    epochs=list(range(5, 31, 5)),   # 5, 10, ..., 30
    opt=['RMSProp', 'Adam'],
    ks=[1, 3, 5],
    num_lags=[4, 8, 12],
    lr=[1e-3, 1e-2, 1e-1, 1, 10],
)

In [None]:
# main.py is addressed by absolute path, built from the current working directory.
main_script = f'{os.getcwd()}/main.py'

# Search `param_grid`, passing 'rmse' and the fold setting read from ENV.
STGCN_grid_search('rmse', ENV['cv_folds'], param_grid, main_script)

# Build Best Model

In [None]:
# `pickle` is not imported anywhere else in the visible notebook, so import it
# here; without it pickle.load raises NameError on a fresh kernel.
import pickle

# NOTE(review): unpickling can execute arbitrary code -- only load result files
# produced by your own grid-search run.
with open('./trained/STGCN/grid_search_STGCN.dat', 'rb') as f:
    grid_search_results = pickle.load(f)

In [None]:
!python main.py --epoch=25 --save=1 --params=str(grid_search_results['best_params'])

# Evaluate

In [None]:
# Read the two arrays stored under trained/STGCN/preds.
# NOTE(review): presumably ground truth and model output of the saved run -- confirm.
actual, predicted = (
    np.load(npy_path)
    for npy_path in (
        '../../models/trained/STGCN/preds/actual.npy',
        '../../models/trained/STGCN/preds/predicted.npy',
    )
)

In [None]:
# Helper: find the row offset(s) in `speeds` at which the evaluated window starts.

# Keep the first entry of every innermost element of `actual`.
new = np.array([[x[0] for x in y] for y in actual])

# Up to the first 100 rows of column 5 serve as the fingerprint to look for.
seq = new[0:100, 5]

# The same column of the full speeds table.
full = speeds.iloc[:, 5].values

# Try every possible window start.  The "+ 1" includes the final window, which
# begins at len(full) - len(seq); without it a match ending on the last row of
# `speeds` could never be reported.
# NOTE(review): this is an exact element-wise comparison, so it assumes the
# saved values are bit-identical to those in `speeds` -- confirm.
for i in range(len(full) - len(seq) + 1):
    if (full[i:i + len(seq)] == seq).all():
        print(i)


In [None]:
# Row offset into `speeds` where the evaluated window begins.
# NOTE(review): hard-coded; presumably an offset printed by the search helper
# in the previous cell -- confirm before reusing with other data.
ts_start = 11246

# Timestamps covering the evaluated rows (`new` comes from the previous cell).
window_index = speeds.index[ts_start:ts_start + new.shape[0]]

# One row per evaluated step, one column per column of `speeds`.
actual_df = pd.DataFrame(
    [[x[0] for x in y] for y in actual],
    columns=speeds.columns,
    index=window_index,
)
predicted_df = pd.DataFrame(
    [[x[0] for x in y] for y in predicted],
    columns=speeds.columns,
    index=window_index,
)

print(predicted_df.shape)
predicted_df.head()

In [None]:
import plotly.graph_objects as go

In [None]:
# Station whose series is plotted, and the row at which the predicted series is
# split into the two differently-labelled segments (first 75% / last 25%).
station = 771826
cutoff = int(0.75 * actual_df.index.shape[0])

# Column labels may be ints or numeric strings depending on how the data was
# loaded; resolve the label by numeric value so either form works.
station_col = next((col for col in actual_df.columns if int(col) == station), None)
if station_col is None:
    raise KeyError(f'station {station} is not a column of actual_df')

actual_series = actual_df[station_col]
predicted_series = predicted_df[station_col]

fig = go.Figure()
# go.Scatter(mode='lines') replaces the deprecated go.Line.
fig.add_trace(go.Scatter(x=actual_df.index, y=actual_series, mode='lines', name='True Values'))
# Positional slices keep x and y the same length; a label-based .loc slice
# includes its end label and would give the first segment one extra y value.
fig.add_trace(go.Scatter(x=predicted_df.index[:cutoff], y=predicted_series.iloc[:cutoff], mode='lines', name='Predicted Values (Train)'))
fig.add_trace(go.Scatter(x=predicted_df.index[cutoff:], y=predicted_series.iloc[cutoff:], mode='lines', name='Predicted Values (Test)'))
fig.update_layout(
    title="STGCN Forecast Results",
    xaxis_title="Time",
    yaxis_title="Forecast")

In [None]:
# fig.write_html('../plots/STGCN.html')

In [None]:
from utils import STGCN_cv

In [None]:
# Absolute path of main.py, built from the current working directory.
main_script = f'{os.getcwd()}/main.py'

# Run the project's cross-validation helper with the fold setting from ENV.
STGCN_cv(main_script, ENV['cv_folds'])

In [None]:
# `px` is used below but not imported anywhere else in the visible notebook.
import plotly.express as px

## station map plotting
meta = pd.read_csv('https://jda-cloud.s3.us-east-2.amazonaws.com/rdp_ds/meta.csv')

# Map each numeric station ID to the real column label, so the lookups below
# work whether the columns are ints or numeric strings.
col_by_id = {int(col): col for col in predicted_df.columns}
stations = list(col_by_id)

# .copy() gives an independent frame, so adding columns below does not operate
# on a slice of `meta` (avoids SettingWithCopyWarning).
locations = meta.loc[meta['ID'].isin(stations), ['ID', 'Longitude', 'Latitude']].copy()

# RMSE is computed as sqrt(MSE): the `squared=False` keyword is deprecated and
# rejected by newer scikit-learn releases.
locations['RMSE'] = [
    np.sqrt(mean_squared_error(actual_df[col_by_id[x]], predicted_df[col_by_id[x]]))
    for x in locations['ID']
]
locations['MAE'] = [
    mean_absolute_error(actual_df[col_by_id[x]], predicted_df[col_by_id[x]])
    for x in locations['ID']
]

fig = px.scatter(
    locations,
    x="Longitude", y="Latitude",
    height=600, width=800,
    color='RMSE',
    title='RMSE By Station Location'
)

fig

In [None]:
# `px` is not imported anywhere else in the visible notebook; importing here
# keeps this cell runnable on its own.
import plotly.express as px

# Distribution of the per-station RMSE values stored in `locations`.
fig = px.histogram(
    locations,
    x="RMSE",
    height=400, width=600,
    title='RMSE Distribution'
)

# A bare last expression is what makes the notebook render the figure.
fig