In [1]:
import pandas as pd
from networkx.classes import number_of_nodes

import codes.data_utils
import importlib

# Reload the helper module so edits made on disk are picked up without
# restarting the kernel, then rebind the freshly loaded `set_seed`.
importlib.reload(codes.data_utils)
from codes.data_utils import set_seed

# Seed before any other work runs in the notebook.
# NOTE(review): which RNGs `set_seed` covers is defined in codes.data_utils
# and is not visible from this notebook -- confirm there.
set_seed(42)

# Centrality indices table, one of the two inputs of the graph-building cell.
centrality_df = pd.read_excel(io="data/Centrality_indices.xlsx")

In [2]:
# Table of cities to keep (the file name suggests these are the non-flagged
# ones -- the screening itself is not part of this notebook).
clean_cities_df = pd.read_csv(filepath_or_buffer="data/clean_non_flagged_cities.csv")

# Distinct municipality codes, cast to int, used below as a whitelist.
clean_city_ids = set(
    clean_cities_df["Codmundv"]
    .astype(int)
    .unique()
)

In [3]:
import codes.extract_backbone

# Reload so that local edits to the extraction module take effect immediately.
importlib.reload(codes.extract_backbone)
from codes.extract_backbone import extract_backbone_from_files_brazil

# Build the mobility backbone restricted to the whitelisted cities.
# NOTE(review): the recorded output reports that an existing backbone CSV was
# found and loaded, so `alpha` may not be applied on such runs -- confirm in
# codes.extract_backbone whether the cache is keyed on these arguments.
backbone_df = extract_backbone_from_files_brazil(
    mobility_edges_path="data/Road_and_waterway_connections_database_2016.xlsx",
    centrality_path="data/Centrality_indices.xlsx",
    city_whitelist=clean_city_ids,
    alpha=0.01,
)

# Quick sanity check: edge count plus a preview of the key edge columns.
print(f"Backbone extracted with {len(backbone_df)} edges")
print(backbone_df.loc[:, ['source', 'target', 'weekly_flow', 'pij']].head())


[✓] Backbone file found at 'data/mobility_backbone_brazil.csv'. Loading it...
Backbone extracted with 4811 edges
    source   target  weekly_flow       pij
0  1503903  1506005         14.0  0.598992
1  1301902  1506005         12.5  0.406169
2  1301902  1506807         13.5  0.115288
3  2302107  2305100          0.0  1.000000
4  3520509  3552403          0.0  1.000000


In [4]:
import numpy as np
from codes.load_and_save_covid_data import load_and_save_covid_data

# Load the combined COVID table through the project helper.
covid_df = load_and_save_covid_data()

# Coverage summary: time span and row count.
print(f"Full date range: {covid_df['date'].min()} to {covid_df['date'].max()}")
print(f"Total records: {len(covid_df):,}")

# Count the negative entries in every numeric column and report only the
# columns that contain at least one.
numeric_is_negative = covid_df.select_dtypes(include=[np.number]) < 0
negative_counts = numeric_is_negative.sum()
print("📉 Negative values per column:")
print(negative_counts.loc[negative_counts > 0])


[✓] Found saved COVID dataset at data/covid_brazil_combined.csv. Loading it...
Full date range: 2020-02-25 00:00:00 to 2023-03-18 00:00:00
Total records: 5,830,987
📉 Negative values per column:
newDeaths                          16629
deaths                               186
newCases                           63752
totalCases                           278
deaths_per_100k_inhabitants          186
totalCases_per_100k_inhabitants      278
dtype: int64


In [5]:
from codes.preprocess_covid_brazil import filter_and_scale_covid_by_centrality

# Restrict the COVID table to the whitelisted cities and scale it; the
# temporal-window cell further down reads the `z_newCases` column of the result.
filtered_scaled_covid_df = filter_and_scale_covid_by_centrality(
    covid_df,
    city_whitelist=clean_city_ids,
)

[✓] Found saved preprocessed COVID data at 'data/filtered_scaled_covid.csv'. Loading...


In [6]:
from codes.graph_utils import build_pyg_graph_from_backbone

# Keep only the centrality rows for whitelisted cities. `.copy()` detaches the
# result from the original frame so later writes cannot alias it.
centrality_df = centrality_df.loc[
    centrality_df["Codmundv"].isin(clean_city_ids)
].copy()

# Turn the filtered node table and the backbone edge list into a
# PyTorch Geometric graph object and show its summary.
pyg_data = build_pyg_graph_from_backbone(centrality_df, backbone_df)
print(pyg_data)

[✓] Graph built with 1305 nodes and 4811 edges.
[✓] Converted to PyTorch Geometric format.
Data(edge_index=[2, 9622], name=[1305], weight=[9622], edge_attr=[9622, 1], num_nodes=1305)


In [7]:
import torch

# Select the compute device: CUDA when present, otherwise CPU.
# The earlier version of this cell mapped an available Apple MPS backend to
# the CPU as well, so MPS is never selected here either.
# NOTE(review): presumably MPS is avoided because of unsupported ops -- confirm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Always report the chosen device (previously only the MPS branch printed).
print(f"Using {device}")

Using cpu


In [8]:
from codes.data_utils import (
    generate_sliding_temporal_graph_data,
    prepare_temporal_graph_data_non_overlapping,
)

# Window sizes are named once so the summary printed at the end of this cell
# cannot drift from the values actually passed to the data builders.
static_sequence_length = 15
slide_input_window = 14
slide_output_window = 1

# Traditional (non-sliding) approach
X_train_static, X_test_static, Y_train_static, Y_test_static = prepare_temporal_graph_data_non_overlapping(
    filtered_scaled_covid_df,
    sequence_length=static_sequence_length,
    feature_column="z_newCases",
    device=device
)

# Sliding window approach
X_train_slide, X_test_slide, Y_train_slide, Y_test_slide = generate_sliding_temporal_graph_data(
    filtered_scaled_covid_df,
    input_window=slide_input_window,
    output_window=slide_output_window,
    feature_column="z_newCases",
    device=device
)

# The sliding targets carry a length-1 output-window axis at dim 1 (see the
# shapes logged by the helper); drop it so targets have one entry per node.
# squeeze(1) is a no-op unless that axis has size 1, i.e. this relies on
# slide_output_window == 1.
Y_train_slide = Y_train_slide.squeeze(1)
print(Y_train_slide.shape)

Y_test_slide = Y_test_slide.squeeze(1)
print(Y_test_slide.shape)

# Compare number of samples
total_static = len(X_train_static) + len(X_test_static)
total_slide = len(X_train_slide) + len(X_test_slide)

print("📊 Sample Count Comparison")
print(f"Static window ({static_sequence_length} input, 1 output): {total_static} samples")
print(f"Sliding window ({slide_input_window} input, {slide_output_window} output): {total_slide} samples")
print(f"⬆️ Gain: {total_slide - total_static} samples (+{100 * (total_slide - total_static) / total_static:.2f}%)")


[📉] (Non-overlapping) X shape: torch.Size([69, 15, 1305, 1]) | Y shape: torch.Size([69, 1305, 1])
[📉] Train: torch.Size([55, 15, 1305, 1]) | Test: torch.Size([14, 15, 1305, 1])
[✓] Sliding window: X torch.Size([1104, 14, 1305, 1]), Y torch.Size([1104, 1, 1305, 1])
[✓] Train: torch.Size([883, 14, 1305, 1]), Test: torch.Size([221, 14, 1305, 1])
torch.Size([883, 1305, 1])
torch.Size([221, 1305, 1])
📊 Sample Count Comparison
Static window (15 input, 1 output): 69 samples
Sliding window (10 input, 1 output): 1104 samples
⬆️ Gain: 1035 samples (+1500.00%)


In [9]:
import codes.models.custom_gcn_spatiotemporal_transformer
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Reload the model module so local edits to the architecture are picked up.
importlib.reload(codes.models.custom_gcn_spatiotemporal_transformer)

from codes.models.custom_gcn_spatiotemporal_transformer import SpatioTemporalFusionNet

# Node count comes from the graph built earlier.
N = pyg_data.num_nodes

# Model configuration:
#   in_channels    - time-series features per node (a single one here, e.g. newCases)
#   graph_feat_dim - static node features (e.g. population, centrality); 0 here
#   trans_hidden   - hidden width shared by the transformer and GCN parts
#   out_channels   - predicted features per node (e.g. next-day cases)
#   num_nodes      - number of graph nodes, used for graph embeddings
model = SpatioTemporalFusionNet(
    in_channels=1,
    graph_feat_dim=0,
    trans_hidden=64,
    out_channels=1,
    num_nodes=N,
).to(device)

# Mean-squared-error objective.
criterion = nn.MSELoss()

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Halve the learning rate every 5 scheduler steps (stepped once per epoch below).
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

In [10]:
from torch.utils.data import Dataset, DataLoader

class CovidGraphDataset(Dataset):
    """Pair input windows with their targets for use with a ``DataLoader``.

    Sample ``i`` is the tuple ``(X[i], Y[i])``; no copying or transformation
    is performed.

    Args:
        X: Indexable collection of input samples (supports ``len`` and ``[]``).
        Y: Indexable collection of targets, aligned with ``X`` by position.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of
            samples. Without this check a mismatch would only surface as a
            late index error or as silently ignored trailing targets, because
            the dataset length is taken from ``X`` alone.
    """

    def __init__(self, X, Y):
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same number of samples, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.Y[idx]

In [11]:
# Mini-batch size; tune it to the memory available on the selected device.
batch_size = 32

# Wrap the sliding-window tensors so each item is an (input, target) pair.
train_dataset = CovidGraphDataset(X_train_slide, Y_train_slide)
test_dataset = CovidGraphDataset(X_test_slide, Y_test_slide)

# Training batches are reshuffled each epoch; test batches keep their order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
)

In [None]:
from tqdm import tqdm

# Graph structure is constant across batches, so move it to the device once.
edge_index = pyg_data.edge_index.to(device)
# edge_attr is stored with a trailing singleton axis; flatten it to one
# weight per edge.
edge_weight = pyg_data.edge_attr.view(-1).to(device)

num_epochs = 10

for epoch in range(num_epochs):
    # Re-assert training mode every epoch so that an evaluation pass added
    # between epochs cannot leave the model in eval mode.
    model.train()

    total_loss = 0.0   # sum of per-sample losses seen this epoch
    num_samples = 0    # number of samples seen this epoch
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for batch_X, batch_Y in pbar:
        batch_X = batch_X.to(device)  # [B, T, N, F]
        batch_Y = batch_Y.to(device)  # [B, N, 1]

        optimizer.zero_grad()
        output = model(batch_X, edge_index, edge_weight)  # [B, N, 1]
        loss = criterion(output, batch_Y)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean, so weight it by the batch size:
        # otherwise the shorter final batch counts as much as a full one.
        batch_loss = loss.item()
        batch_len = batch_X.size(0)
        total_loss += batch_loss * batch_len
        num_samples += batch_len

        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix({"Batch Loss": f"{batch_loss:.4f}", "LR": f"{current_lr:.6f}"})

    # Per-sample average over the epoch; max() guards against an empty loader.
    avg_loss = total_loss / max(num_samples, 1)
    print(f"🧠 Epoch {epoch+1}/{num_epochs} — Avg Loss: {avg_loss:.4f}")

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

Epoch 1/10:   0%|          | 0/28 [00:00<?, ?it/s]