In [1]:
import torch
import numpy as np
import pandas as pd
from haversine import haversine, Unit
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler




In [2]:
partition = 100

# 1. Load Dataset

In [3]:
trainpath = f'../../../data/top30groups/LongLatCombined/train1/train{partition}.csv'
testpath = f'../../../data/top30groups/LongLatCombined/test1/test{partition}.csv'
traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

In [4]:
combined = pd.concat([traindata, testdata], axis = 0)

In [5]:
# Assume 'combined' has 3000 attacks with 'longitude', 'latitude', 'gname'
combined['location'] = list(zip(combined['longitude'], combined['latitude']))
unique_locations = combined['location'].drop_duplicates().reset_index(drop=True)

location2id = {loc: idx for idx, loc in enumerate(unique_locations)}
combined['location_id'] = combined['location'].map(location2id)

# Encode labels
le = LabelEncoder()
combined['label'] = le.fit_transform(combined['gname'])

# Get global node features
coords = np.array([list(loc) for loc in unique_locations])  # [1790, 2]
scaler = StandardScaler()
x_global = scaler.fit_transform(coords)  # standardized features

# Build global edge list using 1km Haversine
edges = []
coords_latlon = [(lat, lon) for lon, lat in unique_locations]
for i in range(len(coords_latlon)):
    for j in range(i + 1, len(coords_latlon)):
        if haversine(coords_latlon[i], coords_latlon[j], Unit.KILOMETERS) <= 1.0:
            edges.append((i, j))
            edges.append((j, i))

global_edge_index = torch.tensor(edges, dtype=torch.long).T  # shape [2, num_edges]


In [6]:
def get_subgraph(loc_id, x_global, edge_index_global):
    # Get neighbors of loc_id
    neighbors = edge_index_global[1][edge_index_global[0] == loc_id].tolist()
    if loc_id not in neighbors:
        neighbors.append(loc_id)

    neighbors = list(set(neighbors))
    idx_map = {old: new for new, old in enumerate(neighbors)}
    sub_edge_list = []

    for i, j in zip(*edge_index_global.numpy()):
        if i in neighbors and j in neighbors:
            sub_edge_list.append((idx_map[i], idx_map[j]))

    if not sub_edge_list:
        sub_edge_list = [(0, 0)]  # self-loop on center node


    edge_index = torch.tensor(sub_edge_list, dtype=torch.long).T
    x = torch.tensor(x_global[neighbors], dtype=torch.float)
    center_idx = idx_map[loc_id]

    return x, edge_index, center_idx


In [7]:
data_list = []

for _, row in combined.iterrows():
    loc_id = row['location_id']
    label = row['label']
    x, edge_index, center_idx = get_subgraph(loc_id, x_global, global_edge_index)
    data = Data(x=x, edge_index=edge_index, y=torch.tensor(label), center=center_idx)
    data_list.append(data)


In [8]:
# After you build `data_list`, we now split based on original splits

# Step 1: Recreate location tuple for matching
traindata['location'] = list(zip(traindata['longitude'], traindata['latitude']))
testdata['location'] = list(zip(testdata['longitude'], testdata['latitude']))

# Step 2: Build a set of locations in train/test
train_locs = set(traindata['location'])
test_locs = set(testdata['location'])

# Step 3: Split data_list based on attack's location
train_data = []
test_data = []

for data, row in zip(data_list, combined.itertuples()):
    loc = row.location  # (longitude, latitude)
    if loc in train_locs:
        train_data.append(data)
    elif loc in test_locs:
        test_data.append(data)
    # else: skip (e.g., malformed entry)

# Step 4: Create DataLoaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


In [9]:
import torch.nn as nn

class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.classifier = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch, center_indices):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)

        # Extract central node embeddings per graph
        out = x[center_indices]
        return F.log_softmax(self.classifier(out), dim=1)


In [18]:
traindata[['latitude', 'longitude', 'gname']].iloc[0]

latitude            -11.967368
longitude           -76.978462
gname        Shining Path (SL)
Name: 0, dtype: object

In [20]:
le.inverse_transform([25])

array(['Shining Path (SL)'], dtype=object)

In [16]:
print(traindata.iloc[0]['longitude'], traindata.iloc[0]['latitude'], traindata.iloc[0]['gname'])
print(train_data[0])


-76.978462 -11.967368 Shining Path (SL)
Data(x=[1, 2], edge_index=[2, 1], y=25, center=0)


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(in_channels=2, hidden_channels=32, out_channels=len(le.classes_)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

def train():
    model.train()
    total_loss = 0
    for batch in train_loader:
        print("Batch.x shape:", batch.x.shape)                  # [total_nodes_in_batch, num_features]
        print("Batch.edge_index shape:", batch.edge_index.shape)  # [2, num_edges]
        print("Batch.y shape:", batch.y.shape)                  # [batch_size]
        print("Batch.batch shape:", batch.batch.shape)          # [total_nodes_in_batch]
        print("Center indices:", batch.center)                  # [batch_size]
        print()
        print("Center node features:")
        print(batch.x[batch.center])
        print()
        for i in range(len(batch.y)):
            graph_idx = (batch.batch == i).nonzero(as_tuple=True)[0]
            assert batch.center[i].item() in graph_idx, f"Center index {batch.center[i]} not in graph {i}"
        break  # only one batch to inspect
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch, batch.center)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

@torch.no_grad()
def evaluate(loader):
    model.eval()
    correct = 0
    total = 0
    for batch in loader:
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.batch, batch.center)
        pred = out.argmax(dim=1)
        correct += (pred == batch.y).sum().item()
        total += batch.y.size(0)
    return correct / total


In [15]:
for epoch in range(1, 2):
    loss = train()
    train_acc = evaluate(train_loader)
    test_acc = evaluate(test_loader)
    print(f"Epoch {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")


Batch.x shape: torch.Size([32, 2])
Batch.edge_index shape: torch.Size([2, 32])
Batch.y shape: torch.Size([32])
Batch.batch shape: torch.Size([32])
Center indices: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

Center node features:
tensor([[0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],
        [0.1770, 0.8962],

AssertionError: Center index 0 not in graph 1