In [1]:
import numpy as np
import pandas as pd
import os
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from build_graph_data import *


from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
path = '../../../data/top30groups/engineered_dfs/df_top30_300.csv'
data = pd.read_csv(path, encoding='ISO-8859-1')

In [3]:
# Filter dataset to only contain unique coordinates
print("Entries before dropping long/lat duplicates: ", len(data))
df_unique_geo = create_unique_geo_data(data)
print("Entries after dropping long/lat duplicates (#Nodes): ", len(df_unique_geo))


Entries before dropping long/lat duplicates:  9000
Entries after dropping long/lat duplicates (#Nodes):  4379


In [4]:
# creates train and test data, first 70% of each group is added to train and remaining 30% to test
def handle_leakage(df):
    train_frames = []
    test_frames = []

    #first 70% of each groups attacks to training set, remainin 30% to testing set
    for _, group_data in df.groupby('gname'):
        split_point = int(len(group_data) * 0.7)  # 70% for training
        train_frames.append(group_data.iloc[:split_point])
        test_frames.append(group_data.iloc[split_point:])           


    # Concatenate all the group-specific splits into final train and test DataFrames
    train_df = pd.concat(train_frames)
    test_df = pd.concat(test_frames)

    # Shuffle each DataFrame separately
    train_df = shuffle(train_df)
    test_df = shuffle(test_df)

    print(len(train_df))
    print(len(test_df))

    return train_df, test_df

In [5]:
train_df, test_df = handle_leakage(df_unique_geo)

3049
1330


In [6]:
# 1. Build coord_to_index from the full dataset (unique coordinate to node index mapping)
full_coords = df_unique_geo[['longitude', 'latitude']]
coord_to_index = {(row['longitude'], row['latitude']): i for i, row in full_coords.iterrows()}

# 2. Build the global graph from the full dataset (used for both train and test)
adj_matrix, feature_matrix, label_index = build_graph_data(df_unique_geo, coord_to_index)

train_nodes = []
train_labels = []
for _, row in train_df.iterrows():
    train_nodes.append(coord_to_index[(row['longitude'], row['latitude'])])
    train_labels.append(label_index[row['gname']])

test_nodes = []
test_labels = []
for _, row in test_df.iterrows():
    test_nodes.append(coord_to_index[(row['longitude'], row['latitude'])])
    test_labels.append(label_index[row['gname']])


Number of total nodes (unique coordinates): 4379
Number of unique labels in this set: 30


In [7]:
print(adj_matrix.shape)

(4379, 4379)


# A simple GCN

In [8]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class PyTorchGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x


In [9]:
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

def run_epoch(model, data, labels, mask, optimizer=None):
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    out = model(data.x, data.edge_index)
    loss_fn = torch.nn.CrossEntropyLoss()

    loss = loss_fn(out[mask], labels[mask])

    if is_training:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Accuracy
    pred = out[mask].argmax(dim=1)
    acc = (pred == labels[mask]).float().mean().item()
    return acc, loss.item()


In [10]:
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

# 1. Convert adjacency matrix to edge_index
A_coo = coo_matrix(adj_matrix)
edge_index = torch.tensor(np.vstack((A_coo.row, A_coo.col)), dtype=torch.long)

# Feature Matrix Tensor
coords = np.array(list(coord_to_index.keys()), dtype=np.float32)
feature_matrix = coords  # shape: (N, 2), with [longitude, latitude]
x = torch.tensor(feature_matrix, dtype=torch.float32)

num_nodes = x.shape[0]

# Label Tensor
y = torch.full((num_nodes,), -1, dtype=torch.long)  # -1 for unlabeled

# Create Masks, indicates which ndoes are used in training and testing
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Assign group labels for train, set nodes as part of training set
for _, row in train_df.iterrows():
    coord = (row['longitude'], row['latitude'])
    idx = coord_to_index[coord]
    y[idx] = label_index[row['gname']]
    train_mask[idx] = True

# Assign group labels for test, set nodes as part of testing set
for _, row in test_df.iterrows():
    coord = (row['longitude'], row['latitude'])
    idx = coord_to_index[coord]
    y[idx] = label_index[row['gname']]
    test_mask[idx] = True

# Create PyG Data object
data = Data(x=x, edge_index=edge_index)

# Initialize model and optimizer
model = PyTorchGCN(in_channels=x.shape[1], hidden_channels=16, num_classes=len(label_index))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
max_test_acc = 0
for epoch in range(50):
    train_acc, train_loss = run_epoch(model, data, y, train_mask, optimizer)
    test_acc, test_loss = run_epoch(model, data, y, test_mask)
    if test_acc > max_test_acc:
        max_test_acc = test_acc
        max_test_acc_epoch = epoch + 1
    print(f"Epoch {epoch+1:02d} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f} | Train Loss: {train_loss:.4f}")

print('-----------------------')
print(f'Best test acc in epoch {max_test_acc_epoch}, accuracy: {max_test_acc}')


Epoch 01 | Train Acc: 0.0272 | Test Acc: 0.0263 | Train Loss: 23.1428
Epoch 02 | Train Acc: 0.0269 | Test Acc: 0.0722 | Train Loss: 19.3205
Epoch 03 | Train Acc: 0.0610 | Test Acc: 0.0556 | Train Loss: 15.9042
Epoch 04 | Train Acc: 0.0489 | Test Acc: 0.0947 | Train Loss: 13.4023
Epoch 05 | Train Acc: 0.1315 | Test Acc: 0.0970 | Train Loss: 11.4181
Epoch 06 | Train Acc: 0.0879 | Test Acc: 0.1293 | Train Loss: 9.9545
Epoch 07 | Train Acc: 0.1279 | Test Acc: 0.1226 | Train Loss: 8.8048
Epoch 08 | Train Acc: 0.1230 | Test Acc: 0.1226 | Train Loss: 7.8461
Epoch 09 | Train Acc: 0.1220 | Test Acc: 0.0910 | Train Loss: 7.0990
Epoch 10 | Train Acc: 0.0872 | Test Acc: 0.1120 | Train Loss: 6.3451
Epoch 11 | Train Acc: 0.1332 | Test Acc: 0.0662 | Train Loss: 5.7556
Epoch 12 | Train Acc: 0.1089 | Test Acc: 0.0135 | Train Loss: 5.3496
Epoch 13 | Train Acc: 0.0538 | Test Acc: 0.0391 | Train Loss: 4.8766
Epoch 14 | Train Acc: 0.0630 | Test Acc: 0.0353 | Train Loss: 4.3737
Epoch 15 | Train Acc: 0.0794 