In [1]:
import numpy as np
import pandas as pd
import os
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from build_graph_data import *
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
trainpath = '../../../data/top30groups/traindata/train100.csv'
testpath = '../../../data/top30groups/testdata/test100.csv'
traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

In [3]:
combined = pd.concat([traindata, testdata], axis = 0)

In [4]:
combined.shape

(3000, 26)

In [5]:
data = combined

In [6]:
data.columns

Index(['Unnamed: 0', 'iyear', 'imonth', 'iday', 'extended', 'country',
       'region', 'provstate', 'city', 'latitude', 'longitude', 'specificity',
       'vicinity', 'multiple', 'success', 'suicide', 'attacktype1',
       'targtype1', 'target1', 'natlty1', 'individual', 'weaptype1', 'nkill',
       'property', 'ishostkid', 'gname'],
      dtype='object')

In [7]:
# Filter dataset to only contain unique coordinates
print("Entries before dropping long/lat duplicates: ", data.shape)

#data.sort_values(by=['longitude', 'latitude', 'attack_date'], inplace=True)

# Keep only relevant columns
data = data[['longitude', 'latitude', 'gname']]
# Drop duplicates based on location, keep the earliest attack
data['longlat'] = list(zip(data['longitude'], data['latitude']))
df_unique = data.drop_duplicates(subset=['longlat'], keep='first').reset_index(drop=True)

df_unique = df_unique.drop(columns=['longitude', 'latitude'])
print(df_unique.columns)
print("Entries after dropping long/lat duplicates (#Nodes): ", df_unique.shape)


Entries before dropping long/lat duplicates:  (3000, 26)
Index(['gname', 'longlat'], dtype='object')
Entries after dropping long/lat duplicates (#Nodes):  (1790, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['longlat'] = list(zip(data['longitude'], data['latitude']))


In [8]:
split_point = int(0.7 * len(df_unique))
train_df = df_unique[:split_point]
test_df = df_unique[split_point:]

In [9]:
import ast

def to_tuple_if_needed(val):
    if isinstance(val, str):
        return ast.literal_eval(val)
    return val  # already a tuple

df_unique['longlat'] = df_unique['longlat'].apply(to_tuple_if_needed)


In [10]:
# 1. Build coord_to_index from the full dataset (unique longlat to node index mapping)
full_coords = df_unique[['longlat']]  # Ensure this column contains (lon, lat) tuples

# Optional: convert 'longlat' to tuple if stored as string
# full_coords['longlat'] = full_coords['longlat'].apply(eval)

coord_to_index = {row['longlat']: i for i, row in full_coords.iterrows()}

# 2. Build the global graph from the full dataset (used for both train and test)
adj_matrix, feature_matrix, label_index = build_graph_data(df_unique, coord_to_index)

# 3. Build train node indices and labels
train_nodes = []
train_labels = []
for _, row in train_df.iterrows():
    train_nodes.append(coord_to_index[row['longlat']])
    train_labels.append(label_index[row['gname']])

# 4. Build test node indices and labels
test_nodes = []
test_labels = []
for _, row in test_df.iterrows():
    test_nodes.append(coord_to_index[row['longlat']])
    test_labels.append(label_index[row['gname']])


Number of total nodes (unique coordinates): 1790
Number of unique labels in this set: 30


In [11]:
print(adj_matrix.shape)

(1790, 1790)


# A simple GCN

In [12]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class PyTorchGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x




In [13]:
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

def run_epoch(model, data, labels, mask, optimizer=None):
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    out = model(data.x, data.edge_index)
    loss_fn = torch.nn.CrossEntropyLoss()

    loss = loss_fn(out[mask], labels[mask])

    if is_training:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Accuracy
    pred = out[mask].argmax(dim=1)
    acc = (pred == labels[mask]).float().mean().item()
    return acc, loss.item()


In [21]:
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

# 1. Convert adjacency matrix to edge_index
A_coo = coo_matrix(adj_matrix)
edge_index = torch.tensor(np.vstack((A_coo.row, A_coo.col)), dtype=torch.long)

# 2. Feature Matrix Tensor (coords are now the longlat tuples)
coords = np.array(list(coord_to_index.keys()), dtype=np.float32)  # shape (N, 2): [longitude, latitude]
x = torch.tensor(coords, dtype=torch.float32)

num_nodes = x.shape[0]

# 3. Label Tensor (-1 = unlabeled initially)
y = torch.full((num_nodes,), -1, dtype=torch.long)

# 4. Create train/test masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# 5. Assign labels and training mask
for _, row in train_df.iterrows():
    coord = row['longlat']
    idx = coord_to_index[coord]
    y[idx] = label_index[row['gname']]
    train_mask[idx] = True

# 6. Assign labels and testing mask
for _, row in test_df.iterrows():
    coord = row['longlat']
    idx = coord_to_index[coord]
    y[idx] = label_index[row['gname']]
    test_mask[idx] = True

# 7. Create PyG Data object
data = Data(x=x, edge_index=edge_index)

# 8. Initialize model and optimizer
model = PyTorchGCN(in_channels=x.shape[1], hidden_channels=16, num_classes=len(label_index))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 9. Training loop
max_test_acc = 0
for epoch in range(1500):
    train_acc, train_loss = run_epoch(model, data, y, train_mask, optimizer)
    test_acc, test_loss = run_epoch(model, data, y, test_mask)
    if test_acc > max_test_acc:
        max_test_acc = test_acc
        max_test_acc_epoch = epoch + 1
    print(f"Epoch {epoch+1:02d} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f} | Train Loss: {train_loss:.4f}")

print('-----------------------')
print(f'Best test acc in epoch {max_test_acc_epoch}, accuracy: {max_test_acc}')

Epoch 01 | Train Acc: 0.0008 | Test Acc: 0.0000 | Train Loss: 26.6874
Epoch 02 | Train Acc: 0.0008 | Test Acc: 0.0317 | Train Loss: 22.7650
Epoch 03 | Train Acc: 0.0295 | Test Acc: 0.0819 | Train Loss: 19.1601
Epoch 04 | Train Acc: 0.0750 | Test Acc: 0.0819 | Train Loss: 16.0071
Epoch 05 | Train Acc: 0.0750 | Test Acc: 0.0782 | Train Loss: 13.1551
Epoch 06 | Train Acc: 0.0702 | Test Acc: 0.0782 | Train Loss: 11.3321
Epoch 07 | Train Acc: 0.0702 | Test Acc: 0.0838 | Train Loss: 9.7463
Epoch 08 | Train Acc: 0.0958 | Test Acc: 0.0670 | Train Loss: 8.6427
Epoch 09 | Train Acc: 0.0918 | Test Acc: 0.0521 | Train Loss: 7.7066
Epoch 10 | Train Acc: 0.0662 | Test Acc: 0.0801 | Train Loss: 6.8069
Epoch 11 | Train Acc: 0.0926 | Test Acc: 0.0615 | Train Loss: 5.9608
Epoch 12 | Train Acc: 0.0710 | Test Acc: 0.0205 | Train Loss: 5.1285
Epoch 13 | Train Acc: 0.0303 | Test Acc: 0.0242 | Train Loss: 4.5946
Epoch 14 | Train Acc: 0.0319 | Test Acc: 0.1099 | Train Loss: 4.1539
Epoch 15 | Train Acc: 0.1221