In [1]:
import numpy as np
import pandas as pd
import os
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from build_graph_data import *
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
partition = 200

In [3]:
trainpath = f'../../../data/top30groups/noGeographic/train1/train{partition}.csv'
testpath = f'../../../data/top30groups/noGeographic/test1/test{partition}.csv'
traindata = pd.read_csv(trainpath, encoding='ISO-8859-1')
testdata = pd.read_csv(testpath, encoding='ISO-8859-1')

In [4]:
combined = pd.concat([traindata, testdata], axis = 0)
data = combined

In [5]:
import itertools

def create_node_dataframe(data, node_features, label_column='gname'):
    relevant_data = data[node_features + [label_column]].copy()
    relevant_data['combination'] = list(zip(*(relevant_data[feat] for feat in node_features)))
    df_unique = relevant_data.drop_duplicates(subset=['combination'], keep='first').reset_index(drop=True)
    return df_unique[['combination', label_column]]

# All possible features to try
all_features = ['attacktype1', 'target1', 'weaptype1', 'nkill']
label_column = 'gname'

min_nodes = 1000
combination_dfs = {}
kept_combinations = []  # To store info about kept subsets

for r in range(2, len(all_features) + 1):
    for feature_subset in itertools.combinations(all_features, r):
        feature_subset = list(feature_subset)
        df_filtered = create_node_dataframe(data, feature_subset, label_column=label_column)

        n_nodes = df_filtered.shape[0]
        key = ", ".join(feature_subset)

        if n_nodes >= min_nodes:
            combination_dfs[key] = df_filtered
            kept_combinations.append((key, feature_subset, n_nodes))
            print(f"Subset: {key}, Nodes: {n_nodes}")
        else:
            print(f"Subset: {key} had too few unique combinations: {n_nodes}, discarding.")

# After the loop, print a summary of kept combinations
print("\n--- Kept Combinations ---")
for key, features, n_nodes in kept_combinations:
    print(f"Key: {key} | Features: {features} | Nodes: {n_nodes}")



Subset: attacktype1, target1, Nodes: 3658
Subset: attacktype1, weaptype1 had too few unique combinations: 35, discarding.
Subset: attacktype1, nkill had too few unique combinations: 195, discarding.
Subset: target1, weaptype1, Nodes: 3614
Subset: target1, nkill, Nodes: 4077
Subset: weaptype1, nkill had too few unique combinations: 162, discarding.
Subset: attacktype1, target1, weaptype1, Nodes: 3730
Subset: attacktype1, target1, nkill, Nodes: 4322
Subset: attacktype1, weaptype1, nkill had too few unique combinations: 286, discarding.
Subset: target1, weaptype1, nkill, Nodes: 4289
Subset: attacktype1, target1, weaptype1, nkill, Nodes: 4392

--- Kept Combinations ---
Key: attacktype1, target1 | Features: ['attacktype1', 'target1'] | Nodes: 3658
Key: target1, weaptype1 | Features: ['target1', 'weaptype1'] | Nodes: 3614
Key: target1, nkill | Features: ['target1', 'nkill'] | Nodes: 4077
Key: attacktype1, target1, weaptype1 | Features: ['attacktype1', 'target1', 'weaptype1'] | Nodes: 3730
Ke

In [6]:
def to_tuple_if_needed(val):
    if isinstance(val, str):
        return ast.literal_eval(val)
    return val


In [7]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class PyTorchGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x




In [8]:
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

def run_epoch(model, data, labels, mask, optimizer=None):
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    out = model(data.x, data.edge_index)
    loss_fn = torch.nn.CrossEntropyLoss()

    loss = loss_fn(out[mask], labels[mask])

    if is_training:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Accuracy
    pred = out[mask].argmax(dim=1)
    acc = (pred == labels[mask]).float().mean().item()
    return acc, loss.item()


In [9]:
import os 
if not os.path.isdir(f"Results"):
    os.mkdir(f"Results")

In [10]:
import os
import ast
from torch_geometric.data import Data
from scipy.sparse import coo_matrix

# Ensure results directory exists
os.makedirs("Results", exist_ok=True)

output_path = f"Results/gcn_{partition}.txt"

# Initialize results and clear output file
results = {}
with open(output_path, "w") as f:
    f.write("Subset\tTest Accuracy\tEpoch\tFeatures Used\n")

# Loop through each feature combination
for key, df_unique in combination_dfs.items():
    print(f"\n--- Processing feature subset: {key} ---")

    # Ensure tuple format
    df_unique = df_unique.copy()
    df_unique['combination'] = df_unique['combination'].apply(to_tuple_if_needed)

    # Build coord_to_index
    coord_to_index = {row['combination']: i for i, row in df_unique.iterrows()}

    # Build graph data
    adj_matrix, feature_matrix, label_index = build_graph_data(df_unique, coord_to_index)

    # Get original data mapped to current node set
    features_used = key.split(', ')
    full_data = data[features_used + ['gname']].copy()
    full_data['combination'] = list(zip(*(full_data[feat] for feat in features_used)))
    full_data['combination'] = full_data['combination'].apply(to_tuple_if_needed)
    full_data = full_data[full_data['combination'].isin(coord_to_index)]

    # Split into train/test
    split_point = int(0.7 * len(full_data))
    train_df = full_data[:split_point]
    test_df = full_data[split_point:]

    # Feature matrix
    coords = np.array(list(coord_to_index.keys()), dtype=np.float32)
    x = torch.tensor(coords, dtype=torch.float32)

    # Label and mask tensors
    num_nodes = x.shape[0]
    y = torch.full((num_nodes,), -1, dtype=torch.long)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    for _, row in train_df.iterrows():
        idx = coord_to_index[row['combination']]
        y[idx] = label_index[row['gname']]
        train_mask[idx] = True

    for _, row in test_df.iterrows():
        idx = coord_to_index[row['combination']]
        y[idx] = label_index[row['gname']]
        test_mask[idx] = True

    # Graph edges
    A_coo = coo_matrix(adj_matrix)
    edge_index = torch.tensor(np.vstack((A_coo.row, A_coo.col)), dtype=torch.long)

    # Build graph object
    data_obj = Data(x=x, edge_index=edge_index)

    # Model and optimizer
    model = PyTorchGCN(in_channels=x.shape[1], hidden_channels=16, num_classes=len(label_index))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training loop
    max_test_acc = 0
    max_test_epoch = -1
    for epoch in range(2000):
        train_acc, train_loss = run_epoch(model, data_obj, y, train_mask, optimizer)
        test_acc, test_loss = run_epoch(model, data_obj, y, test_mask)
        if test_acc > max_test_acc:
            max_test_acc = test_acc
            max_test_epoch = epoch + 1
        if (epoch + 1) % 100 == 0 or epoch == 1999:
            print(f"Epoch {epoch+1} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f} | "
                  f"Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")

    print(f"Best test accuracy for {key}: {max_test_acc:.4f} at epoch {max_test_epoch}")
    results[key] = max_test_acc

    # Append result to summary file
    with open(output_path, "a") as f:
        f.write(f"Subset: {key}\n")
        f.write(f"Best test Acc: {max_test_acc:.4f}\n")
        f.write(f"Best epoch: {max_test_epoch}\n")
        f.write(f"{'-' * 37}\n\n")




--- Processing feature subset: attacktype1, target1 ---
Number of total nodes (unique coordinates): 3658
Number of unique labels in this set: 30
Epoch 100 | Train Acc: 0.0529 | Test Acc: 0.0473 | Train Loss: 8.9092 | Test Loss: 9.5715
Epoch 200 | Train Acc: 0.0795 | Test Acc: 0.0669 | Train Loss: 3.1478 | Test Loss: 3.3488
Epoch 300 | Train Acc: 0.1426 | Test Acc: 0.0914 | Train Loss: 2.9594 | Test Loss: 3.1003
Epoch 400 | Train Acc: 0.1715 | Test Acc: 0.1265 | Train Loss: 2.8847 | Test Loss: 2.9973
Epoch 500 | Train Acc: 0.1970 | Test Acc: 0.1502 | Train Loss: 2.8723 | Test Loss: 3.1698
Epoch 600 | Train Acc: 0.1250 | Test Acc: 0.1322 | Train Loss: 5.6485 | Test Loss: 5.5387
Epoch 700 | Train Acc: 0.2424 | Test Acc: 0.1894 | Train Loss: 2.7532 | Test Loss: 2.8698
Epoch 800 | Train Acc: 0.2454 | Test Acc: 0.1698 | Train Loss: 2.7161 | Test Loss: 2.8374
Epoch 900 | Train Acc: 0.2694 | Test Acc: 0.2024 | Train Loss: 2.6830 | Test Loss: 2.8073
Epoch 1000 | Train Acc: 0.2720 | Test Acc: 0