In [1]:
import torch
import numpy as np
import pandas as pd
from haversine import haversine, Unit
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler




In [2]:
# Partition identifier; it is interpolated into the train/test CSV file names.
partition = 100

# 1. Load Dataset

In [3]:
# Locate the train/test CSVs of the selected partition, then read both with
# the same ISO-8859-1 encoding.
trainpath = f'../../../data/top30groups/LongLatCombined/train1/train{partition}.csv'
testpath = f'../../../data/top30groups/LongLatCombined/test1/test{partition}.csv'
traindata, testdata = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (trainpath, testpath)
)

In [4]:
# Stack train and test rows (row-wise concat, the pandas default axis) so
# locations and labels can be indexed consistently across both splits.
# Original row labels are kept, not renumbered.
combined = pd.concat([traindata, testdata])

### Find unique locations and construct global graph

In [5]:
# Represent every event by its (longitude, latitude) pair; each distinct pair
# becomes one node of the global graph.
combined['location'] = list(zip(combined['longitude'], combined['latitude']))
unique_locations = combined['location'].drop_duplicates().reset_index(drop=True)

# Node id == position of the location inside `unique_locations`
location2id = {loc: idx for idx, loc in enumerate(unique_locations)}
combined['location_id'] = combined['location'].map(location2id)

# Encode group names as integer class labels
le = LabelEncoder()
combined['label'] = le.fit_transform(combined['gname'])

# Global node features: one (longitude, latitude) row per node
coords = np.array([list(loc) for loc in unique_locations])
print("Feature Matrix shape: ", coords.shape)

# Standardize features
scaler = StandardScaler()
x_global = scaler.fit_transform(coords)  # standardized features

# Two nodes are linked when their great-circle distance is at most this many km.
NEIGHBOR_RADIUS_KM = 1.0

# Build the undirected edge list (stored as both directed edges).
edges = []
# Locations are stored (lon, lat); the pairs are swapped to (lat, lon) before
# being handed to haversine().
coords_latlon = [(lat, lon) for lon, lat in unique_locations]
for i in range(len(coords_latlon)):
    for j in range(i + 1, len(coords_latlon)):
        if haversine(coords_latlon[i], coords_latlon[j], Unit.KILOMETERS) <= NEIGHBOR_RADIUS_KM:
            edges.append((i, j))
            edges.append((j, i))

# reshape(-1, 2) keeps the tensor two-dimensional even when `edges` is empty,
# so the result is always [2, num_edges] (here [2, 0]) instead of a 1-D [0]
# tensor that would break `edge_index[0]` / `edge_index[1]` downstream.
global_edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).T  # shape [2, num_edges]


Feature Matrix shape:  (1790, 2)


In [6]:
# Inspect the edge tensor: [2, num_edges], where each neighbor pair was added in both directions.
global_edge_index.shape

torch.Size([2, 242])

In [7]:
# De-duplicating the flattened edge list yields every node id that is an
# endpoint of at least one edge.
unique_nodes = global_edge_index.unique()
print("Nodes with at least one neighbor: ", unique_nodes.numel())

Nodes with at least one neighbor:  161


### Creating subgraphs for each node depending on its neighbors

In [8]:
def get_subgraph(center_id, edge_index, x_global):
    """Extract the 1-hop neighborhood of ``center_id`` as a locally indexed subgraph.

    Args:
        center_id: Global id of the center node, as a Python int or a 0-dim
            integer tensor.
        edge_index: Long tensor of shape [2, num_edges] holding the global
            (source, destination) pairs.
        x_global: Tensor of global node features, indexed by global node id.

    Returns:
        A tuple ``(sub_x, sub_edge_index, center_local_idx)``:
        ``sub_x`` holds the feature rows of the center and its neighbors
        (ordered by ascending global id), ``sub_edge_index`` is a [2, k] long
        tensor with every global edge whose two endpoints are in the subgraph,
        re-indexed to local ids, and ``center_local_idx`` is the int row of
        the center node inside ``sub_x``. An isolated center yields a single
        node with one self-loop ``(0, 0)``.
    """
    center = int(center_id)  # works for both plain ints and 0-dim tensors
    src, dst = edge_index[0], edge_index[1]

    # Center plus its direct neighbors; unique() also sorts, which is what
    # makes the searchsorted-based remapping below valid.
    neighbors = dst[src == center]
    node_ids = torch.cat([neighbors.new_tensor([center]), neighbors]).unique()

    # Keep the edges with both endpoints inside the subgraph (original order).
    keep = torch.isin(src, node_ids) & torch.isin(dst, node_ids)

    if keep.any():
        # Position in the sorted `node_ids` is the local id of a global id.
        sub_edge_index = torch.stack([
            torch.searchsorted(node_ids, src[keep]),
            torch.searchsorted(node_ids, dst[keep]),
        ])
        center_local_idx = int((node_ids == center).nonzero()[0, 0])
    else:
        # No edges at all: the center is the only node, give it a self-loop.
        sub_edge_index = torch.zeros((2, 1), dtype=torch.long, device=edge_index.device)
        center_local_idx = 0

    sub_x = x_global[node_ids]

    return sub_x, sub_edge_index, center_local_idx


In [9]:
from torch_geometric.data import Data

# Loop-invariant work is done once up front instead of once per row:
# the float feature tensor and the integer-encoded labels.
x_global_tensor = torch.tensor(x_global, dtype=torch.float)
train_labels = le.transform(traindata['gname'])

# One subgraph sample per training row, centered on the row's location node.
traindata_list = []
for lon, lat, label in zip(traindata['longitude'], traindata['latitude'], train_labels):
    center_id = location2id[(lon, lat)]

    x, edge_index, center_idx = get_subgraph(torch.tensor(center_id), global_edge_index, x_global_tensor)

    traindata_obj = Data(x=x, edge_index=edge_index, y=torch.tensor(label), center=center_idx)
    traindata_list.append(traindata_obj)


In [10]:
# Loop-invariant work is done once up front instead of once per row:
# the float feature tensor and the integer-encoded labels.
x_global_tensor = torch.tensor(x_global, dtype=torch.float)
test_labels = le.transform(testdata['gname'])

# One subgraph sample per test row, centered on the row's location node.
test_data_list = []
for lon, lat, label in zip(testdata['longitude'], testdata['latitude'], test_labels):
    loc = (lon, lat)

    # Skip if location not in mapping (just in case)
    if loc not in location2id:
        continue

    center_id = location2id[loc]

    x, edge_index, center_idx = get_subgraph(
        torch.tensor(center_id),
        global_edge_index,
        x_global_tensor
    )

    testdata_obj = Data(x=x, edge_index=edge_index, y=torch.tensor(label), center=center_idx)
    test_data_list.append(testdata_obj)

### GCN Model

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    """Two-layer GCN followed by a linear classifier over selected node embeddings.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of both graph-convolution layers.
        out_channels: Number of output classes.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.classifier = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch=None, center_indices=None):
        """Compute per-class log-probabilities for the requested nodes.

        Args:
            x: Node features, [num_nodes_in_subgraph, in_channels].
            edge_index: [2, num_edges] edges of this subgraph.
            batch: Accepted for interface compatibility; not used.
            center_indices: Indices of the nodes to classify (usually a
                single center node). When ``None``, every node is classified.

        Returns:
            Log-probabilities of shape [len(center_indices), out_channels],
            or [num_nodes_in_subgraph, out_channels] when ``center_indices``
            is ``None``.
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)

        # Indexing with None would insert a new leading axis and make the
        # softmax below normalize over nodes instead of classes, so only
        # select rows when indices were actually supplied.
        if center_indices is not None:
            x = x[center_indices]

        out = self.classifier(x)
        return F.log_softmax(out, dim=1)


### Training and Testing

In [12]:
import torch

# Report how many target classes the label encoder holds
num_classes = len(le.classes_)
print(num_classes)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Two input features per node, one output logit per class
model = GCN(in_channels=2, hidden_channels=64, out_channels=num_classes)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

30


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training graphs for validation; the fixed random_state
# keeps the split the same across runs.
train_set, val_set = train_test_split(
    traindata_list,
    test_size=0.2,
    random_state=42,
)


In [14]:
def train():
    """Run one pass over ``train_set`` with one optimizer step per sample.

    Returns:
        The mean per-sample loss of the pass.
    """
    model.train()
    running_loss = 0
    for sample in train_set:
        sample = sample.to(device)
        center = torch.tensor([sample.center], device=device)

        optimizer.zero_grad()
        log_probs = model(sample.x, sample.edge_index, center_indices=center)
        # y is a scalar label; add a batch axis to match the [1, C] output
        step_loss = criterion(log_probs, sample.y.unsqueeze(0))
        step_loss.backward()
        optimizer.step()

        running_loss += step_loss.item()
    return running_loss / len(train_set)

@torch.no_grad()
def evaluate(dataset):
    """Return the fraction of samples in ``dataset`` whose center node is classified correctly."""
    model.eval()
    hits = 0
    for sample in dataset:
        sample = sample.to(device)
        center = torch.tensor([sample.center], device=device)
        log_probs = model(sample.x, sample.edge_index, center_indices=center)
        predicted = log_probs.argmax(dim=1)
        hits += (predicted == sample.y).item()
    return hits / len(dataset)


In [15]:
def snapshot_state(module):
    """Return a copy of ``module.state_dict()`` whose tensors are detached clones."""
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


# state_dict() hands back references to the live parameter tensors, so the
# best weights must be cloned; otherwise the "best" snapshot silently follows
# every later optimizer step and restoring it after early stopping is a no-op.
best_model_state = snapshot_state(model)
best_val_acc = 0.0
patience = 30  # epochs without validation improvement before stopping
patience_counter = 0

for epoch in range(1, 201):
    loss = train()
    train_acc = evaluate(train_set)
    val_acc = evaluate(val_set)

    print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        best_model_state = snapshot_state(model)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

# Final test evaluation with the weights of the best validation epoch
model.load_state_dict(best_model_state)
test_acc = evaluate(test_data_list)
print(f"\nFinal Test Accuracy after early stopping: {test_acc:.4f}")


Epoch 001 | Loss: 1.7513 | Train Acc: 0.4661 | Val Acc: 0.4452
Epoch 002 | Loss: 1.2956 | Train Acc: 0.5976 | Val Acc: 0.5714
Epoch 003 | Loss: 1.1636 | Train Acc: 0.5976 | Val Acc: 0.5690
Epoch 004 | Loss: 1.1069 | Train Acc: 0.6113 | Val Acc: 0.5929
Epoch 005 | Loss: 1.0845 | Train Acc: 0.6119 | Val Acc: 0.5881
Epoch 006 | Loss: 1.0577 | Train Acc: 0.6339 | Val Acc: 0.6167
Epoch 007 | Loss: 1.0108 | Train Acc: 0.6113 | Val Acc: 0.5857
Epoch 008 | Loss: 1.0121 | Train Acc: 0.6458 | Val Acc: 0.6286
Epoch 009 | Loss: 0.9933 | Train Acc: 0.6321 | Val Acc: 0.6214
Epoch 010 | Loss: 0.9779 | Train Acc: 0.6274 | Val Acc: 0.6214
Epoch 011 | Loss: 0.9498 | Train Acc: 0.6292 | Val Acc: 0.6333
Epoch 012 | Loss: 0.9461 | Train Acc: 0.6327 | Val Acc: 0.6476
Epoch 013 | Loss: 0.9523 | Train Acc: 0.6190 | Val Acc: 0.6333
Epoch 014 | Loss: 0.9433 | Train Acc: 0.6476 | Val Acc: 0.6738
Epoch 015 | Loss: 0.9049 | Train Acc: 0.6631 | Val Acc: 0.6857
Epoch 016 | Loss: 0.8978 | Train Acc: 0.6536 | Val Acc: