In [1]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch.optim import Adam
import scipy.sparse as sp
import numpy as np
import json
from sklearn.model_selection import KFold
import copy
import matplotlib.pyplot as plt


In [2]:
def load_data(data_dir='./data_2024'):
    """Load the graph dataset from disk and package it for PyTorch Geometric.

    Reads ``adj.npz``, ``features.npy``, ``labels.npy`` and ``splits.json``
    from ``data_dir``, converts the sparse adjacency matrix to an edge list,
    row-normalises the features and wraps everything in tensors.

    Args:
        data_dir: Directory containing the four dataset files. Defaults to
            ``'./data_2024'``, the location used when called without arguments.

    Returns:
        A tuple ``(data, idx_train, idx_test)``:
          * ``data`` -- ``Data`` object with ``x`` (float, row-normalised
            features), ``edge_index`` (long, rows/cols of the non-zero
            adjacency entries) and ``y`` (long, labels exactly as stored in
            ``labels.npy``).
          * ``idx_train`` / ``idx_test`` -- long tensors built from the
            ``'idx_train'`` / ``'idx_test'`` entries of ``splits.json``.
    """
    adj = sp.load_npz(f'{data_dir}/adj.npz')
    features = np.load(f'{data_dir}/features.npy')
    labels = np.load(f'{data_dir}/labels.npy')
    with open(f'{data_dir}/splits.json', 'r') as file:
        splits = json.load(file)
    idx_train, idx_test = splits['idx_train'], splits['idx_test']

    # convert adjacency matrix to edge index (row 0 = sources, row 1 = targets)
    adj = adj.tocoo()
    edge_index = np.vstack((adj.row, adj.col))

    # normalize features so every row sums to 1; rows that sum to zero are
    # divided by 1 instead, which keeps them all-zero rather than producing NaN
    row_sums = features.sum(1, keepdims=True)
    row_sums[row_sums == 0] = 1
    features = features / row_sums

    # convert to tensors
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    features = torch.tensor(features, dtype=torch.float)
    labels = torch.tensor(labels, dtype=torch.long)

    # create torch geometric data object
    data = Data(x=features, edge_index=edge_index, y=labels)

    return data, torch.tensor(idx_train, dtype=torch.long), torch.tensor(idx_test, dtype=torch.long)

# Load the dataset once; `data`, `idx_train` and `idx_test` are the inputs to the
# training and prediction cells that follow.
data, idx_train, idx_test = load_data()


In [3]:
#TODO: modify GCN architecture for better performance. Try derivatives of published architectures?
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The network applies ``GCNConv -> ReLU -> dropout -> GCNConv`` and returns
    per-node log-probabilities.
    """

    def __init__(self, num_features, num_classes, hidden_channels=16, dropout=0.5):
        """Build the two convolution layers.

        Args:
            num_features: Size of each input node feature vector.
            num_classes: Number of output classes (width of the last layer).
            hidden_channels: Width of the hidden layer. Defaults to 16.
            dropout: Dropout probability applied after the first layer while
                training. Defaults to 0.5, the default of ``F.dropout``.
        """
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return log-softmax class scores for every node in ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with one row per node and one column per class, holding
            log-probabilities (``log_softmax`` over ``dim=1``).
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode; model.eval() disables it.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [4]:
# ---- K-fold cross-validated training on the labelled nodes ----
# Tunable settings collected in one place.
SEED = 42
N_SPLITS = 5
N_EPOCHS = 200
LEARNING_RATE = 0.01
NUM_CLASSES = 7

# Seed torch's RNG so weight initialisation and dropout repeat on a fresh kernel run.
torch.manual_seed(SEED)

# Sizes come from the loaded feature matrix instead of hard-coded numbers.
num_nodes, num_features = data.x.shape

# full set of labels with a default value (-1); only the idx_train nodes get a real label
full_labels = torch.full((num_nodes,), -1, dtype=torch.long)
full_labels[idx_train] = data.y

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

avg_training_losses = [] # lists for saving results
avg_validation_accuracies = []

best_validation_accuracy = 0 # initial objects
best_model_state = None

# folds for training (do NOT apply for testing)
for fold, (train_index, val_index) in enumerate(kf.split(idx_train.numpy()), start=1):
    model = GCN(num_features=num_features, num_classes=NUM_CLASSES)
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    # Node ids and targets of this fold do not change between epochs, so they
    # are computed once here. Nodes still carrying the -1 placeholder are dropped.
    fold_train_idx = idx_train[train_index]
    fold_train_idx = fold_train_idx[full_labels[fold_train_idx] != -1]
    fold_val_idx = idx_train[val_index]
    fold_val_idx = fold_val_idx[full_labels[fold_val_idx] != -1]
    train_targets = full_labels[fold_train_idx]
    val_targets = full_labels[fold_val_idx]

    training_losses = [] # more lists for storage
    validation_accuracies = []

    for epoch in range(N_EPOCHS):
        # --- one full-batch optimisation step on the training part ---
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[fold_train_idx], train_targets)
        loss.backward()
        optimizer.step()

        training_losses.append(loss.item())

        # --- validation accuracy on the held-out part of this fold ---
        model.eval()
        with torch.no_grad(): # no gradient on validation
            pred = model(data).argmax(dim=1)
            correct = (pred[fold_val_idx] == val_targets).sum().item()
            accuracy = correct / fold_val_idx.numel()
            validation_accuracies.append(accuracy)

            # Always record the first evaluated state so best_model_state can
            # never still be None when it is loaded after the loop.
            if best_model_state is None or accuracy > best_validation_accuracy:
                best_validation_accuracy = accuracy
                best_model_state = copy.deepcopy(model.state_dict())

        if epoch % 10 == 0:
            print(f'Fold {fold}, Epoch {epoch}: Loss {loss.item()}, Validation Accuracy: {accuracy}')
    #TODO: add early stopping?
    # Per-fold summaries are means over all epochs of that fold.
    avg_training_losses.append(np.mean(training_losses))
    avg_validation_accuracies.append(np.mean(validation_accuracies))

# load the best model state
# NOTE(review): the best state is chosen across folds, i.e. accuracies measured
# on different validation subsets are compared with each other.
model.load_state_dict(best_model_state)

Fold 1, Epoch 0: Loss 1.9459484815597534, Validation Accuracy: 0.28
Fold 1, Epoch 10: Loss 1.7955107688903809, Validation Accuracy: 0.28
Fold 1, Epoch 20: Loss 1.6562772989273071, Validation Accuracy: 0.28
Fold 1, Epoch 30: Loss 1.4853079319000244, Validation Accuracy: 0.49
Fold 1, Epoch 40: Loss 1.2850769758224487, Validation Accuracy: 0.63
Fold 1, Epoch 50: Loss 1.0980970859527588, Validation Accuracy: 0.71
Fold 1, Epoch 60: Loss 0.8571550846099854, Validation Accuracy: 0.79
Fold 1, Epoch 70: Loss 0.7523890733718872, Validation Accuracy: 0.78
Fold 1, Epoch 80: Loss 0.6127892732620239, Validation Accuracy: 0.82
Fold 1, Epoch 90: Loss 0.5103626847267151, Validation Accuracy: 0.84
Fold 1, Epoch 100: Loss 0.4417681396007538, Validation Accuracy: 0.84
Fold 1, Epoch 110: Loss 0.3694544732570648, Validation Accuracy: 0.84
Fold 1, Epoch 120: Loss 0.3411328196525574, Validation Accuracy: 0.84
Fold 1, Epoch 130: Loss 0.2920863926410675, Validation Accuracy: 0.84
Fold 1, Epoch 140: Loss 0.28077

<All keys matched successfully>

In [5]:
# Inference on the real test nodes: eval mode turns dropout off, and the forward
# pass runs without gradient tracking.
model.eval()
with torch.no_grad():
    out = model(data)
# Highest-scoring class per node, then keep only the test nodes as a NumPy array.
pred = out.max(dim=1).indices
test_labels_pred = pred[idx_test].numpy()

In [6]:
# Save the predictions to disk, one predicted label per line.
submission_file_path = 'submission.txt'
with open(submission_file_path, 'w') as handle:
    handle.writelines(f'{label}\n' for label in test_labels_pred)