# Cora Graph Neural Networks Project

Author: Ankush Joshi   
Date: 30 August, 2025

The goal of this project is to predict the subject category of a paper using both its own words and the structure of the citation network. The core idea is that a paper about Neural Networks is likely to cite other papers about Neural Networks. Because a GNN can exploit these connections between papers, it has a powerful advantage over models that treat each paper in isolation.

---

## Importing Libraries

In [None]:
import torch 
import torch.nn.functional as F 
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
import numpy as np

## Loading in Cora Dataset

In [16]:
# Announce the load, then build the Planetoid dataset object for Cora,
# using the current directory as its root folder.
print("Preparing to load Cora Dataset")
dataset = Planetoid(name='Cora', root='.')
# Index 0 picks the first graph object held by the dataset.
data = dataset[0]

Preparing to load Cora Dataset


## Gathering Information about the Dataset  
Number of nodes, edges, features, etc.

In [33]:
# Print a summary of the dataset: graph size, feature and class counts, and
# how many nodes each of the train / validation / test masks selects.
print("\nCora Dataset")
print("----------------------------------------------")
print(f"Dataset: {dataset.name}")
print("==============================================")
print(f"Number of nodes (papers): {data.num_nodes}")
# NOTE(review): if the edge list stores every undirected link in both
# directions, this count is twice the number of citation links — confirm.
print(f"Number of edges (citations): {data.num_edges}")
print(f"Number of features per node (words): {dataset.num_node_features}")
print(f"Number of classes (subjects): {dataset.num_classes}")
# Summing a mask counts the nodes it selects.
print(f"Number of Training nodes: {data.train_mask.sum()}")
print(f"Number of Validation nodes: {data.val_mask.sum()}")
# Label spelling fixed ("Numbner" -> "Number") and capitalisation aligned
# with the two lines above.
print(f"Number of Test nodes: {data.test_mask.sum()}")
print("==============================================")



Cora Dataset
----------------------------------------------
Dataset: Cora
Number of nodes (papers): 2708
Number of edges (citations): 10556
Number of features per node (words): 1433
Number of classes (subjects): 7
Number of Training nodes: 140
Number of Validation nodes: 500
Numbner of Test Nodes: 1000


## Building the Model

In [35]:
# This class will define our Graph Convolutional Network (GCN)
class GCN(torch.nn.Module):
    """Two-layer Graph Convolutional Network for node classification.

    The first ``GCNConv`` layer maps the input node features to a hidden
    embedding; the second maps that embedding to one score per class.

    Args:
        num_node_features: Size of each node's input feature vector.
        num_classes: Number of output classes (width of the final layer).
        hidden_channels: Width of the hidden embedding. Defaults to 16,
            the value that used to be hard-coded.
        dropout: Dropout probability applied between the two layers while
            the module is in training mode. Defaults to 0.5, which is the
            probability ``F.dropout`` uses when none is passed, so existing
            two-argument callers get the same behaviour as before.
    """

    def __init__(self, num_node_features, num_classes, hidden_channels=16, dropout=0.5):
        super().__init__()
        # Kept on the instance so forward() can pass it to F.dropout
        self.dropout_p = dropout
        # GNN layer that takes node features and learns the hidden embedding
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        # Second GNN layer: hidden embedding -> one output per class
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Run both graph convolutions and return per-class log-probabilities.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity); both are handed to each conv layer.

        Returns:
            The result of ``log_softmax`` over dimension 1 of the second
            layer's output.
        """
        x, edge_index = data.x, data.edge_index

        # First layer followed by a non-linear activation
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout for regularisation; only active when self.training is True
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        # Final layer produces the class scores
        x = self.conv2(x, edge_index)

        # Log-softmax so the output pairs with F.nll_loss
        return F.log_softmax(x, dim=1)
    

## Training and Evaluating the Model

In [37]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [41]:
# Training configuration, kept in one place so the loop bound and the
# progress message cannot drift apart.
SEED = 42
NUM_EPOCHS = 200
LOG_EVERY = 20

# Seed PyTorch's random number generator before the model is built so the
# weight initialisation and the dropout masks repeat from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility on CUDA matters.
torch.manual_seed(SEED)

# Moving data and model to the device selected earlier
model = GCN(dataset.num_node_features, dataset.num_classes).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()  # training mode, so the dropout inside GCN.forward is active

print("\nTraining Model")
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()       # Clearing gradients from the previous step
    out = model(data)           # Single forward pass over the whole graph
    # Loss is computed on the training nodes only
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()             # Derive gradients
    optimizer.step()            # Update model parameters

    if (epoch + 1) % LOG_EVERY == 0:
        print(f'Epoch: [{epoch+1}/{NUM_EPOCHS}, Loss: {loss.item():.4f}]')

print("Training Complete")


Training Model
Epoch: [20/200, Loss: 0.2356]
Epoch: [40/200, Loss: 0.0669]
Epoch: [60/200, Loss: 0.0438]
Epoch: [80/200, Loss: 0.0394]
Epoch: [100/200, Loss: 0.0291]
Epoch: [120/200, Loss: 0.0229]
Epoch: [140/200, Loss: 0.0329]
Epoch: [160/200, Loss: 0.0446]
Epoch: [180/200, Loss: 0.0181]
Epoch: [200/200, Loss: 0.0211]
Training Complete


In [None]:
# Setting model to evaluation mode (turns off the dropout in GCN.forward)
model.eval()

# Making predictions on the entire graph. Gradients are not needed for
# inference, so the forward pass runs under no_grad to avoid building an
# autograd graph.
with torch.no_grad():
    pred = model(data).argmax(dim=1)

In [None]:
# Accuracy = share of test nodes whose predicted class equals the true label
test_hits = pred[data.test_mask].eq(data.y[data.test_mask])
correct = test_hits.sum()
num_test_nodes = int(data.test_mask.sum())
acc = int(correct) / num_test_nodes
print(f"\nFinal Accuracy on Test set: {acc:.4f}")


Final Accuracy on Test set: 0.8010


---

## Testing Model Predictions


In [52]:
# Class names for the Cora labels, indexed by integer class id.
# NOTE(review): this ordering is assumed to match the label encoding of the
# loaded dataset — confirm against the dataset documentation.
class_names = [
    "Theory", "Reinforcement_Learning", "Genetic_Algorithms",
    "Neural_Networks", "Probabilistic_Methods", "Case_Based", "Rule_Learning"
]

# How many test nodes to display
NUM_EXAMPLES = 10

# Positions of the nodes selected by the test mask (moved to the CPU for NumPy)
test_node_indices = np.where(data.test_mask.cpu().numpy())[0]

# Slicing, rather than indexing positions 0..9, avoids an IndexError when the
# mask selects fewer than NUM_EXAMPLES nodes.
for node_index in test_node_indices[:NUM_EXAMPLES]:
    predicted_class_index = pred[node_index].item()
    true_class_index = data.y[node_index].item()

    predicted_class_name = class_names[predicted_class_index]
    true_class_name = class_names[true_class_index]
    is_correct = "Correct" if predicted_class_index == true_class_index else "Incorrect"

    print(f"Node {node_index}:")
    print(f"\tPredicted Class: {predicted_class_name}")
    print(f"\tTrue Class:      {true_class_name}")
    print(f"\tResult:          {is_correct}\n")

NameError: name 'np' is not defined