In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from torch_geometric.nn import GCNConv
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score
import optuna
from tqdm.notebook import tqdm
import ast
from functools import partial
import warnings
import networkx as nx
warnings.filterwarnings('ignore')

# Seed both PyTorch and NumPy with the same value so runs are repeatable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

## Setup
Make sure to change the paths of the train and test dataset files so that they point to the directory that contains them. By default, the files are expected at `./data/train.csv` and `./data/test.csv`.

In [None]:
def load_data(file_path):
    """Read a graph CSV and parse its ``edgelist`` column.

    The ``edgelist`` cells are stored as text in the CSV; each one is turned
    back into the Python literal it spells out via ``ast.literal_eval``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with ``edgelist`` holding parsed Python objects.
    """
    frame = pd.read_csv(file_path)
    parsed_edges = frame['edgelist'].map(ast.literal_eval)
    return frame.assign(edgelist=parsed_edges)

# Read the training split and report how many samples it holds
train_df = load_data('./data/train.csv')
print(f"Loaded {len(train_df)} training samples")

# Read the test split
test_df = load_data('./data/test.csv')

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Graph Convolutional Network
The following blocks show the setup of the structure and features of the GCN.

In [None]:
class GraphConvolutionNetwork(torch.nn.Module):
    """GCN that scores every node of a single graph.

    Architecture: a linear input projection, ``num_layers`` blocks of
    (GCNConv -> BatchNorm1d -> ReLU -> dropout), then a two-layer linear head
    producing one score per node. The scores are normalised with a
    log-softmax taken across the nodes (dim 0), so the output is a
    log-probability distribution over the rows of ``x``.

    Args:
        num_node_features: Width of the input node feature matrix.
        hidden_channels: Width of every hidden layer.
        num_layers: Number of graph convolution blocks.
        dropout_rate: Dropout probability used after the input projection and
            after each convolution block.
    """

    def __init__(self, num_node_features, hidden_channels, num_layers, dropout_rate):
        super().__init__()

        # Initial feature transformation
        self.lin1 = torch.nn.Linear(num_node_features, hidden_channels)

        # GCN layers, each paired with its own batch normalisation.
        # Both lists must have the same length: forward() zips them, so an
        # empty batch_norms list would silently skip every convolution.
        self.convs = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        for _ in range(num_layers):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Output layers
        self.lin2 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.lin3 = torch.nn.Linear(hidden_channels, 1)

        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index):
        """Return per-node log-probabilities, shape ``(x.size(0),)``.

        Args:
            x: Node feature matrix with ``num_node_features`` columns.
            edge_index: Edge index tensor passed to each ``GCNConv``.
        """
        # Initial feature transformation
        x = self.lin1(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Graph convolution layers
        for conv, batch_norm in zip(self.convs, self.batch_norms):
            x = conv(x, edge_index)
            # BatchNorm1d cannot compute batch statistics from a single row
            # while training, so skip it for one-node graphs in that mode.
            if x.size(0) > 1 or not self.training:
                x = batch_norm(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Final prediction layers
        x = self.lin2(x)
        x = F.relu(x)
        x = self.lin3(x)

        # Drop the trailing singleton dimension: one score per node
        x = x.squeeze(-1)

        # Log-softmax across nodes (dim 0)
        return F.log_softmax(x, dim=0)

In [None]:
def create_node_features(num_nodes, edge_index, device, language=None):
    """Build the node feature matrix for one graph.

    The first five columns are graph centrality measures computed with
    NetworkX on an undirected copy of the graph, in this order: closeness,
    betweenness, PageRank, eigenvector centrality, eccentricity.

    When ``language`` is given, two more blocks are appended, each constant
    across the nodes of the graph:
      * a 21-column one-hot of the language, indexed over every language
        listed in ``language_groups`` (a language that is not listed gets an
        all-zero row);
      * a 6-column one-hot of the language group (all zeros if not listed).

    Args:
        num_nodes: Number of nodes; nodes are labelled ``0..num_nodes-1``.
        edge_index: Tensor whose columns are ``(source, target)`` node pairs.
        device: Torch device on which the feature tensors are created.
        language: Optional language name of the sentence.

    Returns:
        Float tensor with ``num_nodes`` rows and 5 columns, or 32 columns
        when ``language`` is provided.
    """
    # Create NetworkX graph for centrality calculations
    G = nx.Graph()
    G.add_nodes_from(range(num_nodes))
    G.add_edges_from(edge_index.cpu().numpy().T.tolist())

    # One dict (node -> value) per structural feature, in column order
    centralities = (
        nx.closeness_centrality(G),
        nx.betweenness_centrality(G),
        nx.pagerank(G),
        nx.eigenvector_centrality(G, max_iter=1000, tol=1e-04),
        nx.eccentricity(G),
    )

    # Explicit float dtype: eccentricity values are integers and would
    # otherwise produce an int64 column next to float32 ones.
    features = [
        torch.tensor(
            [scores[i] for i in range(num_nodes)],
            dtype=torch.float,
            device=device,
        ).unsqueeze(1)
        for scores in centralities
    ]

    # Add language-based features if a language is provided
    if language is not None:
        # Language groups mapping
        language_groups = {
            'head_final_sov': ['Japanese', 'Korean', 'Turkish'],
            'romance_svo': ['Spanish', 'Portuguese', 'French', 'Italian', 'Galician'],
            'germanic_v2': ['German', 'Swedish', 'Icelandic'],
            'free_order_case': ['Russian', 'Polish', 'Czech', 'Finnish'],
            'analytic': ['English', 'Chinese', 'Thai', 'Indonesian'],
            'other': ['Arabic', 'Hindi']
        }

        # One-hot over ALL listed languages (21 in total), not just the
        # first group, so each language gets its own column.
        all_languages = [lang for group in language_groups.values() for lang in group]
        one_hot = torch.zeros((num_nodes, len(all_languages)), device=device)
        if language in all_languages:
            one_hot[:, all_languages.index(language)] = 1
        features.append(one_hot)

        # One-hot over the language groups
        group_features = torch.zeros((num_nodes, len(language_groups)), device=device)
        for group_idx, group_langs in enumerate(language_groups.values()):
            if language in group_langs:
                group_features[:, group_idx] = 1
                break
        features.append(group_features)

    return torch.cat(features, dim=1)


In [None]:
def create_pytorch_geometric_data(df_row, device, feature_type='basic'):
    """Convert a single graph data row to a PyTorch Geometric ``Data`` object.

    Args:
        df_row: Row providing ``edgelist`` (pairs of 1-based node ids), ``n``
            (node count), ``language`` and, for labelled rows, ``root``
            (1-based id of the root node).
        device: Torch device on which the tensors are created.
        feature_type: Currently unused; kept so existing calls keep working.

    Returns:
        ``Data`` with ``x``, ``edge_index`` (0-based, both directions of every
        edge, shape ``(2, 2 * len(edgelist))``) and ``language``. ``y`` holds
        the 0-based root index and is only set when the row has a ``root``.
    """
    edge_list = df_row['edgelist']
    num_nodes = int(df_row['n'])
    languages = df_row['language']

    # Only get root if present (i.e., not in test set)
    root = df_row['root'] - 1 if 'root' in df_row else None

    # Store each undirected edge in both directions, shifted to 0-based ids
    edges = []
    for src, dst in edge_list:
        edges.append([src - 1, dst - 1])  # forward edge
        edges.append([dst - 1, src - 1])  # backward edge

    # The explicit reshape keeps the (2, 0) shape for a graph with no edges,
    # where torch.tensor([]) alone would be 1-D; contiguous() gives the
    # transposed index a contiguous memory layout.
    edge_index = (
        torch.tensor(edges, dtype=torch.long, device=device)
        .reshape(len(edges), 2)
        .t()
        .contiguous()
    )

    # Create node features
    x = create_node_features(num_nodes, edge_index, device, languages)

    # Create Data object; unlabelled rows get no `y`
    if root is not None:
        y = torch.tensor([root], dtype=torch.long, device=device)
        return Data(x=x, edge_index=edge_index, y=y, language=languages)
    return Data(x=x, edge_index=edge_index, language=languages)


In [None]:
def validate_with_metrics(model, val_data, device):
    """Evaluate ``model`` on a list of graphs and report node-level metrics.

    For every graph the node with the highest model output is taken as the
    predicted root. Predictions and targets are then expanded to per-node
    binary labels (1 = root, 0 = non-root) and pooled over all graphs, so
    accuracy/precision/recall/F1 are computed over nodes, not over graphs.

    Args:
        model: Module called as ``model(data.x, data.edge_index)`` returning
            one log-probability per node.
        val_data: Sequence of graph objects with ``x``, ``edge_index``, ``y``
            and optionally ``language``.
        device: Torch device the graphs are moved to.

    Returns:
        Dict with ``loss`` (mean NLL per graph), ``accuracy``, ``f1_score``,
        ``precision``, ``recall`` and ``per_language_metrics`` (language ->
        dict with ``precision``, ``recall``, ``f1``; empty when no graph
        carries a language).
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []
    # language -> (targets, preds), filled in the same pass as the pooled
    # lists so labels of one graph can never be attributed to another.
    by_language = {}

    with torch.no_grad():
        for data in val_data:
            data = data.to(device)
            out = model(data.x, data.edge_index)

            target = data.y.view(-1)
            # nll_loss expects a (batch, classes) input: one "sample" whose
            # classes are the nodes of this graph.
            out = out.unsqueeze(0)

            loss = F.nll_loss(out, target)
            total_loss += loss.item()

            pred = out.squeeze(0).argmax(dim=0)

            # Convert to binary classification (root vs non-root)
            num_nodes = len(data.x)
            binary_target = torch.zeros(num_nodes, device=device)
            binary_target[target] = 1

            binary_pred = torch.zeros(num_nodes, device=device)
            binary_pred[pred] = 1

            node_targets = binary_target.cpu().tolist()
            node_preds = binary_pred.cpu().tolist()
            all_targets.extend(node_targets)
            all_preds.extend(node_preds)

            # Group the same labels by language when the graph carries one
            if hasattr(data, 'language'):
                lang_targets, lang_preds = by_language.setdefault(data.language, ([], []))
                lang_targets.extend(node_targets)
                lang_preds.extend(node_preds)

    # Overall metrics; zero_division=0 everywhere so an empty positive class
    # yields 0 for every score instead of a warning on only some of them.
    accuracy = sum(p == t for p, t in zip(all_preds, all_targets)) / len(all_targets)
    f1 = f1_score(all_targets, all_preds, zero_division=0)
    precision = precision_score(all_targets, all_preds, zero_division=0)
    recall = recall_score(all_targets, all_preds, zero_division=0)

    # Per-language metrics
    per_language_metrics = {
        lang: {
            'precision': precision_score(lang_targets, lang_preds, zero_division=0),
            'recall': recall_score(lang_targets, lang_preds, zero_division=0),
            'f1': f1_score(lang_targets, lang_preds, zero_division=0)
        }
        for lang, (lang_targets, lang_preds) in by_language.items()
        if len(lang_targets) > 0
    }

    return {
        'loss': total_loss / len(val_data),
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'per_language_metrics': per_language_metrics
    }

In [None]:
def train_epoch(model, optimizer, train_data, device, class_weights=None):
    """Run one optimisation pass over ``train_data``, one graph per step.

    Args:
        model: Module called as ``model(graph.x, graph.edge_index)`` that
            returns one log-probability per node.
        optimizer: Optimiser stepping the model parameters.
        train_data: Sequence of graphs with ``x``, ``edge_index`` and ``y``.
        device: Torch device each graph is moved to.
        class_weights: Optional lookup indexed by the target node index; the
            graph's loss is multiplied by the value found there.

    Returns:
        Tuple ``(mean loss per graph, fraction of graphs whose top-scoring
        node equals the target)``.
    """
    model.train()
    running_loss = 0
    num_correct = 0

    for graph in train_data:
        graph = graph.to(device)
        optimizer.zero_grad()

        # Treat the graph as a single sample whose classes are its nodes
        log_probs = model(graph.x, graph.edge_index).unsqueeze(0)
        labels = graph.y.view(-1)

        loss = F.nll_loss(log_probs, labels)
        if class_weights is not None:
            loss = loss * class_weights[labels.item()]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        best_node = log_probs.squeeze(0).argmax(dim=0)
        num_correct += int(best_node == labels)

    num_graphs = len(train_data)
    return running_loss / num_graphs, num_correct / num_graphs

### Optimal Hyperparameters

- F1-score: 0.2783809523809524
- Params: 
  - hidden_channels: 128
  - num_layers: 2
  - dropout_rate: 0.24007533818963195
  - learning_rate: 0.00967001569505872
  - weight_decay: 0.00017412411662641109

# Validation

In [None]:
# Create final dataset
dataset = [create_pytorch_geometric_data(row, device) 
          for _, row in train_df.iterrows()]

# Initialize model with best parameters
num_node_features = dataset[0].x.size(1)

# Train final model with best parameters
final_model = GraphConvolutionNetwork(
    num_node_features=num_node_features,
    hidden_channels=128,
    num_layers=2,
    dropout_rate=0.24007533818963195
).to(device)

optimizer = torch.optim.Adam(
    final_model.parameters(),
    lr= 0.00967001569505872,
    weight_decay=0.00017412411662641109
)

In [None]:
# Create train/validation split (3 folds for training, 1 for validation)
n_folds = 4
kf = GroupKFold(n_splits=n_folds)
groups = train_df['sentence'].values

# Use the last fold as validation set
val_fold = n_folds - 1  # This will be fold 3 (0-based indexing)

# Each split pairs one held-out fold (val_idx) with the union of the other
# folds (train_idx). Take BOTH index sets from the chosen split: collecting
# train_idx from the other splits would pull the validation graphs into the
# training set and duplicate training samples.
train_idx, val_idx = list(kf.split(dataset, groups=groups))[val_fold]
train_data = [dataset[i] for i in train_idx]
val_data = [dataset[i] for i in val_idx]

# Sanity checks: no graph and no sentence group is shared between the sets
assert set(train_idx).isdisjoint(val_idx)
assert set(groups[train_idx]).isdisjoint(groups[val_idx])

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")

# Training loop
max_epochs = 100
patience = 10  # epochs without validation F1 improvement before stopping
# Start below any reachable F1 so the first epoch always writes a checkpoint;
# starting at 0 would leave no file to load if F1 never rose above 0.
best_f1_score = float('-inf')
patience_counter = 0

for epoch in tqdm(range(max_epochs)):
    # Training
    train_loss, train_acc = train_epoch(final_model, optimizer, train_data, device)

    # Validation
    val_metrics = validate_with_metrics(final_model, val_data, device)
    val_f1 = val_metrics['f1_score']

    print(f'Epoch {epoch:03d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val F1: {val_f1:.4f}')

    # Early stopping
    if val_f1 > best_f1_score:
        best_f1_score = val_f1
        patience_counter = 0
        # Save best model
        torch.save(final_model.state_dict(), 'validation_gcn.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break

# Load best model and evaluate final performance
final_model.load_state_dict(torch.load('validation_gcn.pt'))
final_metrics = validate_with_metrics(final_model, val_data, device)

print("\nFinal Validation Performance:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1_score']:.4f}")

# Print per-language metrics
print("\nPer-Language Performance:")
for lang, metrics in final_metrics['per_language_metrics'].items():
    print(f"\n{lang}:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")

# Test

In [None]:
# Training loop: keeps training `final_model` with the existing `optimizer`
# on `train_data`, checkpointing the epoch with the best F1 to
# 'best_model_gcn.pt'.
# NOTE(review): F1 is measured on the full `dataset`, which also contains the
# training graphs, so this score is not a held-out estimate - confirm that
# this is the intended selection criterion for the submission model.
max_epochs = 100
patience = 10  # epochs without F1 improvement before stopping
# Start below any reachable F1 so the first epoch always writes a checkpoint;
# starting at 0 would leave no 'best_model_gcn.pt' to load if F1 stayed at 0.
best_f1_score = float('-inf')
patience_counter = 0

for epoch in tqdm(range(max_epochs)):
    # Training
    train_loss, train_acc = train_epoch(
                    final_model, optimizer, train_data, device)

    # Validation with F1 score
    val_metrics = validate_with_metrics(final_model, dataset, device)
    val_f1 = val_metrics['f1_score']

    print(f'Epoch {epoch:03d}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val F1: {val_f1:.4f}')

    # Early stopping based on F1 score
    if val_f1 > best_f1_score:
        best_f1_score = val_f1
        patience_counter = 0
        # Save best model
        torch.save(final_model.state_dict(), 'best_model_gcn.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break

In [None]:
# Restore the best checkpoint and switch to inference mode
best_state = torch.load('best_model_gcn.pt')
final_model.load_state_dict(best_state)
final_model.eval()

# Predict the root of every test graph
predictions = []
with torch.no_grad():
    for _, row in test_df.iterrows():
        graph = create_pytorch_geometric_data(row, device).to(device)
        node_log_probs = final_model(graph.x, graph.edge_index)
        # The highest-scoring node is 0-based; report it as a 1-based id
        predictions.append(node_log_probs.argmax(dim=0).item() + 1)

# Assemble the submission table with sequential 1-based ids
submission_ids = range(1, len(predictions) + 1)
submission_df = pd.DataFrame({'id': submission_ids, 'root': predictions})

# Write the predictions to disk
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")