In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from torch_geometric.nn import GCNConv
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score
import optuna
from tqdm.notebook import tqdm
import ast
from functools import partial
import warnings
import networkx as nx
# Silence every warning category for the rest of the notebook
warnings.filterwarnings('ignore')

# One shared seed for PyTorch and NumPy so repeated runs draw the same numbers
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

## Setup
Make sure the train and test dataset paths in the next cell point to the directory that contains the files. By default, they are read from `./data/train.csv` and `./data/test.csv`, relative to the working directory.

In [None]:
def load_data(file_path):
    """Read a CSV of graphs and parse its ``edgelist`` column.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, with every ``edgelist`` cell converted from its
        string form into the Python literal it spells out.
    """
    frame = pd.read_csv(file_path)
    # Each cell holds the textual repr of a Python literal; evaluate it safely.
    frame['edgelist'] = frame['edgelist'].map(ast.literal_eval)
    return frame

# Read the training split and report how many rows it holds
train_df = load_data('./data/train.csv')
print(f"Loaded {len(train_df)} training samples")

# Read the test split
test_df = load_data('./data/test.csv')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Graph Convolutional Network
The following blocks show the setup of the structure and features of the GCN.

In [None]:
class GraphConvolutionNetwork(torch.nn.Module):
    """GCN that scores every node of a single graph.

    The network is: linear input projection -> ``num_layers`` blocks of
    (GCNConv, BatchNorm1d, ReLU, dropout) -> two linear layers producing one
    score per node. ``forward`` returns a log-softmax taken across the nodes,
    so the output is a log-distribution over the nodes of the graph.

    Args:
        num_node_features: Width of the input node feature vectors.
        hidden_channels: Width of every hidden representation.
        num_layers: Number of graph-convolution blocks.
        dropout_rate: Dropout probability applied after the input projection
            and after every graph-convolution block.
    """

    def __init__(self, num_node_features, hidden_channels, num_layers, dropout_rate):
        super().__init__()

        # Initial feature transformation
        self.lin1 = torch.nn.Linear(num_node_features, hidden_channels)

        # GCN layers, each paired with its own batch norm. forward() zips the
        # two lists, so they must have the same length: an empty batch_norms
        # list would silently skip every convolution.
        self.convs = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        for _ in range(num_layers):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Output layers
        self.lin2 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.lin3 = torch.nn.Linear(hidden_channels, 1)

        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index):
        """Return one log-probability per node.

        Args:
            x: Node feature matrix with one row per node.
            edge_index: Edge tensor passed unchanged to every ``GCNConv``.

        Returns:
            1-D tensor with one entry per row of ``x``, log-softmax-normalised
            across the nodes.
        """
        # Initial feature transformation
        x = self.lin1(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Graph convolution layers
        for conv, batch_norm in zip(self.convs, self.batch_norms):
            x = conv(x, edge_index)
            # BatchNorm1d cannot estimate batch statistics from a single row
            # while training, so the norm is skipped for one-node graphs.
            if x.size(0) > 1 or not self.training:
                x = batch_norm(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Final prediction layers
        x = self.lin2(x)
        x = F.relu(x)
        x = self.lin3(x)

        # Drop the trailing singleton dimension: one score per node
        x = x.squeeze(-1)

        # Normalise across nodes (dim 0), not across features
        return F.log_softmax(x, dim=0)

In [None]:
def create_node_features(num_nodes, edge_index, device, language=None):
    """Build the per-node feature matrix for one graph.

    Five structural columns are always produced, in this order: closeness,
    betweenness, PageRank, eigenvector centrality and eccentricity, all
    computed with NetworkX on an undirected graph built from ``edge_index``.
    When ``language`` is given, a 21-wide one-hot language encoding and a
    6-wide one-hot language-group encoding are appended.

    Args:
        num_nodes: Number of nodes; nodes are labelled ``0 .. num_nodes - 1``.
        edge_index: Tensor whose columns are ``(source, target)`` node pairs.
        device: Torch device on which the feature tensors are created.
        language: Optional language name. A name missing from the table
            below yields all-zero language and language-group columns.

    Returns:
        Float tensor of shape ``(num_nodes, 5)``, or ``(num_nodes, 32)`` when
        ``language`` is provided.

    Note:
        NOTE(review): exceptions raised by the NetworkX routines are not
        handled here; ``nx.eccentricity`` is documented to fail on
        disconnected graphs - confirm inputs are always connected.
    """
    features = []

    # Create NetworkX graph for centrality calculations
    G = nx.Graph()
    G.add_nodes_from(range(num_nodes))
    G.add_edges_from(edge_index.cpu().t().tolist())

    # One column per structural measure; the order fixes the column layout.
    structural_scores = (
        nx.closeness_centrality(G),
        nx.betweenness_centrality(G),
        nx.pagerank(G),
        nx.eigenvector_centrality(G, max_iter=1000, tol=1e-04),
        nx.eccentricity(G),
    )
    for scores in structural_scores:
        # Explicit float dtype: eccentricity values are ints, and every
        # column must share one dtype to be concatenated reliably.
        column = torch.tensor(
            [scores[i] for i in range(num_nodes)],
            dtype=torch.float,
            device=device,
        )
        features.append(column.unsqueeze(1))

    # Add language-based features if a language is provided
    if language is not None:
        # Language groups mapping
        language_groups = {
            'head_final_sov': ['Japanese', 'Korean', 'Turkish'],
            'romance_svo': ['Spanish', 'Portuguese', 'French', 'Italian', 'Galician'],
            'germanic_v2': ['German', 'Swedish', 'Icelandic'],
            'free_order_case': ['Russian', 'Polish', 'Czech', 'Finnish'],
            'analytic': ['English', 'Chinese', 'Thai', 'Indonesian'],
            'other': ['Arabic', 'Hindi']
        }

        # One-hot over ALL listed languages (21 of them), flattened in group
        # order. Looking only at the first group would send every other
        # language to the same column.
        all_languages = [lang for group in language_groups.values() for lang in group]
        one_hot = torch.zeros((num_nodes, len(all_languages)), device=device)
        if language in all_languages:
            one_hot[:, all_languages.index(language)] = 1
        features.append(one_hot)

        # One-hot over the language groups; each language is in one group.
        group_features = torch.zeros((num_nodes, len(language_groups)), device=device)
        for group_idx, group_langs in enumerate(language_groups.values()):
            if language in group_langs:
                group_features[:, group_idx] = 1
                break
        features.append(group_features)

    return torch.cat(features, dim=1)


In [None]:
def create_pytorch_geometric_data(df_row, device, feature_type='basic'):
    """Convert a single graph data row to a PyTorch Geometric ``Data`` object.

    Args:
        df_row: Row providing ``edgelist`` (pairs of node ids), ``n`` (node
            count), ``language`` and, for labelled rows, ``root``. Node ids
            and ``root`` are shifted down by one before use.
        device: Torch device on which the tensors are created.
        feature_type: Accepted for interface compatibility; not used by this
            function.

    Returns:
        ``Data`` with ``x``, ``edge_index`` (shape ``(2, 2 * len(edgelist))``)
        and ``language``; ``y`` holds the shifted root index when the row has
        a ``root`` entry.
    """
    edge_list = df_row['edgelist']
    num_nodes = df_row['n']
    languages = df_row['language']

    # Only get root if present (i.e., not in test set)
    root = df_row['root'] - 1 if 'root' in df_row else None

    # Create edge index with both directions for undirected graph
    edges = []
    for src, dst in edge_list:
        edges.append([src - 1, dst - 1])  # forward edge
        edges.append([dst - 1, src - 1])  # backward edge

    # Reshape explicitly so an empty edge list still yields a (2, 0) tensor
    # rather than a 1-D one; contiguous() undoes the transposed memory layout.
    edge_index = (
        torch.tensor(edges, dtype=torch.long, device=device)
        .reshape(len(edges), 2)
        .t()
        .contiguous()
    )

    # Create node features
    x = create_node_features(num_nodes, edge_index, device, languages)

    # Create Data object; the label is attached only when it is known
    if root is not None:
        y = torch.tensor([root], dtype=torch.long, device=device)
        return Data(x=x, edge_index=edge_index, y=y, language=languages)
    else:
        return Data(x=x, edge_index=edge_index, language=languages)


In [None]:
def validate_with_metrics(model, val_data, device):
    """Evaluate ``model`` on ``val_data`` and summarise root-detection quality.

    Each graph contributes one NLL loss term and one binary label per node
    (1 for the target / predicted node, 0 elsewhere). Metrics are computed on
    the node labels pooled over all graphs, and again per language for graphs
    that carry a ``language`` attribute.

    Args:
        model: Module called as ``model(x, edge_index)``.
        val_data: Sequence of graph objects exposing ``x``, ``edge_index``,
            ``y`` and optionally ``language``.
        device: Torch device the graphs are moved to.

    Returns:
        Dict with ``loss`` (mean over graphs), ``accuracy``, ``f1_score``,
        ``precision``, ``recall`` and ``per_language_metrics``.
    """
    model.eval()
    loss_sum = 0
    pooled_preds, pooled_targets, pooled_languages = [], [], []

    with torch.no_grad():
        for graph in val_data:
            graph = graph.to(device)

            # Add a batch dimension so nll_loss sees one sample whose classes are the nodes
            log_probs = model(graph.x, graph.edge_index).unsqueeze(0)
            target = graph.y.view(-1)

            loss_sum += F.nll_loss(log_probs, target).item()

            predicted_node = log_probs.squeeze(0).argmax(dim=0)

            # Expand the single target / prediction into per-node 0/1 labels
            node_count = len(graph.x)
            target_mask = torch.zeros(node_count, device=device)
            target_mask[target] = 1

            pred_mask = torch.zeros(node_count, device=device)
            pred_mask[predicted_node] = 1

            pooled_targets.extend(target_mask.cpu().numpy())
            pooled_preds.extend(pred_mask.cpu().numpy())

            # Tag every node with its graph's language when one is attached
            if hasattr(graph, 'language'):
                pooled_languages.extend([graph.language] * node_count)

    # Node-level metrics over all graphs
    matches = 0
    for pred_label, true_label in zip(pooled_preds, pooled_targets):
        matches += pred_label == true_label
    accuracy = matches / len(pooled_targets)
    f1 = f1_score(pooled_targets, pooled_preds)
    precision = precision_score(pooled_targets, pooled_preds, zero_division=0)
    recall = recall_score(pooled_targets, pooled_preds, zero_division=0)

    # Same metrics restricted to the nodes of each language
    per_language_metrics = {}
    if pooled_languages:
        for lang in set(pooled_languages):
            positions = [i for i, tag in enumerate(pooled_languages) if tag == lang]
            lang_targets = [pooled_targets[i] for i in positions]
            lang_preds = [pooled_preds[i] for i in positions]

            if lang_targets:  # skip languages without any node
                per_language_metrics[lang] = {
                    'precision': precision_score(lang_targets, lang_preds, zero_division=0),
                    'recall': recall_score(lang_targets, lang_preds, zero_division=0),
                    'f1': f1_score(lang_targets, lang_preds, zero_division=0)
                }

    return {
        'loss': loss_sum / len(val_data),
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'per_language_metrics': per_language_metrics
    }

In [None]:
def train_epoch(model, optimizer, train_data, device, class_weights=None):
    """Run one optimisation pass over ``train_data``, one graph per step.

    Args:
        model: Module called as ``model(x, edge_index)``.
        optimizer: Optimizer stepped once per graph.
        train_data: Sequence of graph objects exposing ``x``, ``edge_index``
            and ``y``.
        device: Torch device the graphs are moved to.
        class_weights: Optional lookup indexed by the target value; when
            given, the graph's loss is multiplied by the looked-up weight.

    Returns:
        Tuple ``(mean loss per graph, fraction of graphs whose arg-max node
        equals the target)``.
    """
    model.train()
    running_loss = 0
    n_correct = 0

    for graph in train_data:
        graph = graph.to(device)
        optimizer.zero_grad()

        # Add a batch dimension so nll_loss sees one sample whose classes are the nodes
        log_probs = model(graph.x, graph.edge_index).unsqueeze(0)
        target = graph.y.view(-1)

        loss = F.nll_loss(log_probs, target)
        if class_weights is not None:
            loss = loss * class_weights[target.item()]

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted_node = log_probs.squeeze(0).argmax(dim=0)
        n_correct += int(predicted_node == target)

    n_graphs = len(train_data)
    return running_loss / n_graphs, n_correct / n_graphs

## Optuna Hyperparameter Tuning

In [None]:
def objective(trial, train_df, device, feature_type='basic'):
    """Optuna objective: mean best validation F1 over grouped 4-fold CV.

    For every fold a fresh ``GraphConvolutionNetwork`` is trained with Adam
    and early stopping on validation F1. The best F1 reached in each fold is
    averaged into the trial value.

    Args:
        trial: Optuna trial used to sample hyperparameters, report
            intermediate values and record per-fold user attributes.
        train_df: DataFrame of labelled graphs. Its ``sentence`` column is
            the grouping key, so rows sharing a sentence never end up on both
            sides of a train/validation split.
        device: Torch device for the data and the model.
        feature_type: Forwarded to ``create_pytorch_geometric_data``.

    Returns:
        Mean of the per-fold best F1 scores, over the folds that completed.

    Raises:
        optuna.TrialPruned: When the pruner asks to stop the trial, or when
            no fold produced a score.
    """
    params = {
        'hidden_channels': trial.suggest_categorical('hidden_channels', [64, 128, 256]),
        'num_layers': trial.suggest_int('num_layers', 2, 4),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.2, 0.4),
        'learning_rate': trial.suggest_float('learning_rate', 3e-4, 1e-2, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-4, 1e-3, log=True),
        'max_epochs': 100,
        'patience': 10,
        'batch_size': 32  # not read below: graphs are processed one at a time
    }

    # Create folds
    n_folds = 4
    kf = GroupKFold(n_splits=n_folds)

    # Convert data to PyG format
    dataset = [create_pytorch_geometric_data(row, device, feature_type)
               for _, row in train_df.iterrows()]

    # Group by sentence for GroupKFold
    groups = train_df['sentence'].values

    scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset, groups=groups)):
        trial.set_user_attr(f'fold_{fold+1}_status', 'starting')

        try:
            # Split data
            train_data = [dataset[i] for i in train_idx]
            val_data = [dataset[i] for i in val_idx]

            # Initialize model
            num_node_features = dataset[0].x.size(1)
            model = GraphConvolutionNetwork(
                num_node_features=num_node_features,
                hidden_channels=params['hidden_channels'],
                num_layers=params['num_layers'],
                dropout_rate=params['dropout_rate']
            ).to(device)

            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=params['learning_rate'],
                weight_decay=params['weight_decay']
            )

            # Training loop with early stopping
            best_f1_score = 0
            patience_counter = 0

            for epoch in range(params['max_epochs']):
                train_loss, train_acc = train_epoch(
                    model, optimizer, train_data, device)
                val_metrics = validate_with_metrics(model, val_data, device)

                # Use F1 score for early stopping
                if val_metrics['f1_score'] > best_f1_score:
                    best_f1_score = val_metrics['f1_score']
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= params['patience']:
                        break

                # Report F1 score to Optuna. The step must be unique within
                # the trial: a value reported again for an already-used step
                # is ignored, so each fold gets its own range of steps.
                step = fold * params['max_epochs'] + epoch
                trial.report(val_metrics['f1_score'], step)

                if trial.should_prune():
                    raise optuna.TrialPruned()

            scores.append(best_f1_score)
            trial.set_user_attr(f'fold_{fold+1}_score', best_f1_score)
            trial.set_user_attr(f'fold_{fold+1}_status', 'completed')

        except optuna.TrialPruned:
            # TrialPruned is an Exception subclass. It is Optuna's signal to
            # stop the trial, not a fold failure, so it must propagate rather
            # than be caught by the generic handler below.
            trial.set_user_attr(f'fold_{fold+1}_status', 'pruned')
            raise
        except Exception as e:
            # Best effort: a failing fold is logged and skipped so the
            # remaining folds can still contribute a score.
            print(f"Trial {trial.number} Fold {fold+1} failed with exception: {str(e)}")
            trial.set_user_attr(f'fold_{fold+1}_status', f'failed_exception: {str(e)}')
            continue

    # No fold finished: there is nothing to average
    if not scores:
        raise optuna.TrialPruned()

    return np.mean(scores)

In [None]:
# Create study. The sampler is the default TPE algorithm, seeded explicitly so
# that the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=2,
        interval_steps=1
    )
)

# Create objective function with fixed parameters
objective_func = partial(objective, train_df=train_df, device=device)

# Optimize
study.optimize(objective_func, n_trials=10, show_progress_bar=True)

# Print results
print("\nBest trial:")
trial = study.best_trial
print(f"  F1-score: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")