# Serbian Legal Named Entity Recognition (NER) Pipeline - 5-Fold Cross-Validation with Class Weights

This notebook implements 5-fold cross-validation for the Serbian Legal NER pipeline using the base BERT model (classla/bcms-bertic) **with class weights** to handle class imbalance.

## Key Features
- **5-Fold Cross-Validation**: Robust evaluation across different data splits
- **Base BERT Architecture**: Uses classla/bcms-bertic for token classification
- **Class Weights**: Weighted loss function to handle imbalanced entity distribution
- **Sliding Window Tokenization**: Handles long sequences without truncation
- **Comprehensive Metrics**: Precision, recall, F1-score, and accuracy tracking
- **Statistical Analysis**: Mean and standard deviation across folds

## Entity Types
- **COURT**: Court institutions
- **DECISION_DATE**: Dates of legal decisions
- **CASE_NUMBER**: Case identifiers
- **CRIMINAL_ACT**: Criminal acts/charges
- **PROSECUTOR**: Prosecutor entities
- **DEFENDANT**: Defendant entities
- **JUDGE**: Judge names
- **REGISTRAR**: Court registrar
- **SANCTION**: Sanctions/penalties
- **SANCTION_TYPE**: Type of sanction
- **SANCTION_VALUE**: Value/duration of sanction
- **PROVISION**: Legal provisions
- **PROCEDURE_COSTS**: Legal procedure costs

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages
!pip install transformers torch datasets tokenizers scikit-learn seqeval pandas numpy matplotlib seaborn tqdm

In [None]:
# Import shared modules
import sys
import os

sys.path.append('/shared/')


import importlib
import shared
import shared.model_utils
import shared.data_processing
import shared.dataset
import shared.evaluation
import shared.config
importlib.reload(shared.config)
importlib.reload(shared.data_processing)
importlib.reload(shared.dataset)
importlib.reload(shared.model_utils)
importlib.reload(shared.evaluation)
importlib.reload(shared)

# Import from shared modules
from shared import (
    # Configuration
    ENTITY_TYPES, BIO_LABELS,
    get_default_model_config, setup_environment,

    # Data processing
    LabelStudioToBIOConverter, load_labelstudio_data,
    analyze_labelstudio_data, validate_bio_examples,

    # Dataset
    NERDataset, tokenize_and_align_labels_with_sliding_window,
    create_huggingface_datasets,

    # Model utilities
    load_model_and_tokenizer, create_training_arguments,
    detailed_evaluation, setup_device_and_seed,
    PerClassMetricsCallback,

    # Comprehensive tracking
    analyze_entity_distribution_per_fold,
    generate_detailed_classification_report,
    # Aggregate functions
    create_aggregate_report_across_folds
)

# Standard imports
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from transformers import DataCollatorForTokenClassification, AutoTokenizer, Trainer
from collections import Counter

# Setup device and random seed
device = setup_device_and_seed(42)

## 2. Configuration and Environment Setup

In [None]:
# Resolve project paths. The helper is told not to create directories
# (create_dirs=False); the output directory is created explicitly below.
env_setup = setup_environment(use_local=False, create_dirs=False)
paths = env_setup['paths']

# Checkpoint identifier and the default hyperparameter dictionary
MODEL_NAME = "classla/bcms-bertic"
model_config = get_default_model_config()

# Root directory for everything this notebook writes
OUTPUT_DIR = "{}/bertic_base_class_weights_5fold_cv".format(paths['models_dir'])
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Emit the configuration summary as one newline-separated print
print(
    "üîß Configuration:",
    "  Model: {}".format(MODEL_NAME),
    "  Output directory: {}".format(OUTPUT_DIR),
    "  Entity types: {}".format(len(ENTITY_TYPES)),
    "  BIO labels: {}".format(len(BIO_LABELS)),
    sep="\n",
)

## 3. Data Loading and Analysis

In [None]:
# Load the LabelStudio export from the configured path
labelstudio_data = load_labelstudio_data(paths['labelstudio_json'])

# Guard clause: stop here when nothing was loaded.
# An exception is raised instead of calling exit(): exit() is the interactive
# helper installed by the `site` module and is not intended for program
# control flow, whereas an exception halts execution of this cell (and any
# "run all") while keeping the session state available for inspection.
if not labelstudio_data:
    print("‚ùå No data loaded. Please check your paths.")
    raise RuntimeError(
        f"No LabelStudio data loaded from {paths['labelstudio_json']!r}"
    )

# Analyze the loaded data
analysis = analyze_labelstudio_data(labelstudio_data)

## 4. Data Preprocessing and BIO Conversion

In [None]:
# Build the converter from the configured directories. `.get` is used for the
# LabelStudio files directory, so a missing key is passed through as None.
converter = LabelStudioToBIOConverter(
    judgments_dir=paths['judgments_dir'],
    labelstudio_files_dir=paths.get('labelstudio_files_dir'),
)

# Run the LabelStudio -> BIO conversion and report how many examples came out
bio_examples = converter.convert_to_bio(labelstudio_data)
print("‚úÖ Converted {} examples to BIO format".format(len(bio_examples)))

# Validate the converted examples; `stats` carries the summary counters
valid_examples, stats = validate_bio_examples(bio_examples)
print("üìä Validation complete: {} valid examples".format(stats['valid_examples']))

## 5. Dataset Preparation

In [None]:
# Wrap the validated examples and derive the training-ready representation
ner_dataset = NERDataset(valid_examples)
prepared_examples = ner_dataset.prepare_for_training()

# Headline numbers about the prepared dataset
print("üìä Dataset statistics:")
print("  Number of unique labels: {}".format(ner_dataset.get_num_labels()))
print("  Prepared examples: {}".format(len(prepared_examples)))

# Token and entity counts as reported by the dataset itself
label_stats = ner_dataset.get_label_statistics()
print("  Total tokens: {}".format(label_stats['total_tokens']))
print("  Entity types found: {}".format(len(label_stats['entity_counts'])))

## 6. Class Weights Implementation

In [None]:
# ============================================================================
# CLASS WEIGHTS FUNCTIONS
# ============================================================================

def calculate_class_weights_from_tokenized(tokenized_examples, label_to_id):
    """
    Calculate balanced class weights from label frequencies in tokenized data.

    Every label ID that occurs in the data receives the weight
    ``n_valid_tokens / (n_present_labels * count(label_id))``, which is the
    "balanced" heuristic also implemented by scikit-learn's
    ``compute_class_weight``. Label IDs that never occur keep a neutral
    weight of 1.0. Positions labelled -100 (the ignore index) are excluded
    from all counts.

    Args:
        tokenized_examples: Iterable of tokenized training examples; each must
            expose ``example['labels']`` as a sequence of integer label IDs.
        label_to_id: Dictionary mapping labels to IDs. Only its size is used;
            valid IDs are taken to be ``0 .. len(label_to_id) - 1``.

    Returns:
        torch.Tensor: float32 tensor of shape ``(len(label_to_id),)``.

    Raises:
        ValueError: If a label ID other than -100 lies outside
            ``[0, len(label_to_id))``.
    """
    ignore_index = -100
    num_labels = len(label_to_id)

    # Flatten every label ID into one integer array, then drop ignored positions
    flat_label_ids = np.fromiter(
        (label for example in tokenized_examples for label in example['labels']),
        dtype=np.int64,
    )
    valid_label_ids = flat_label_ids[flat_label_ids != ignore_index]

    # One pass yields both the sorted set of present IDs and their frequencies
    present_ids, counts = np.unique(valid_label_ids, return_counts=True)

    # Reject IDs that cannot index the weight vector. Without this check a
    # negative ID would silently write to a position counted from the end.
    if present_ids.size and (present_ids[0] < 0 or present_ids[-1] >= num_labels):
        raise ValueError(
            f"Label IDs must lie in [0, {num_labels}); "
            f"found values from {present_ids[0]} to {present_ids[-1]}"
        )

    # Default weight of 1.0 for labels that do not appear in the data
    class_weights = np.ones(num_labels, dtype=np.float64)
    if present_ids.size:
        class_weights[present_ids] = valid_label_ids.size / (
            present_ids.size * counts.astype(np.float64)
        )

    # float32 tensor, ready to be passed as a loss weight
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

    print(f"üìä Class weights calculated:")
    print(f"  Total label types: {num_labels}")
    print(f"  Labels present in training: {present_ids.size}")
    print(f"  Labels absent from training: {num_labels - present_ids.size}")
    print(f"  Total valid tokens: {valid_label_ids.size}")
    print(f"  Weight range: {class_weights.min():.4f} - {class_weights.max():.4f}")
    print(f"  Mean weight: {class_weights.mean():.4f}")

    return class_weights_tensor


class WeightedTrainer(Trainer):
    """
    Trainer whose training/evaluation loss is a class-weighted CrossEntropyLoss.

    Args:
        class_weights: Optional 1-D tensor with one weight per label ID, passed
            as ``weight`` to ``nn.CrossEntropyLoss``. ``None`` gives an
            unweighted loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute weighted loss for token classification.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dictionary; ``inputs["labels"]`` may be absent.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so the method can be called by
                ``transformers`` releases that pass this keyword; it is not
                used, the loss is always the weighted mean over positions
                whose label is not -100.

        Returns:
            The loss (``None`` when the batch carries no labels), or
            ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = None
        if labels is not None:
            # The weights must live on the same device as the logits
            weight = None
            if self.class_weights is not None:
                weight = self.class_weights.to(logits.device)

            loss_fct = nn.CrossEntropyLoss(weight=weight, ignore_index=-100)

            # Flatten to (positions, num_labels) vs (positions,). Positions
            # labelled -100 are skipped by ignore_index, so no extra masking
            # of the labels is needed.
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

print("‚úÖ Class weights functions defined successfully!")

## 7. K-Fold Cross-Validation Setup

In [None]:
# Shuffled K-fold splitter with a fixed seed so the splits are reproducible
N_FOLDS = 5
kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Object array so the index arrays produced by the splitter can select examples
examples_array = np.array(prepared_examples, dtype=object)

print("Setting up {}-fold cross-validation".format(N_FOLDS))
print("Total examples: {}".format(len(prepared_examples)))
print("Examples per fold (approx): {}".format(len(prepared_examples) // N_FOLDS))

# One tokenizer instance is shared by every fold
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("\nLoaded tokenizer for {}".format(MODEL_NAME))
print("Tokenizer vocab size: {}".format(tokenizer.vocab_size))

# Accumulator for the per-fold result dictionaries
fold_results = list()

## 8. K-Fold Cross-Validation Helper Functions

In [None]:
# ============================================================================
# K-FOLD CROSS-VALIDATION HELPER FUNCTIONS WITH CLASS WEIGHTS
# ============================================================================

def prepare_fold_data_with_class_weights(train_examples, val_examples, tokenizer, ner_dataset):
    """
    Build the datasets, data collator and class weights for a single fold.

    Both splits are tokenized with the sliding-window helper, using the
    ``max_length`` and ``stride`` entries of the module-level ``model_config``.
    Class weights are computed from the tokenized training split only.

    Args:
        train_examples: Training examples for this fold
        val_examples: Validation examples for this fold
        tokenizer: Tokenizer instance
        ner_dataset: NER dataset instance (provides ``label_to_id``)

    Returns:
        tuple: (train_dataset, val_dataset, data_collator, class_weights)
    """
    label_map = ner_dataset.label_to_id

    def _tokenize_split(split_examples):
        # Identical window settings for the training and validation splits
        return tokenize_and_align_labels_with_sliding_window(
            split_examples,
            tokenizer,
            label_map,
            max_length=model_config['max_length'],
            stride=model_config['stride'],
        )

    tokenized_train = _tokenize_split(train_examples)
    tokenized_val = _tokenize_split(val_examples)

    # Weights come from the training split only
    print("‚öñÔ∏è  Calculating class weights from training data...")
    fold_class_weights = calculate_class_weights_from_tokenized(tokenized_train, label_map)

    # The dataset helper takes three splits; the validation split is passed
    # in the third slot as a placeholder and the third result is discarded.
    fold_train, fold_val, _unused_test = create_huggingface_datasets(
        tokenized_train, tokenized_val, tokenized_val
    )

    # Pads each batch and returns PyTorch tensors
    collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer, padding=True, return_tensors="pt"
    )

    return fold_train, fold_val, collator, fold_class_weights

print("‚úÖ K-fold helper functions with class weights defined successfully!")

In [None]:
def create_model_and_weighted_trainer(fold_num, train_dataset, val_dataset, data_collator, tokenizer, ner_dataset, class_weights, device):
    """
    Instantiate a new model and a WeightedTrainer for one fold.

    Args:
        fold_num: Current fold number (used for the output sub-directory)
        train_dataset: Training dataset for this fold
        val_dataset: Validation dataset for this fold
        data_collator: Data collator
        tokenizer: Tokenizer instance
        ner_dataset: NER dataset instance (label maps and label count)
        class_weights: Class weights tensor handed to the WeightedTrainer
        device: Device the model is moved to (cuda/cpu)

    Returns:
        tuple: (model, trainer, metrics_callback, fold_output_dir)
    """
    import os

    # Every fold writes into its own sub-directory of OUTPUT_DIR
    fold_output_dir = "{}/fold_{}".format(OUTPUT_DIR, fold_num)
    os.makedirs(fold_output_dir, exist_ok=True)

    # Load a new model instance; the tokenizer returned with it is discarded
    model, _ = load_model_and_tokenizer(
        MODEL_NAME,
        ner_dataset.get_num_labels(),
        ner_dataset.id_to_label,
        ner_dataset.label_to_id,
    )
    model.to(device)

    # Epochs, batch size and learning rate come from the shared model_config;
    # the remaining values are fixed for this notebook
    hyperparams = {
        'output_dir': fold_output_dir,
        'num_epochs': model_config['num_epochs'],
        'batch_size': model_config['batch_size'],
        'learning_rate': model_config['learning_rate'],
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_steps': 50,
        'eval_steps': 100,
        'save_steps': 500,
        'early_stopping_patience': 3,
    }
    training_args = create_training_arguments(**hyperparams)

    # Callback that records metrics during training
    metrics_callback = PerClassMetricsCallback(id_to_label=ner_dataset.id_to_label)

    # compute_metrics is imported from the submodule directly because it is
    # not exported by the package's __init__.py
    from shared.model_utils import compute_metrics
    from transformers import EarlyStoppingCallback

    trainer_callbacks = [EarlyStoppingCallback(early_stopping_patience=3), metrics_callback]

    print(f"‚öñÔ∏è  Creating WeightedTrainer with class weights for fold {fold_num}")
    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        # Adapter that supplies the id -> label map expected by compute_metrics
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, ner_dataset.id_to_label),
        callbacks=trainer_callbacks,
    )

    print(f"Weighted trainer initialized for fold {fold_num} with comprehensive metrics tracking")
    return model, trainer, metrics_callback, fold_output_dir

print("‚úÖ Model and weighted trainer creation function defined successfully!")

In [None]:
def train_and_evaluate_fold(fold_num, trainer, val_dataset, ner_dataset):
    """
    Train one fold, save its model and evaluate it on the validation split.

    Args:
        fold_num: Current fold number
        trainer: Trainer instance
        val_dataset: Validation dataset for this fold
        ner_dataset: NER dataset instance (provides ``id_to_label``)

    Returns:
        dict: ``'fold'`` plus the ``precision``, ``recall``, ``f1``,
        ``accuracy``, ``true_predictions`` and ``true_labels`` entries copied
        from the evaluation results.
    """
    print(f"\nüèãÔ∏è  Training fold {fold_num} with class weights...")
    trainer.train()

    print(f"üíæ Saving model for fold {fold_num}...")
    trainer.save_model()

    print(f"üìä Evaluating fold {fold_num}...")
    split_name = f"Fold {fold_num} Validation"
    eval_results = detailed_evaluation(
        trainer, val_dataset, split_name, ner_dataset.id_to_label
    )

    # Copy the reported entries into the fold result, in this fixed order
    reported_keys = (
        'precision', 'recall', 'f1', 'accuracy',
        'true_predictions', 'true_labels',
    )
    fold_result = {'fold': fold_num}
    fold_result.update((key, eval_results[key]) for key in reported_keys)

    print(f"\nFold {fold_num} completed successfully!")
    return fold_result

print("‚úÖ Training and evaluation helper function defined successfully!")

## 9. K-Fold Cross-Validation Training Loop with Class Weights

In [None]:
# ============================================================================
# MAIN K-FOLD CROSS-VALIDATION LOOP WITH CLASS WEIGHTS
# ============================================================================

# Imported once up front instead of on every loop iteration
from sklearn.metrics import confusion_matrix

# Start from an empty list so that re-running this cell does not append a
# second set of fold results to those collected by an earlier run
fold_results = []

print(f"Using device: {device}")
print(f"\n{'='*80}")
print(f"STARTING {N_FOLDS}-FOLD CROSS-VALIDATION WITH CLASS WEIGHTS")
print(f"{'='*80}")
print(f"Total examples: {len(prepared_examples)}")
print(f"Model: {MODEL_NAME}")
print(f"Device: {device}")

for fold_num, (train_idx, val_idx) in enumerate(kfold.split(examples_array), 1):
    print(f"\n{'='*80}")
    print(f"FOLD {fold_num}/{N_FOLDS}")
    print(f"{'='*80}")
    print(f"Train indices: {len(train_idx)}, Val indices: {len(val_idx)}")

    # Split data for this fold
    train_examples = examples_array[train_idx].tolist()
    val_examples = examples_array[val_idx].tolist()

    print(f"Training examples: {len(train_examples)}")
    print(f"Validation examples: {len(val_examples)}")

    # Analyze entity distributions for this fold
    print(f"\nüìä Analyzing entity distributions...")
    train_dist = analyze_entity_distribution_per_fold(train_examples, f"Fold {fold_num} - Training")
    val_dist = analyze_entity_distribution_per_fold(val_examples, f"Fold {fold_num} - Validation")

    # Prepare data for this fold with class weights
    print(f"\nüî§ Preparing data for fold {fold_num} with class weights...")
    train_dataset, val_dataset, data_collator, class_weights = prepare_fold_data_with_class_weights(
        train_examples, val_examples, tokenizer, ner_dataset
    )

    print(f"üì¶ Fold {fold_num} datasets:")
    print(f"  Training: {len(train_dataset)} examples")
    print(f"  Validation: {len(val_dataset)} examples")

    # Create model and weighted trainer for this fold
    print(f"\nü§ñ Creating model and weighted trainer for fold {fold_num}...")
    model, trainer, metrics_callback, fold_output_dir = create_model_and_weighted_trainer(
        fold_num, train_dataset, val_dataset, data_collator, tokenizer, ner_dataset, class_weights, device
    )

    # Train and evaluate this fold
    fold_result = train_and_evaluate_fold(fold_num, trainer, val_dataset, ner_dataset)

    # Get predictions and labels for aggregation
    # NOTE(review): fold_result already carries 'true_predictions' and
    # 'true_labels' from the evaluation step; this second prediction pass may
    # be redundant -- confirm against detailed_evaluation before removing it.
    print(f"\nüìä Getting predictions for fold {fold_num}...")
    predictions, labels, _ = trainer.predict(val_dataset)
    predictions = np.argmax(predictions, axis=2)

    # Convert IDs to label names, dropping positions whose gold label is -100
    true_labels = [
        [ner_dataset.id_to_label[label_id] for label_id in label_row if label_id != -100]
        for label_row in labels
    ]
    pred_labels = [
        [
            ner_dataset.id_to_label[pred_id]
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        for pred_row, label_row in zip(predictions, labels)
    ]

    # Confusion matrix over every label seen in either the gold or predicted tags
    flat_true = [label for seq in true_labels for label in seq]
    flat_pred = [label for seq in pred_labels for label in seq]
    all_labels = sorted(set(flat_true) | set(flat_pred))
    cm = confusion_matrix(flat_true, flat_pred, labels=all_labels)

    # Generate classification report for this fold
    per_class_metrics = generate_detailed_classification_report(
        true_labels, pred_labels, fold_output_dir, fold_num, "Class Weights Validation"
    )

    # Store comprehensive data for aggregation
    fold_result['distributions'] = {'train': train_dist, 'val': val_dist}
    fold_result['per_class_metrics'] = per_class_metrics
    fold_result['confusion_matrix'] = cm
    fold_result['labels'] = all_labels
    fold_result['training_history'] = metrics_callback.get_training_history()
    fold_results.append(fold_result)

    # Drop references to this fold's objects and release cached GPU memory
    del model, trainer, train_dataset, val_dataset, metrics_callback
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n{'='*80}")
print(f"{N_FOLDS}-FOLD CROSS-VALIDATION WITH CLASS WEIGHTS COMPLETED")
print(f"{'='*80}")

## 10. Aggregate Results Across Folds

In [None]:
# ============================================================================
# AGGREGATE RESULTS ACROSS ALL FOLDS
# ============================================================================

import json


def _to_jsonable(value):
    """Convert NumPy arrays/scalars to plain Python; fall back to str() otherwise."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


print(f"\n{'='*80}")
print(f"GENERATING AGGREGATE REPORT ACROSS ALL {N_FOLDS} FOLDS")
print(f"{'='*80}")

# Create aggregate report with all visualizations displayed in notebook
aggregate_report = create_aggregate_report_across_folds(
    fold_results=fold_results,
    model_name="BERTiƒá Base with Class Weights",
    display=True
)

# Collect the per-fold values of each overall metric
precisions = [fold['precision'] for fold in fold_results]
recalls = [fold['recall'] for fold in fold_results]
f1_scores = [fold['f1'] for fold in fold_results]
accuracies = [fold['accuracy'] for fold in fold_results]

# Compute mean/std once; the same numbers feed both the printed summary and
# the JSON report below
overall_metrics = {}
for metric_name, values in (
    ('precision', precisions),
    ('recall', recalls),
    ('f1', f1_scores),
    ('accuracy', accuracies),
):
    overall_metrics[f'{metric_name}_mean'] = float(np.mean(values))
    overall_metrics[f'{metric_name}_std'] = float(np.std(values))

# Print summary
print(f"\n{'='*80}")
print(f"FINAL RESULTS - BERTiƒá Base with Class Weights ({N_FOLDS}-Fold CV)")
print(f"{'='*80}")
print(f"\nOverall Metrics (Mean ¬± Std):")
print(f"  Precision: {overall_metrics['precision_mean']:.4f} ¬± {overall_metrics['precision_std']:.4f}")
print(f"  Recall:    {overall_metrics['recall_mean']:.4f} ¬± {overall_metrics['recall_std']:.4f}")
print(f"  F1-score:  {overall_metrics['f1_mean']:.4f} ¬± {overall_metrics['f1_std']:.4f}")
print(f"  Accuracy:  {overall_metrics['accuracy_mean']:.4f} ¬± {overall_metrics['accuracy_std']:.4f}")

# Save aggregate report. NumPy values inside fold_results (for example the
# per-fold confusion matrix) are written as JSON lists/numbers rather than as
# their string representation, so the file can be parsed back losslessly.
aggregate_report_path = f"{OUTPUT_DIR}/aggregate_report.json"
with open(aggregate_report_path, 'w', encoding='utf-8') as f:
    json.dump({
        'model_name': 'BERTiƒá Base with Class Weights',
        'n_folds': N_FOLDS,
        'overall_metrics': overall_metrics,
        'fold_results': fold_results
    }, f, indent=2, default=_to_jsonable)
print(f"\nüíæ Saved aggregate report to {aggregate_report_path}")

print(f"\n‚úÖ All results saved to: {OUTPUT_DIR}")
print(f"\nüìä All visualizations displayed in notebook above.")