# Serbian Legal NER Pipeline with XLM-RoBERTa BERTić - Refactored

This notebook demonstrates the multilingual XLM-RoBERTa BERTić approach for Serbian Legal NER using shared modules.
XLM-RoBERTa BERTić combines multilingual capabilities with Serbian language specialization.

## Key Features:
- **Multilingual Foundation**: XLM-RoBERTa base with cross-lingual knowledge
- **Serbian Specialization**: Fine-tuned on Serbian texts (BERTić)
- **Cross-lingual Transfer**: Benefits from multilingual pretraining
- **Robust Performance**: Better generalization across language variants

## Model Comparison:
- **BERTić**: Monolingual Serbian BERT
- **XLM-R BERTić**: Multilingual XLM-RoBERTa adapted for Serbian

In [None]:
# Colab-only step: attach Google Drive to this runtime. The shared modules
# added to sys.path later in the notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages
!pip install transformers torch datasets tokenizers scikit-learn seqeval pandas numpy matplotlib seaborn tqdm

In [None]:
# Import shared modules
import sys
import os
import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Make the project's `shared` package importable.
SHARED_MODULES_DIR = '/content/drive/MyDrive/NER_Master/ner/shared'

# `import shared` needs a sys.path entry that *contains* a package named
# `shared`. The directory above is itself called `shared`, so its parent is
# registered as a fallback (after the original entry, so the original
# resolution order is unchanged).
# NOTE(review): confirm the Drive layout and drop whichever entry is unneeded.
# The membership check keeps sys.path free of duplicates when this cell is
# re-run in the same kernel.
for _search_path in (SHARED_MODULES_DIR, os.path.dirname(SHARED_MODULES_DIR)):
    if _search_path not in sys.path:
        sys.path.append(_search_path)

# Reload shared modules to get latest changes
import importlib
import shared
import shared.model_utils
import shared.data_processing
import shared.dataset
import shared.evaluation
import shared.config

# Submodules are reloaded first and the package itself last, so the names the
# package re-exports are taken from the freshly reloaded submodules.
for _module in (
    shared.config,
    shared.data_processing,
    shared.dataset,
    shared.model_utils,
    shared.evaluation,
    shared,
):
    importlib.reload(_module)

# Import from shared modules
from shared import (
    # Configuration
    ENTITY_TYPES, BIO_LABELS, DEFAULT_TRAINING_ARGS,
    get_model_config, get_paths, setup_environment,
    
    # Data processing
    LabelStudioToBIOConverter, load_labelstudio_data, 
    analyze_labelstudio_data, validate_bio_examples,
    
    # Dataset
    NERDataset, split_dataset, tokenize_and_align_labels_with_sliding_window,
    print_sequence_analysis, create_huggingface_datasets,
    
    # Model utilities
    load_model_and_tokenizer, create_training_arguments, create_trainer,
    detailed_evaluation, save_model_info, setup_device_and_seed,
    
    # Inference and Evaluation
    load_inference_pipeline,
    generate_evaluation_report, plot_training_history, plot_entity_distribution
)

from transformers import DataCollatorForTokenClassification

# Hand a fixed seed to the shared helper, which returns the device to use.
RANDOM_SEED = 42
device = setup_device_and_seed(RANDOM_SEED)
print("üîß Using device: {}".format(device))

## 2. Configuration and Environment Setup

In [None]:
# Resolve the non-local (Google Colab) environment; `create_dirs=True` is
# passed through to the shared helper.
env_setup = setup_environment(use_local=False, create_dirs=True)
paths = env_setup['paths']

# Checkpoint identifier and its per-model settings from the shared config.
MODEL_NAME = "classla/xlm-r-bertic"
model_config = get_model_config(MODEL_NAME)

# Everything produced by this run is written below the models directory.
OUTPUT_DIR = f"{paths['models_dir']}/xlm_r_bertic_refactored"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Configuration summary.
print(f"üîß XLM-R BERTiƒá Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Entity types: {len(ENTITY_TYPES)}")
print(f"  BIO labels: {len(BIO_LABELS)}")
for setting_label, setting_key in (
    ("Max length", "max_length"),
    ("Learning rate", "learning_rate"),
    ("Batch size", "batch_size"),
    ("Epochs", "num_epochs"),
):
    print(f"  {setting_label}: {model_config[setting_key]}")

print(f"\nüåç Multilingual advantages:")
for advantage in (
    "Cross-lingual knowledge transfer",
    "Better handling of code-switching",
    "Robust to language variants",
    "Larger vocabulary coverage",
):
    print(f"  ‚úÖ {advantage}")

## 3. Data Loading and Preprocessing

In [None]:
# Load the LabelStudio export from the configured path.
labelstudio_data = load_labelstudio_data(paths['labelstudio_json'])

# Fail fast when nothing was loaded: every later cell depends on this data.
# RuntimeError is a subclass of Exception, so existing `except Exception`
# handlers still catch it; the message names the path that was tried.
if not labelstudio_data:
    print("‚ùå No data loaded. Please check your paths.")
    raise RuntimeError(
        f"Data loading failed: nothing was loaded from {paths['labelstudio_json']!r}"
    )

# Summary of the raw annotations (kept in `analysis` for inspection).
analysis = analyze_labelstudio_data(labelstudio_data)

# Convert to BIO format
converter = LabelStudioToBIOConverter(
    judgments_dir=paths['judgments_dir'],
    # `.get` yields None when this optional path is not configured.
    labelstudio_files_dir=paths.get('labelstudio_files_dir')
)

bio_examples = converter.convert_to_bio(labelstudio_data)
print(f"‚úÖ Converted {len(bio_examples)} examples to BIO format")

# Validate BIO examples; only the validated ones are used downstream.
valid_examples, stats = validate_bio_examples(bio_examples)
print(f"üìä Validation complete: {stats['valid_examples']} valid examples")

## 4. Dataset Preparation and Splitting

In [None]:
# Wrap the validated examples and obtain their training-ready form.
ner_dataset = NERDataset(valid_examples)
prepared_examples = ner_dataset.prepare_for_training()

# Train / validation / test partition with a fixed random_state.
split_options = {"test_size": 0.2, "val_size": 0.1, "random_state": 42}
train_examples, val_examples, test_examples = split_dataset(
    prepared_examples, **split_options
)

print(f"üìä Dataset split:")
for split_name, split_examples in (
    ("Training", train_examples),
    ("Validation", val_examples),
    ("Test", test_examples),
):
    print(f"  {split_name}: {len(split_examples)} examples")
print(f"  Total labels: {ner_dataset.get_num_labels()}")

# Ten most frequent entities, highest count first.
label_stats = ner_dataset.get_label_statistics()
print(f"\nüìà Label distribution:")
ranked_entities = sorted(
    label_stats['entity_counts'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for entity, count in ranked_entities[:10]:
    print(f"  {entity}: {count}")

## 5. XLM-R BERTić Model Loading

In [None]:
# Build the token-classification model and its tokenizer, sized to the
# dataset's label set and carrying both label mappings.
num_labels = ner_dataset.get_num_labels()
model, tokenizer = load_model_and_tokenizer(
    MODEL_NAME,
    num_labels,
    ner_dataset.id_to_label,
    ner_dataset.label_to_id,
)

print(f"‚úÖ XLM-R BERTiƒá model loaded successfully")
total_parameters = sum(param.numel() for param in model.parameters())
print(f"üìä Model parameters: {total_parameters:,}")
print(f"üåç Multilingual model with Serbian specialization")

# Tokenizer facts worth knowing before choosing window sizes.
print(f"\nüî§ Tokenizer information:")
print(f"  Vocabulary size: {tokenizer.vocab_size:,}")
print(f"  Model max length: {tokenizer.model_max_length}")
special_token_count = len(tokenizer.special_tokens_map)
print(f"  Special tokens: {special_token_count}")

## 6. Data Tokenization with XLM-R

In [None]:
# Report sequence lengths for the training and validation splits.
print("üìè Sequence length analysis with XLM-R tokenizer:")
for split_title, split_examples in (
    ("\nTraining set:", train_examples),
    ("\nValidation set:", val_examples),
):
    print(split_title)
    print_sequence_analysis(split_examples, tokenizer)

# Tokenize every split with identical sliding-window settings.
print("\nüî§ Tokenizing datasets with XLM-R tokenizer...")

window_options = {
    "max_length": model_config['max_length'],
    "stride": model_config['stride'],
}

# The generator is consumed left to right by the unpacking, so the splits are
# processed in the order train, validation, test.
train_tokenized, val_tokenized, test_tokenized = (
    tokenize_and_align_labels_with_sliding_window(
        split_examples, tokenizer, ner_dataset.label_to_id, **window_options
    )
    for split_examples in (train_examples, val_examples, test_examples)
)

# Turn the tokenized splits into HuggingFace datasets (same order in and out).
tokenized_splits = (train_tokenized, val_tokenized, test_tokenized)
train_dataset, val_dataset, test_dataset = create_huggingface_datasets(*tokenized_splits)

# Collator configured with padding enabled and PyTorch tensors as output.
collator_options = {"tokenizer": tokenizer, "padding": True, "return_tensors": "pt"}
data_collator = DataCollatorForTokenClassification(**collator_options)

print("‚úÖ Tokenization complete")

## 7. Training Setup

In [None]:
# The same patience value is handed to both factory functions below; a single
# name keeps the two in agreement.
EARLY_STOPPING_PATIENCE = 3

# Hyperparameters: model-specific values come from the shared config, the
# schedule and logging cadence are fixed here.
training_options = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": model_config['num_epochs'],
    "per_device_train_batch_size": model_config['batch_size'],
    "per_device_eval_batch_size": model_config['batch_size'],
    "learning_rate": model_config['learning_rate'],
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_steps": 50,
    "eval_steps": 100,
    "save_steps": 500,
    "early_stopping_patience": EARLY_STOPPING_PATIENCE,
}
training_args = create_training_arguments(**training_options)

# Assemble the trainer from the model, data, collator and label mapping.
trainer_options = {
    "model": model,
    "training_args": training_args,
    "train_dataset": train_dataset,
    "val_dataset": val_dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
    "id_to_label": ner_dataset.id_to_label,
    "early_stopping_patience": EARLY_STOPPING_PATIENCE,
}
trainer = create_trainer(**trainer_options)

print("üèãÔ∏è  XLM-R BERTiƒá trainer created successfully")

## 8. Model Training

In [None]:
# Fine-tune the model.
print("üöÄ Starting XLM-R BERTiƒá training...")
print("üåç Leveraging multilingual knowledge for Serbian legal NER")

trainer.train()

# Persist the model via the trainer and the tokenizer into OUTPUT_DIR.
print("üíæ Saving XLM-R BERTiƒá model...")
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

# Record the label mappings, training arguments and multilingual details
# alongside the saved model.
model_info = {
    "output_dir": OUTPUT_DIR,
    "model_name": MODEL_NAME,
    "model_type": "xlm_r_bertic",
    "num_labels": ner_dataset.get_num_labels(),
    "id_to_label": ner_dataset.id_to_label,
    "label_to_id": ner_dataset.label_to_id,
    "training_args": training_args,
    "additional_info": {
        "base_architecture": "XLM-RoBERTa",
        "language_specialization": "Serbian (BERTiƒá)",
        "multilingual": True,
        "vocab_size": tokenizer.vocab_size,
        "cross_lingual": True,
    },
}
save_model_info(**model_info)

print("‚úÖ XLM-R BERTiƒá training completed!")

## 9. Model Evaluation

In [None]:
# Score the trained model on the test dataset.
print("üìä Evaluating XLM-R BERTiƒá model on test set...")

evaluation_options = {
    "trainer": trainer,
    "dataset": test_dataset,
    "dataset_name": "Test (XLM-R BERTiƒá)",
    "id_to_label": ner_dataset.id_to_label,
}
test_results = detailed_evaluation(**evaluation_options)

# Headline metrics, four decimal places each.
print(f"\nüìà XLM-R BERTiƒá Test Results:")
for metric_label, metric_key in (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-score", "f1"),
    ("Accuracy", "accuracy"),
):
    print(f"  {metric_label}: {test_results[metric_key]:.4f}")

print(f"\nüí° Expected advantages of XLM-R BERTiƒá:")
for advantage in (
    "Better handling of multilingual contexts",
    "Robust to language variations",
    "Cross-lingual knowledge transfer",
    "Larger vocabulary coverage",
    "Better generalization capabilities",
):
    print(f"  ‚úÖ {advantage}")

## 10. Comprehensive Analysis

In [None]:
# Entity types the report is asked to focus on.
focus_entities = ["COURT", "JUDGE", "DEFENDANT", "CRIMINAL_ACT", "PROVISION"]

# Build the evaluation report from the labels and predictions collected during
# the test-set evaluation.
evaluation_report = generate_evaluation_report(
    true_labels=test_results['true_labels'],
    predictions=test_results['true_predictions'],
    dataset_name="Test (XLM-R BERTiƒá)",
    focus_entities=focus_entities,
)

# Show how the tokenizer splits a short sample phrase.
print("\nüåç Multilingual Model Analysis:")
print("\nüî§ Tokenization Comparison:")
sample_text = "–û—Å–Ω–æ–≤–Ω–∏ —Å—É–¥ —É –ë–µ–æ–≥—Ä–∞–¥—É"
tokens = tokenizer.tokenize(sample_text)
print(f"  Text: '{sample_text}'")
print(f"  XLM-R tokens: {tokens}")
token_count = len(tokens)
print(f"  Token count: {token_count}")

print("\nüéØ Model Strengths:")
for strength in (
    "Cross-lingual pretraining on 100+ languages",
    "Serbian language specialization via BERTiƒá",
    "Robust subword tokenization",
    "Better handling of out-of-vocabulary terms",
    "Improved performance on code-switching scenarios",
):
    print(f"  ‚Ä¢ {strength}")

## 11. Training History and Visualization

In [None]:
# Plot the trainer's history first, then the entity distribution.
plot_training_history(trainer)

# Label statistics are re-read here so this cell does not depend on the
# earlier cell that also computed them.
label_stats = ner_dataset.get_label_statistics()
entity_counts = label_stats['entity_counts']
plot_entity_distribution(entity_counts)

## 12. Inference Pipeline Testing

In [None]:
# Rebuild an inference pipeline from the saved model directory, reusing the
# window settings from the model configuration.
pipeline_options = {
    "model_path": OUTPUT_DIR,
    "max_length": model_config['max_length'],
    "stride": model_config['stride'],
}
pipeline = load_inference_pipeline(**pipeline_options)

# Sample text for a quick end-to-end check.
sample_text = """–û—Å–Ω–æ–≤–Ω–∏ —Å—É–¥ —É –ë–µ–æ–≥—Ä–∞–¥—É –¥–æ–Ω–µ–æ —ò–µ –ø—Ä–µ—Å—É–¥—É —É –∫—Ä–∏–≤–∏—á–Ω–æ–º –ø—Ä–µ–¥–º–µ—Ç—É –ö-1234/2023 –ø—Ä–æ—Ç–∏–≤ –æ–ø—Ç—É–∂–µ–Ω–æ–≥ –ú–∞—Ä–∫–∞ –ü–µ—Ç—Ä–æ–≤–∏—õ–∞ –∑–∞ –∫—Ä–∏–≤–∏—á–Ω–æ –¥–µ–ª–æ –∫—Ä–∞—í–µ –∏–∑ —á–ª–∞–Ω–∞ 203 –ö—Ä–∏–≤–∏—á–Ω–æ–≥ –∑–∞–∫–æ–Ω–∏–∫–∞. –°—É–¥–∏—ò–∞ –ê–Ω–∞ –ù–∏–∫–æ–ª–∏—õ –∏–∑—Ä–µ–∫–ª–∞ —ò–µ –∫–∞–∑–Ω—É –∑–∞—Ç–≤–æ—Ä–∞ —É —Ç—Ä–∞—ò–∞—ö—É –æ–¥ 6 –º–µ—Å–µ—Ü–∏."""

print("üîç Testing XLM-R BERTiƒá inference pipeline:")
print(f"Input text: {sample_text}")
print("\nüìã Detected entities (with XLM-R BERTiƒá):")

# One line per detected entity: label, surface text and start/end values.
entities = pipeline.predict(sample_text)
for entity in entities:
    label, surface = entity['label'], entity['text']
    print(f"  {label}: '{surface}' (tokens {entity['start']}-{entity['end']})")

print(f"\n‚úÖ Found {len(entities)} entities using XLM-R BERTiƒá")
print("üåç Model benefits from multilingual knowledge and Serbian specialization")

# Second check: a sentence that contains some English words.
mixed_text = "–°—É–¥ —ò–µ –¥–æ–Ω–µ–æ decision —É –ø—Ä–µ–¥–º–µ—Ç—É case number –ö-1234/2023."
print(f"\nüîÑ Testing with mixed language text:")
print(f"Input: {mixed_text}")
mixed_entities = pipeline.predict(mixed_text)
mixed_entity_count = len(mixed_entities)
print(f"Entities: {mixed_entity_count} found")
for entity in mixed_entities:
    print(f"  {entity['label']}: '{entity['text']}'")

## 13. Model Comparison Summary

In [None]:
print("\nüìä MODEL COMPARISON: BERTiƒá vs XLM-R BERTiƒá")
print("=" * 60)

print("\nüá∑üá∏ BERTiƒá (Monolingual):")
print("  ‚úÖ Specialized for Serbian language")
print("  ‚úÖ Smaller model size")
print("  ‚úÖ Faster inference")
print("  ‚ùå Limited to Serbian only")
print("  ‚ùå No cross-lingual knowledge")

print("\nüåç XLM-R BERTiƒá (Multilingual):")
print("  ‚úÖ Cross-lingual knowledge transfer")
print("  ‚úÖ Better handling of code-switching")
print("  ‚úÖ Robust to language variations")
print("  ‚úÖ Larger vocabulary coverage")
print("  ‚ùå Larger model size")
print("  ‚ùå Potentially slower inference")

print(f"\nüéØ Current XLM-R BERTiƒá Performance:")
print(f"  Precision: {test_results['precision']:.4f}")
print(f"  Recall: {test_results['recall']:.4f}")
print(f"  F1-score: {test_results['f1']:.4f}")
print(f"  Accuracy: {test_results['accuracy']:.4f}")

## 14. Summary and Results

In [None]:
print("\nüéØ XLM-R BERTiƒá FINAL SUMMARY")
print("=" * 50)
print(f"Model: {MODEL_NAME}")
print(f"Training examples: {len(train_examples)}")
print(f"Validation examples: {len(val_examples)}")
print(f"Test examples: {len(test_examples)}")
print(f"Entity types: {len(ENTITY_TYPES)}")
print(f"BIO labels: {len(BIO_LABELS)}")
print(f"\nModel Configuration:")
print(f"  Architecture: XLM-RoBERTa + Serbian specialization")
print(f"  Vocabulary size: {tokenizer.vocab_size:,}")
print(f"  Max length: {model_config['max_length']}")
print(f"  Learning rate: {model_config['learning_rate']}")
print(f"  Epochs: {model_config['num_epochs']}")
print(f"\nTest Performance:")
print(f"  Precision: {test_results['precision']:.4f}")
print(f"  Recall: {test_results['recall']:.4f}")
print(f"  F1-score: {test_results['f1']:.4f}")
print(f"  Accuracy: {test_results['accuracy']:.4f}")
print(f"\nModel saved to: {OUTPUT_DIR}")
print("\n‚úÖ XLM-R BERTiƒá pipeline completed successfully!")
print("\nüí° XLM-R BERTiƒá advantages:")
print("   ‚Ä¢ Multilingual foundation with Serbian specialization")
print("   ‚Ä¢ Cross-lingual knowledge transfer")
print("   ‚Ä¢ Better generalization capabilities")
print("   ‚Ä¢ Robust handling of language variations")