# Serbian Legal NER Pipeline with Domain-Adaptive Pretraining (DAPT) - Refactored

This notebook demonstrates Domain-Adaptive Pretraining (DAPT) using Masked Language Modeling (MLM) 
on Serbian legal documents before fine-tuning for NER using shared modules.

## Key Features:
- **Domain-Adaptive Pretraining**: MLM on unlabeled legal documents
- **Two-Stage Training**: MLM pretraining → NER fine-tuning
- **Legal Domain Knowledge**: Better understanding of legal terminology
- **Improved Performance**: Better generalization on legal NER tasks

## Training Pipeline:
1. **Stage 1**: MLM pretraining on unlabeled legal documents (3 epochs, 5e-5 LR)
2. **Stage 2**: NER fine-tuning on labeled data using adapted model

In [None]:
# Mount Google Drive at /content/drive; the shared-module path added to
# sys.path later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages
!pip install transformers torch datasets tokenizers scikit-learn seqeval pandas numpy matplotlib seaborn tqdm

In [None]:
# Import shared modules
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add the shared modules to path
# NOTE(review): `import shared` below only resolves if some directory on
# sys.path *contains* a `shared` package. The entry appended here itself ends
# in `/shared` - confirm the Drive layout, or whether the parent `.../ner`
# directory is what should be on the path.
sys.path.append('/content/drive/MyDrive/NER_Master/ner/shared')

# Reload shared modules to get latest changes
# The submodules are reloaded first and the `shared` package last, so the
# package-level reload runs against the already-refreshed submodules
# (presumably `shared/__init__.py` re-exports their names - confirm).
import importlib
import shared
import shared.model_utils
import shared.data_processing
import shared.dataset
import shared.evaluation
import shared.config
importlib.reload(shared.config)
importlib.reload(shared.data_processing)
importlib.reload(shared.dataset)
importlib.reload(shared.model_utils)
importlib.reload(shared.evaluation)
importlib.reload(shared)

# Import from shared modules
from shared import (
    # Configuration
    ENTITY_TYPES, BIO_LABELS, DEFAULT_TRAINING_ARGS,
    get_default_model_config, get_paths, setup_environment, get_default_training_args,
    
    # Data processing
    LabelStudioToBIOConverter, load_labelstudio_data, 
    analyze_labelstudio_data, validate_bio_examples, load_mlm_documents,
    
    # Dataset
    NERDataset, split_dataset, tokenize_and_align_labels_with_sliding_window,
    print_sequence_analysis, create_huggingface_datasets,
    
    # Model utilities
    load_model_and_tokenizer, create_training_arguments, create_trainer,
    detailed_evaluation, save_model_info, setup_device_and_seed,
    load_inference_pipeline,
    
    
    # Evaluation
    generate_evaluation_report, plot_training_history, plot_entity_distribution
)

from transformers import DataCollatorForTokenClassification, AutoTokenizer

# Setup device and random seed
# 42 is the seed value handed to the shared helper; the device it returns is
# reported on the next line.
device = setup_device_and_seed(42)
print(f"🔧 Using device: {device}")

## 2. Configuration and Environment Setup

In [None]:
# Resolve the Colab (non-local) environment; directories are created on demand
env_setup = setup_environment(use_local=False, create_dirs=True)
paths = env_setup['paths']

# Checkpoint that both training stages start from
BASE_MODEL_NAME = "classla/bcms-bertic"

# Stage 1 hyper-parameters: masked language modelling (notebook-specific)
mlm_config = dict(
    num_epochs=3,
    batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    max_length=512,
    stride=256,
    mlm_probability=0.15,
    save_steps=1000,
    logging_steps=100,
)

# Stage 2 hyper-parameters: NER fine-tuning
experiment_config = dict(
    num_train_epochs=8,
    learning_rate=3e-5,
    batch_size=4,
    max_length=512,
    stride=128,
)

# One output directory per stage, both under the shared models directory
MLM_OUTPUT_DIR = f"{paths['models_dir']}/bertic_dapt_mlm_adapted"
NER_OUTPUT_DIR = f"{paths['models_dir']}/bertic_dapt_ner_refactored"
os.makedirs(MLM_OUTPUT_DIR, exist_ok=True)
os.makedirs(NER_OUTPUT_DIR, exist_ok=True)

# Echo the resolved locations
print("🔧 DAPT Configuration:")
print(f"  Base model: {BASE_MODEL_NAME}")
print(f"  MLM output: {MLM_OUTPUT_DIR}")
print(f"  NER output: {NER_OUTPUT_DIR}")
print(f"  MLM data: {paths['mlm_data_dir']}")

# Echo the MLM hyper-parameters defined above
print(
    "\n📚 MLM Configuration:",
    f"  Epochs: {mlm_config['num_epochs']}",
    f"  Batch size: {mlm_config['batch_size']}",
    f"  Learning rate: {mlm_config['learning_rate']}",
    f"  Max length: {mlm_config['max_length']}",
    sep="\n",
)

## 3. MLM Pretraining Functions (Notebook-specific)

In [None]:
# MLM Pretraining Functions (notebook-specific)
from typing import List

from datasets import Dataset
from tqdm import tqdm
from transformers import (
    AutoModelForMaskedLM, DataCollatorForLanguageModeling,
    TrainingArguments, Trainer
)

def preprocess_mlm_data(documents: List[str], tokenizer, max_length: int = 512, stride: int = 256) -> "Dataset":
    """Cut raw documents into overlapping text chunks and tokenize them for MLM.

    Every document is tokenized and sliced into windows of at most
    ``max_length - 2`` tokens (two positions are reserved for the [CLS] and
    [SEP] tokens added at encoding time). Consecutive windows start ``stride``
    tokens apart. Windows holding 10 tokens or fewer are discarded. Slicing of
    a document stops as soon as a window reaches its last token, so no chunk is
    a pure suffix of the chunk before it.

    Args:
        documents: Raw document strings.
        tokenizer: Tokenizer exposing ``tokenize``, ``convert_tokens_to_string``
            and ``__call__`` (a Hugging Face tokenizer in this notebook).
        max_length: Maximum encoded sequence length, special tokens included.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Returns:
        The tokenized dataset produced by ``Dataset.map`` with the raw "text"
        column removed.

    Raises:
        ValueError: If ``stride`` is not positive or ``max_length`` leaves no
            room for content tokens.
    """
    window = max_length - 2  # -2 for [CLS] and [SEP]
    if window <= 0:
        raise ValueError(f"max_length must be at least 3, got {max_length}")
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    # Split long documents into chunks
    chunks = []
    for doc in tqdm(documents, desc="Processing MLM documents"):
        tokens = tokenizer.tokenize(doc)

        for start in range(0, len(tokens), stride):
            chunk_tokens = tokens[start:start + window]
            if len(chunk_tokens) > 10:  # Only keep chunks with sufficient content
                chunks.append(tokenizer.convert_tokens_to_string(chunk_tokens))
            if start + window >= len(tokens):
                # This window already covers the end of the document; any
                # further window would only repeat a suffix of it.
                break

    print(f"📄 Created {len(chunks)} text chunks for MLM training")

    dataset = Dataset.from_dict({"text": chunks})

    def tokenize_function(examples):
        # Re-encoding the chunk text can yield more than `window` tokens, so
        # truncation stays on. padding=True pads each mapped batch to its
        # longest sequence.
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=True,
            max_length=max_length,
            return_special_tokens_mask=True
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )

    return tokenized_dataset

def perform_mlm_pretraining(model, tokenizer, train_dataset, output_dir: str, mlm_config: dict):
    """Run masked-language-model training and save the adapted model.

    Args:
        model: Model to adapt. If it exposes no output (LM head) embeddings it
            is reloaded as a masked-LM model from ``model.name_or_path``.
        tokenizer: Tokenizer matching ``model``; it drives the masking collator
            and is saved next to the model.
        train_dataset: Tokenized dataset to train on.
        output_dir: Directory for checkpoints, logs and the final model.
        mlm_config: Dict providing ``mlm_probability``, ``num_epochs``,
            ``batch_size``, ``save_steps``, ``learning_rate``,
            ``weight_decay``, ``warmup_steps`` and ``logging_steps``.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    # Convert model to MLM model if needed. Checking for output embeddings is
    # architecture-agnostic; an attribute check on `cls` only recognises
    # BERT-style heads and forces a pointless reload for other MLM models.
    if model.get_output_embeddings() is None:
        model = AutoModelForMaskedLM.from_pretrained(model.name_or_path)

    # Data collator for MLM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=mlm_config['mlm_probability']
    )

    # Training arguments for MLM
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=mlm_config['num_epochs'],
        per_device_train_batch_size=mlm_config['batch_size'],
        save_steps=mlm_config['save_steps'],
        save_total_limit=2,
        prediction_loss_only=True,
        learning_rate=mlm_config['learning_rate'],
        weight_decay=mlm_config['weight_decay'],
        warmup_steps=mlm_config['warmup_steps'],
        logging_dir=f"{output_dir}/logs",
        logging_steps=mlm_config['logging_steps'],
        dataloader_num_workers=0,
        remove_unused_columns=False,
        push_to_hub=False,
        # The string "none" is the documented way to disable reporting
        # integrations; None can be interpreted as "all installed".
        report_to="none"
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    print(f"🚀 Starting MLM pretraining...")
    trainer.train()

    # Save the adapted model together with its tokenizer so the directory can
    # be loaded as a self-contained checkpoint.
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    print(f"✅ MLM pretraining completed. Model saved to {output_dir}")
    return trainer

## 4. Stage 1: Domain-Adaptive Pretraining (MLM)

In [None]:
# Read the unlabeled corpus used for domain adaptation
print("📚 Loading MLM documents for domain adaptation...")
mlm_documents = load_mlm_documents(paths['mlm_data_dir'])

# Stop early when nothing was loaded; later stages need this corpus
if not mlm_documents:
    print("❌ No MLM documents found. Please check your MLM data directory.")
    print(f"Expected path: {paths['mlm_data_dir']}")
    raise Exception("MLM data loading failed")

print(f"✅ Loaded {len(mlm_documents)} documents for MLM training")

# Character-level corpus statistics
total_chars = sum(map(len, mlm_documents))
avg_chars = total_chars / len(mlm_documents)
print("📊 MLM Data Statistics:")
print(f"  Total documents: {len(mlm_documents)}")
print(f"  Total characters: {total_chars:,}")
print(f"  Average document length: {avg_chars:.0f} characters")

In [None]:
# Perform MLM pretraining
import traceback

print("🚀 Starting Domain-Adaptive Pretraining (MLM)...")

try:
    # Load base model and tokenizer for MLM
    from transformers import AutoModelForMaskedLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL_NAME)

    # Preprocess MLM data
    print("📄 Preprocessing MLM data...")
    mlm_dataset = preprocess_mlm_data(
        mlm_documents,
        tokenizer,
        max_length=mlm_config['max_length'],
        stride=mlm_config['stride']
    )

    # Perform MLM pretraining
    print("🚀 Starting MLM pretraining...")
    mlm_trainer = perform_mlm_pretraining(
        model=model,
        tokenizer=tokenizer,
        train_dataset=mlm_dataset,
        output_dir=MLM_OUTPUT_DIR,
        mlm_config=mlm_config
    )

    print(f"✅ MLM pretraining completed successfully!")
    print(f"📁 Domain-adapted model saved to: {MLM_OUTPUT_DIR}")
    ADAPTED_MODEL_NAME = MLM_OUTPUT_DIR
    mlm_success = True

except Exception as e:
    # Deliberate best-effort: the notebook continues with the base model.
    # The full traceback is printed because this handler also catches
    # programming errors (e.g. an undefined helper), which would otherwise be
    # reduced to a one-line message and easily overlooked.
    print(f"❌ MLM pretraining failed: {e}")
    traceback.print_exc()
    print("Using base model for NER training")
    ADAPTED_MODEL_NAME = BASE_MODEL_NAME
    mlm_success = False

print(f"\n🎯 Will use model for NER: {ADAPTED_MODEL_NAME}")

## 4b. Stage 2: NER Data Loading and Preprocessing

In [None]:
# Read the LabelStudio export that holds the NER annotations
print("📋 Loading NER training data...")
labelstudio_data = load_labelstudio_data(paths['labelstudio_json'])

# Nothing to train on: report and abort
if not labelstudio_data:
    print("❌ No NER data loaded. Please check your paths.")
    raise Exception("NER data loading failed")

analysis = analyze_labelstudio_data(labelstudio_data)

# LabelStudio annotations -> BIO-tagged examples
converter = LabelStudioToBIOConverter(
    judgments_dir=paths['judgments_dir'],
    labelstudio_files_dir=paths.get('labelstudio_files_dir')
)
bio_examples = converter.convert_to_bio(labelstudio_data)
print(f"✅ Converted {len(bio_examples)} examples to BIO format")

# Keep the examples returned by validation, plus its statistics
valid_examples, stats = validate_bio_examples(bio_examples)
print(f"📊 Validation complete: {stats['valid_examples']} valid examples")

## 5. NER Dataset Preparation

In [None]:
# Wrap the validated examples and get them into training form
ner_dataset = NERDataset(valid_examples)
prepared_examples = ner_dataset.prepare_for_training()

# Train / validation / test partition with a fixed random state
train_examples, val_examples, test_examples = split_dataset(
    prepared_examples,
    random_state=42,
    val_size=0.1,
    test_size=0.2,
)

# Report the size of each partition and the label count
print(
    "📊 NER Dataset split:",
    f"  Training: {len(train_examples)} examples",
    f"  Validation: {len(val_examples)} examples",
    f"  Test: {len(test_examples)} examples",
    sep="\n",
)
print(f"  Total labels: {ner_dataset.get_num_labels()}")

## 6. Load Domain-Adapted Model for NER

In [None]:
# Build the token-classification model from the checkpoint chosen in Stage 1
print("🔄 Loading domain-adapted model for NER fine-tuning...")

model, tokenizer = load_model_and_tokenizer(
    ADAPTED_MODEL_NAME,  # MLM output directory, or the base model after a fallback
    ner_dataset.get_num_labels(),
    ner_dataset.id_to_label,
    ner_dataset.label_to_id
)

print("✅ Domain-adapted model loaded for NER")
# Total number of scalar parameters across all parameter tensors
print(f"📊 Model parameters: {sum(map(lambda p: p.numel(), model.parameters())):,}")
print("🧠 Model has been pre-adapted to legal domain via MLM")

## 7. NER Data Tokenization

In [None]:
# Apply the same sliding-window tokenization to all three partitions
print("🔤 Tokenizing NER datasets with domain-adapted tokenizer...")

# Order is train, validation, test - matching the unpacking on the left
train_tokenized, val_tokenized, test_tokenized = [
    tokenize_and_align_labels_with_sliding_window(
        split_examples, tokenizer, ner_dataset.label_to_id,
        max_length=experiment_config['max_length'],
        stride=experiment_config['stride']
    )
    for split_examples in (train_examples, val_examples, test_examples)
]

# Wrap the tokenized partitions as HuggingFace datasets
train_dataset, val_dataset, test_dataset = create_huggingface_datasets(
    train_tokenized, val_tokenized, test_tokenized
)

# Collator that pads each batch and returns PyTorch tensors
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="pt",
    padding=True,
)

print("✅ NER tokenization complete")

## 8. NER Training Setup

In [None]:
# Create training arguments for NER fine-tuning
#
# experiment_config defines a single 'batch_size' and has no 'warmup_steps' or
# 'weight_decay' entries, so indexing those keys directly raises KeyError.
# - Train and eval batch sizes fall back to 'batch_size' unless a dedicated
#   per-device key is present.
# - warmup_steps / weight_decay are forwarded only when configured.
#   NOTE(review): this relies on create_training_arguments having its own
#   defaults for them - confirm against shared.model_utils.
_optional_overrides = {
    key: experiment_config[key]
    for key in ('warmup_steps', 'weight_decay')
    if key in experiment_config
}

training_args = create_training_arguments(
    output_dir=NER_OUTPUT_DIR,
    num_train_epochs=experiment_config['num_train_epochs'],
    per_device_train_batch_size=experiment_config.get(
        'per_device_train_batch_size', experiment_config['batch_size']
    ),
    per_device_eval_batch_size=experiment_config.get(
        'per_device_eval_batch_size', experiment_config['batch_size']
    ),
    learning_rate=experiment_config['learning_rate'],
    logging_steps=50,
    eval_steps=100,
    save_steps=500,
    early_stopping_patience=3,
    **_optional_overrides
)

# Create trainer for NER fine-tuning
trainer = create_trainer(
    model=model,
    training_args=training_args,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    id_to_label=ner_dataset.id_to_label,
    early_stopping_patience=3
)

print("🏋️  NER trainer created for domain-adapted model")

## 9. NER Fine-tuning

In [None]:
# Start NER fine-tuning on the model selected in Stage 1.
# `mlm_success` (set by the MLM cell) tells whether that model is really the
# domain-adapted checkpoint or the base model used as a fallback; messages and
# saved metadata follow it instead of always claiming DAPT.
print("🚀 Starting NER fine-tuning on domain-adapted model...")
if mlm_success:
    print("🧠 Model already adapted to legal domain via MLM pretraining")
else:
    print("⚠️ MLM pretraining did not complete; fine-tuning the base model instead")

trainer.train()

print("💾 Saving DAPT NER model...")
trainer.save_model()
tokenizer.save_pretrained(NER_OUTPUT_DIR)

# Save model info with DAPT details
save_model_info(
    output_dir=NER_OUTPUT_DIR,
    model_name=BASE_MODEL_NAME,
    model_type="dapt_ner",
    num_labels=ner_dataset.get_num_labels(),
    id_to_label=ner_dataset.id_to_label,
    label_to_id=ner_dataset.label_to_id,
    training_args=training_args,
    additional_info={
        "base_model": BASE_MODEL_NAME,
        "mlm_adapted_model": ADAPTED_MODEL_NAME,
        "mlm_epochs": mlm_config['num_epochs'],
        "mlm_lr": mlm_config['learning_rate'],
        # Record what actually happened, not what was planned
        "uses_dapt": mlm_success,
        "training_stages": (
            ["MLM_pretraining", "NER_finetuning"] if mlm_success
            else ["NER_finetuning"]
        )
    }
)

print("✅ DAPT NER training completed!")

## 10. Model Evaluation

In [None]:
# Score the fine-tuned model on the held-out test partition
print("📊 Evaluating DAPT model on test set...")

test_results = detailed_evaluation(
    trainer=trainer,
    dataset=test_dataset,
    dataset_name="Test (DAPT)",
    id_to_label=ner_dataset.id_to_label
)

# Headline metrics, four decimal places each
print("\n📈 DAPT Test Results:")
print("  Precision: {0[precision]:.4f}".format(test_results))
print("  Recall: {0[recall]:.4f}".format(test_results))
print("  F1-score: {0[f1]:.4f}".format(test_results))
print("  Accuracy: {0[accuracy]:.4f}".format(test_results))

# Static reminder of what DAPT is expected to bring
print(
    "\n💡 Expected improvements with DAPT:",
    "  ✅ Better understanding of legal terminology",
    "  ✅ Improved contextual representations",
    "  ✅ Better generalization on legal texts",
    "  ✅ Enhanced domain-specific knowledge",
    sep="\n",
)

## 11. Comprehensive Analysis

In [None]:
# Build the evaluation report from the test-set labels and predictions
evaluation_report = generate_evaluation_report(
    true_labels=test_results['true_labels'],
    predictions=test_results['true_predictions'],
    dataset_name="Test (DAPT)",
    focus_entities=["COURT", "JUDGE", "DEFENDANT", "CRIMINAL_ACT", "PROVISION"]
)

# Recap of the MLM stage settings
print("\n🧠 DAPT Benefits Analysis:")
print(
    "\n📚 MLM Pretraining Stage:",
    f"  • Trained on {len(mlm_documents)} legal documents",
    f"  • {mlm_config['num_epochs']} epochs of domain adaptation",
    f"  • Learning rate: {mlm_config['learning_rate']}",
    "  • Model adapted to legal vocabulary and syntax",
    sep="\n",
)

# Recap of the NER stage settings
print(
    "\n🎯 NER Fine-tuning Stage:",
    "  • Started from domain-adapted model",
    f"  • {experiment_config['num_train_epochs']} epochs of NER training",
    f"  • Learning rate: {experiment_config['learning_rate']}",
    "  • Better initialization for legal NER task",
    sep="\n",
)

## 12. Training History and Visualization

In [None]:
# Plot training history from the NER trainer created earlier in the notebook
plot_training_history(trainer)

# Plot entity distribution using the 'entity_counts' entry of the dataset's
# label statistics
label_stats = ner_dataset.get_label_statistics()
plot_entity_distribution(label_stats['entity_counts'])

## 13. Inference Pipeline Testing

In [None]:
# Build an inference pipeline from the saved NER model, reusing the
# sliding-window settings of the NER stage
pipeline = load_inference_pipeline(
    model_path=NER_OUTPUT_DIR,
    stride=experiment_config['stride'],
    max_length=experiment_config['max_length'],
)

# Short Serbian (Cyrillic) judgment excerpt used as a smoke test
sample_text = (
    "Основни суд у Београду донео је пресуду у кривичном предмету К-1234/2023 против оптуженог Марка Петровића за кривично дело крађе из члана 203 Кривичног законика. "
    "Судија Ана Николић изрекла је казну затвора у трајању од 6 месеци."
)

print("🔍 Testing DAPT inference pipeline:")
print(f"Input text: {sample_text}")
print("\n📋 Detected entities (with DAPT):")

# One line per predicted entity
entities = pipeline.predict(sample_text)
for entity in entities:
    print("  {0[label]}: '{0[text]}' (tokens {0[start]}-{0[end]})".format(entity))

print(f"\n✅ Found {len(entities)} entities using DAPT model")
print("🧠 Model benefits from legal domain knowledge via MLM pretraining")

## 14. Summary and Results

In [None]:
# Final recap: data sizes, label inventory, settings of both stages,
# test metrics and output locations.
print(
    "\n🎯 DAPT FINAL SUMMARY",
    "=" * 50,
    f"Base model: {BASE_MODEL_NAME}",
    f"Training examples: {len(train_examples)}",
    f"Validation examples: {len(val_examples)}",
    f"Test examples: {len(test_examples)}",
    f"Entity types: {len(ENTITY_TYPES)}",
    f"BIO labels: {len(BIO_LABELS)}",
    sep="\n",
)

print(
    "\n🧠 DAPT Configuration:",
    f"  MLM documents: {len(mlm_documents)}",
    f"  MLM epochs: {mlm_config['num_epochs']}",
    f"  MLM learning rate: {mlm_config['learning_rate']}",
    f"  NER epochs: {experiment_config['num_train_epochs']}",
    f"  NER learning rate: {experiment_config['learning_rate']}",
    sep="\n",
)

print(
    "\nTest Performance:",
    "  Precision: {0[precision]:.4f}".format(test_results),
    "  Recall: {0[recall]:.4f}".format(test_results),
    "  F1-score: {0[f1]:.4f}".format(test_results),
    "  Accuracy: {0[accuracy]:.4f}".format(test_results),
    sep="\n",
)

print(
    "\nModels saved to:",
    f"  MLM adapted: {MLM_OUTPUT_DIR}",
    f"  NER final: {NER_OUTPUT_DIR}",
    sep="\n",
)

print("\n✅ DAPT pipeline completed successfully!")
print(
    "\n💡 DAPT advantages:",
    "   • Domain-specific language understanding",
    "   • Better legal terminology comprehension",
    "   • Improved contextual representations",
    "   • Enhanced generalization on legal texts",
    sep="\n",
)