# Serbian Legal Named Entity Recognition (NER) Pipeline


## Entity Types
- **COURT**: Court institutions
- **DECISION_DATE**: Dates of legal decisions
- **CASE_NUMBER**: Case identifiers
- **CRIMINAL_ACT**: Criminal acts/charges
- **PROSECUTOR**: Prosecutor entities
- **DEFENDANT**: Defendant entities
- **JUDGE**: Judge names
- **REGISTRAR**: Court registrar
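- **VERDICT**: Verdict of the decision
- **SANCTION**: Sanctions/penalties (annotated via `SANCTION_TYPE` and `SANCTION_VALUE`)
- **SANCTION_TYPE**: Type of sanction
- **SANCTION_VALUE**: Value/duration of sanction
- **PROVISION**: Legal provisions (annotated as `PROVISION_MATERIAL` and `PROVISION_PROCEDURAL`)
- **PROCEDURE_COSTS**: Legal procedure costs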

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages
# !pip install transformers torch datasets tokenizers scikit-learn seqeval pandas numpy matplotlib seaborn tqdm



In [39]:
import json
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.4.1
CUDA available: False


## 2. Data Loading and Analysis

In [47]:
# Pipeline configuration: annotation export, directory with the annotated text
# files, the pre-trained checkpoint to fine-tune, and where the model is saved.
LABELSTUDIO_JSON_PATH = "annotations.json"
JUDGMENTS_DIR = "labelstudio_files"
MODEL_NAME = "classla/bcms-bertic"
OUTPUT_DIR = "./models/serbian-legal-ner"

# Output directory (created up front so later save steps cannot fail on a missing path)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load LabelStudio annotations (a JSON list with one entry per annotated document)
with open(LABELSTUDIO_JSON_PATH, 'r', encoding='utf-8') as f:
    labelstudio_data = json.load(f)

# Sanity check: the number of annotated documents should match the number of
# .txt files available in JUDGMENTS_DIR.
print(f"Loaded {len(labelstudio_data)} annotated documents")
print(f"Available judgment files: {len(list(Path(JUDGMENTS_DIR).glob('*.txt')))}")

Loaded 109 annotated documents
Available judgment files: 109


In [48]:
# Analyze the annotation structure
def analyze_labelstudio_data(data):
    """Count labelled spans per entity type in a LabelStudio export.

    Only results whose ``type`` is ``'labels'`` are counted; every label
    attached to such a result counts once. Missing or null ``annotations``,
    ``result``, ``value`` and ``labels`` fields are treated as empty instead
    of raising, so partially filled exports can still be analysed.

    Args:
        data: Iterable of LabelStudio task dicts (the parsed export JSON).

    Returns:
        A ``(total_annotations, entity_counts)`` tuple where
        ``entity_counts`` maps each label to its number of occurrences (in
        first-seen order) and ``total_annotations`` is the sum of the counts.
    """
    entity_counts = {}

    for item in data:
        for annotation in item.get('annotations') or []:
            for result in annotation.get('result') or []:
                if result.get('type') != 'labels':
                    continue
                # `value` or `labels` may be absent on malformed results.
                for label in (result.get('value') or {}).get('labels') or []:
                    entity_counts[label] = entity_counts.get(label, 0) + 1

    total_annotations = sum(entity_counts.values())
    return total_annotations, entity_counts

total_annotations, entity_counts = analyze_labelstudio_data(labelstudio_data)

print(f"Total annotations: {total_annotations}")
print("\nEntity distribution:")
for entity, count in sorted(entity_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  {entity}: {count}")

Total annotations: 3003

Entity distribution:
  DEFENDANT: 440
  PROVISION_MATERIAL: 434
  PROVISION_PROCEDURAL: 299
  CRIMINAL_ACT: 293
  COURT: 227
  PROSECUTOR: 224
  JUDGE: 214
  REGISTRAR: 207
  DECISION_DATE: 179
  CASE_NUMBER: 109
  VERDICT: 107
  PROCEDURE_COSTS: 92
  SANCTION_TYPE: 90
  SANCTION_VALUE: 88


## 3. Data Preprocessing and BIO Conversion

### Download LabelStudio Files (Recommended)

For best results, download the actual files that LabelStudio used for annotation. This eliminates any offset issues.

**Option 1: Manual download using PowerShell/curl:**
```bash
# Create directory
mkdir labelstudio_files

# Download files using PowerShell (Windows)
# Replace <YOUR_LABELSTUDIO_TOKEN> with your own access token; never commit a real token.
Invoke-WebRequest -Uri "https://app.humansignal.com/storage-data/uploaded/?filepath=upload/186137/5534cab7-judgment_K_959_2012.txt" -Headers @{Authorization = "Token <YOUR_LABELSTUDIO_TOKEN>"} -Method GET -OutFile "labelstudio_files/5534cab7-judgment_K_959_2012.txt"

# Or using curl (Linux/Mac)
curl -H "Authorization: Token <YOUR_LABELSTUDIO_TOKEN>" "https://app.humansignal.com/storage-data/uploaded/?filepath=upload/186137/5534cab7-judgment_K_959_2012.txt" -o "labelstudio_files/5534cab7-judgment_K_959_2012.txt"
```

**Option 2: Programmatic download (uncomment and run if needed):**

In [20]:
# Uncomment and run this cell to download LabelStudio files programmatically
# 
# import requests
# import os
# 
# def download_labelstudio_files():
#     TOKEN = os.environ["LABELSTUDIO_TOKEN"]  # read the access token from the environment; never hard-code it
#     os.makedirs("labelstudio_files", exist_ok=True)
#     
#     # Extract unique file paths from the data
#     file_paths = set()
#     for item in labelstudio_data:
#         file_path = item.get('data', {}).get('text', '') or item.get('file_upload', '')
#         if file_path:
#             file_paths.add(file_path)
#     
#     print(f"Downloading {len(file_paths)} files...")
#     
#     for file_path in file_paths:
#         filename = file_path.split('/')[-1]
#         output_path = f"labelstudio_files/{filename}"
#         
#         if os.path.exists(output_path):
#             print(f"Skipping {filename} (already exists)")
#             continue
#         
#         url = f"https://app.humansignal.com/storage-data/uploaded/?filepath={file_path}"
#         headers = {"Authorization": f"Token {TOKEN}"}
#         
#         try:
#             response = requests.get(url, headers=headers)
#             response.raise_for_status()
#             
#             with open(output_path, 'w', encoding='utf-8') as f:
#                 f.write(response.text)
#             
#             print(f"Downloaded: {filename} ({len(response.text)} chars)")
#         except Exception as e:
#             print(f"Error downloading {filename}: {e}")
# 
# # Uncomment the next line to run the download
# # download_labelstudio_files()

In [36]:
class LabelStudioToBIOConverter:
    """Convert LabelStudio span annotations into whitespace-tokenized BIO examples.

    Texts are read from ``labelstudio_files_dir``, tokens are produced with
    ``str.split()``, and every labelled character span is projected onto the
    tokens it touches (first token ``B-<TYPE>``, the rest ``I-<TYPE>``).
    """

    def __init__(
        self,
        judgments_dir: str = None,
        labelstudio_files_dir: str = "labelstudio_files",
    ):
        """Store the input directories.

        Args:
            judgments_dir: Optional directory of judgment files. It is only
                stored on the instance; ``load_text_file`` does not read
                from it.
            labelstudio_files_dir: Directory holding the text files that
                were annotated in LabelStudio.
        """
        self.judgments_dir = Path(judgments_dir) if judgments_dir else None
        self.labelstudio_files_dir = Path(labelstudio_files_dir)
        # Every entity type encountered so far by _create_bio_example.
        self.entity_types = set()

    def load_text_file(self, filename: str) -> Optional[str]:
        """Return the text of ``filename`` from the LabelStudio files directory.

        Only the base name of ``filename`` is used, so both plain names and
        path-like references (with ``/`` or ``\\`` separators) are accepted.

        Args:
            filename: File name or path-like reference taken from the export.

        Returns:
            The file content, or ``None`` (after printing a warning) when the
            reference is empty, the file does not exist, or it cannot be read.
        """
        # Normalise Windows separators so the base name is found either way.
        actual_filename = (filename or "").replace("\\", "/").rsplit("/", 1)[-1]

        # An empty name would resolve to the directory itself, so skip it.
        if actual_filename:
            labelstudio_file = self.labelstudio_files_dir / actual_filename
            if labelstudio_file.is_file():
                try:
                    with open(labelstudio_file, "r", encoding="utf-8") as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading LabelStudio file {labelstudio_file}: {e}")
                else:
                    print(f"Using LabelStudio file: {labelstudio_file}")
                    return content

        print(f"Warning: Could not find text file for {filename}")
        return None

    def convert_to_bio(self, labelstudio_data: List[Dict]) -> List[Dict]:
        """Convert a LabelStudio export into a list of BIO examples.

        One example is produced per annotation of every task whose text file
        could be loaded (and is non-empty) and that has at least one
        annotation.

        Args:
            labelstudio_data: Parsed LabelStudio export (list of task dicts).

        Returns:
            List of dicts with ``tokens``, ``labels`` and ``text`` keys.
        """
        bio_examples = []

        for item in labelstudio_data:
            # The annotated file is referenced by the task's "file_upload" field.
            text_content = self.load_text_file(item.get("file_upload", ""))
            annotations = item.get("annotations") or []

            if not text_content or not annotations:
                continue

            for annotation in annotations:
                bio_example = self._create_bio_example(
                    text_content, annotation.get("result") or []
                )
                if bio_example:
                    bio_examples.append(bio_example)

        return bio_examples

    def _create_bio_example(self, text: str, annotations: List[Dict]) -> Optional[Dict]:
        """Project character-span annotations onto whitespace tokens.

        Example:
            text   = "sudija Babovic Dragan uz ucesce namjestenika suda Dragovic Katarine"
            labels = ["O", "B-JUDGE", "I-JUDGE", "O", "O", "O", "O",
                      "B-REGISTRAR", "I-REGISTRAR"]

        Args:
            text: Full document text the character offsets refer to.
            annotations: The ``result`` list of one LabelStudio annotation.
                Entries whose ``type`` is not ``"labels"`` or that lack
                ``start``, ``end`` or ``labels`` are ignored. When several
                labels are attached to one span only the first is used, and
                later spans overwrite earlier ones on shared tokens.

        Returns:
            Dict with ``tokens`` (``text.split()``), ``labels`` (one BIO tag
            per token) and the original ``text``.
        """
        tokens = text.split()
        labels = ["O"] * len(tokens)

        # Map every non-whitespace character offset to the index of its token.
        char_to_token = {}
        cursor = 0
        for token_idx, token in enumerate(tokens):
            # Tokens come from text.split(), so only whitespace lies between
            # the cursor and the next token: the search always succeeds.
            token_start = text.index(token, cursor)
            cursor = token_start + len(token)
            for char_pos in range(token_start, cursor):
                char_to_token[char_pos] = token_idx

        for annotation in annotations:
            if annotation.get("type") != "labels":
                continue

            value = annotation.get("value") or {}
            start = value.get("start")
            end = value.get("end")
            entity_labels = value.get("labels") or []

            if start is None or end is None or not entity_labels:
                continue

            entity_type = entity_labels[0]  # Take first label
            self.entity_types.add(entity_type)

            # Tokens touched by the span, in text order. Offsets outside the
            # text or on whitespace simply match no token.
            touched = [
                char_to_token[char_pos]
                for char_pos in range(start, min(end, len(text)))
                if char_pos in char_to_token
            ]
            if not touched:
                continue

            first_token, last_token = touched[0], touched[-1]
            labels[first_token] = f"B-{entity_type}"
            for token_idx in range(first_token + 1, last_token + 1):
                labels[token_idx] = f"I-{entity_type}"

        return {"tokens": tokens, "labels": labels, "text": text}


# Run the LabelStudio -> BIO conversion over the loaded annotation export.
converter = LabelStudioToBIOConverter(
    judgments_dir=JUDGMENTS_DIR, labelstudio_files_dir="labelstudio_files"
)
bio_examples = converter.convert_to_bio(labelstudio_data)

print(f"Converted {len(bio_examples)} examples to BIO format")
print(f"Found entity types: {sorted(converter.entity_types)}")

# Check if conversion was successful
if len(bio_examples) == 0:
    print("\n⚠️  WARNING: BIO scheme conversion did not produce any examples.")
    
# Persist the converted examples so they can be inspected outside the notebook.
import json
with open('bio_examples.json', 'w', encoding='utf-8') as f:
    json.dump(bio_examples, f, indent=2, ensure_ascii=False)



# Print every token/label pair of the first example as a visual spot check.
if bio_examples:
    print("\nSample BIO example:")
    sample = bio_examples[0]
    for i, (token, label) in enumerate(
        zip(sample["tokens"], sample["labels"])
    ):
        print(f"{i:2d}: {token:15s} -> {label}")
    # if len(sample["tokens"]) > 40:
    #     print("    ... (truncated)")

Using LabelStudio file: labelstudio_files\b420de30-deepseek_text_20250901_71ebae.txt
Using LabelStudio file: labelstudio_files\74c5911f-K.br._23517.txt
Using LabelStudio file: labelstudio_files\bf648348-K_1972012.txt
Using LabelStudio file: labelstudio_files\db519945-judgment_K_4_2015.txt
Using LabelStudio file: labelstudio_files\dfab3d86-judgment_K_1_2020.txt
Using LabelStudio file: labelstudio_files\5534cab7-judgment_K_959_2012.txt
Using LabelStudio file: labelstudio_files\68d18bb4-judgment_K_530_2020.txt
Using LabelStudio file: labelstudio_files\baf3de9e-judgment_K_530_2012.txt
Using LabelStudio file: labelstudio_files\cb49475e-judgment_K_224_2011.txt
Using LabelStudio file: labelstudio_files\e33d115d-judgment_K_190_2011.txt
Using LabelStudio file: labelstudio_files\2e9c5fc0-judgment_K_181_2013.txt
Using LabelStudio file: labelstudio_files\514043dc-judgment_K_169_2021.txt
Using LabelStudio file: labelstudio_files\9aeb9ab1-judgment_K_166_2011.txt
Using LabelStudio file: labelstudio_f

## 4. Label Encoding and Dataset Preparation

In [37]:
class NERDataset:
    """Hold BIO examples together with a label <-> integer ID vocabulary."""

    def __init__(self, bio_examples: List[Dict]):
        """Keep the examples and derive both label lookup tables from them."""
        self.examples = bio_examples
        self.label_to_id = self._create_label_mapping()
        self.id_to_label = {idx: name for name, idx in self.label_to_id.items()}

    def _create_label_mapping(self) -> Dict[str, int]:
        """Assign consecutive IDs to all labels in alphabetical order.

        'O' is always part of the vocabulary, even when no example uses it.
        """
        vocabulary = {'O'}.union(*(example['labels'] for example in self.examples))
        # Sorting keeps the ID assignment stable between runs.
        return {name: idx for idx, name in enumerate(sorted(vocabulary))}

    def encode_labels(self, labels: List[str]) -> List[int]:
        """Translate label strings into their integer IDs."""
        lookup = self.label_to_id
        return [lookup[name] for name in labels]

    def decode_labels(self, label_ids: List[int]) -> List[str]:
        """Translate integer IDs back into label strings."""
        lookup = self.id_to_label
        return [lookup[idx] for idx in label_ids]

    def get_num_labels(self) -> int:
        """Return the size of the label vocabulary."""
        return len(self.label_to_id)

    def prepare_for_training(self) -> List[Dict]:
        """Return copies of the examples with labels replaced by their IDs."""
        return [
            {
                'tokens': example['tokens'],
                'labels': self.encode_labels(example['labels']),
                'text': example['text'],
            }
            for example in self.examples
        ]

# Create dataset
ner_dataset = NERDataset(bio_examples)
prepared_examples = ner_dataset.prepare_for_training()

print(f"Number of unique labels: {ner_dataset.get_num_labels()}")
print(f"Label mapping: {ner_dataset.label_to_id}")
print(f"Prepared {len(prepared_examples)} examples for training")

Number of unique labels: 28
Label mapping: {'B-CASE_NUMBER': 0, 'B-COURT': 1, 'B-CRIMINAL_ACT': 2, 'B-DECISION_DATE': 3, 'B-DEFENDANT': 4, 'B-JUDGE': 5, 'B-PROCEDURE_COSTS': 6, 'B-PROSECUTOR': 7, 'B-PROVISION_MATERIAL': 8, 'B-PROVISION_PROCEDURAL': 9, 'B-REGISTRAR': 10, 'B-SANCTION_TYPE': 11, 'B-SANCTION_VALUE': 12, 'B-VERDICT': 13, 'I-COURT': 14, 'I-CRIMINAL_ACT': 15, 'I-DECISION_DATE': 16, 'I-DEFENDANT': 17, 'I-JUDGE': 18, 'I-PROCEDURE_COSTS': 19, 'I-PROSECUTOR': 20, 'I-PROVISION_MATERIAL': 21, 'I-PROVISION_PROCEDURAL': 22, 'I-REGISTRAR': 23, 'I-SANCTION_TYPE': 24, 'I-SANCTION_VALUE': 25, 'I-VERDICT': 26, 'O': 27}
Prepared 60 examples for training


## 5. Data Splitting and Tokenization

In [38]:
# 70% train, then the remaining 30% is halved into validation and test (15% each).
# NOTE(review): convert_to_bio emits one example per annotation, so a document
# with several annotations could end up in more than one split — confirm each
# document has a single annotation, or split by document instead.
train_examples, temp_examples = train_test_split(
    prepared_examples, test_size=0.3, random_state=42, shuffle=True
)
val_examples, test_examples = train_test_split(
    temp_examples, test_size=0.5, random_state=42, shuffle=True
)

print(f"Training examples: {len(train_examples)}")
print(f"Validation examples: {len(val_examples)}")
print(f"Test examples: {len(test_examples)}")

Training examples: 42
Validation examples: 9
Test examples: 9


## Smart Filtering: Reduce Class Imbalance

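Remove most examples that contain only 'O' labels while keeping some for model generalization. Note that each example here is a whole document, so with the current data every example contains at least one entity and this step leaves all three splits unchanged (see the output below).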

In [32]:
import random

print("🔍 APPLYING SMART FILTERING TO REDUCE CLASS IMBALANCE")
print("=" * 60)

# Make sure we know what O is
O_id = ner_dataset.label_to_id["O"]
print(f"'O' label ID: {O_id}")

def filter_all_O(examples, keep_ratio=0.2):
    """Keep every example containing an entity and a random share of the rest.

    An example counts as "only O" when all of its label IDs equal the
    module-level ``O_id``. Of those, ``keep_ratio`` (at least one example if
    any exist and the ratio is positive) are sampled with ``random.sample``
    and appended after the entity-bearing examples.
    """
    positive, negative = [], []
    for ex in examples:
        only_o = all(label_id == O_id for label_id in ex["labels"])
        (negative if only_o else positive).append(ex)

    print(f"Examples with entities: {len(positive):,} (keeping all)")
    print(f"Examples with only 'O': {len(negative):,}")

    # Downsample the 'O'-only examples; nothing is kept for a non-positive ratio.
    negative_sample = []
    if keep_ratio > 0 and len(negative) > 0:
        keep_n = max(1, int(len(negative) * keep_ratio))
        negative_sample = random.sample(negative, keep_n)

    return positive + negative_sample

# Set random seed for reproducibility
random.seed(42)

# Store original sizes for comparison
original_train_size = len(train_examples)
original_val_size = len(val_examples)
original_test_size = len(test_examples)

# Apply to all splits
train_examples = filter_all_O(train_examples, keep_ratio=0.2)
val_examples = filter_all_O(val_examples, keep_ratio=0.2)
test_examples = filter_all_O(test_examples, keep_ratio=0.2)

print(f"Train size after filtering: {len(train_examples)} (was {original_train_size})")
print(f"Val size after filtering: {len(val_examples)} (was {original_val_size})")
print(f"Test size after filtering: {len(test_examples)} (was {original_test_size})")

# Calculate reductions
train_reduction = (original_train_size - len(train_examples)) / original_train_size * 100
val_reduction = (original_val_size - len(val_examples)) / original_val_size * 100
test_reduction = (original_test_size - len(test_examples)) / original_test_size * 100

print(f"\n📊 Reductions:")
print(f"Training: {train_reduction:.1f}% reduction")
print(f"Validation: {val_reduction:.1f}% reduction")
print(f"Test: {test_reduction:.1f}% reduction")

print(f"\n✅ Smart filtering applied! Keeping 20% of 'O'-only examples for generalization.")

🔍 APPLYING SMART FILTERING TO REDUCE CLASS IMBALANCE
'O' label ID: 27
Examples with entities: 42 (keeping all)
Examples with only 'O': 0
Examples with entities: 9 (keeping all)
Examples with only 'O': 0
Examples with entities: 9 (keeping all)
Examples with only 'O': 0
Train size after filtering: 42 (was 42)
Val size after filtering: 9 (was 9)
Test size after filtering: 9 (was 9)

📊 Reductions:
Training: 0.0% reduction
Validation: 0.0% reduction
Test: 0.0% reduction

✅ Smart filtering applied! Keeping 20% of 'O'-only examples for generalization.


## Load Tokenizer

Continue with the existing pipeline using the filtered dataset.

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"\nLoaded tokenizer for {MODEL_NAME}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

### Sliding Window Implementation

- **No Truncation**: Long sequences are split into overlapping chunks
- **Context Preservation**: 128-token overlap maintains entity context
- **Complete Coverage**: Every part of every document is processed
- **Better Training**: Model learns from complete documents

The implementation follows the recommended 3-step process for BERT-based NER.

In [9]:
# Debug cell to check the label format
print("Checking label format in train_examples:")
print(f"Number of examples: {len(train_examples)}")

if train_examples:
    first_example = train_examples[0]
    print(f"First example keys: {first_example.keys()}")
    print(f"First few tokens: {first_example['tokens'][:10]}")
    print(f"First few labels: {first_example['labels'][:10]}")
    print(f"Label types: {[type(label) for label in first_example['labels'][:5]]}")
    
    # Check if labels are integers or strings
    if first_example['labels']:
        first_label = first_example['labels'][0]
        print(f"First label: {first_label} (type: {type(first_label)})")
        
        if isinstance(first_label, int):
            print("Labels are integers - need to convert back to strings")
            print(f"Label to ID mapping: {ner_dataset.label_to_id}")
            print(f"ID to label mapping: {ner_dataset.id_to_label}")
        else:
            print("Labels are strings - should work fine")


Checking label format in train_examples:
Number of examples: 42
First example keys: dict_keys(['tokens', 'labels', 'text'])
First few tokens: ['K.br.17/20', 'U', 'IME', 'CRNE', 'GORE', 'OSNOVNI', 'SUD', 'u', 'HERCEG', 'NOVOM,']
First few labels: [0, 27, 27, 27, 27, 1, 14, 14, 14, 14]
Label types: [<class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>]
First label: 0 (type: <class 'int'>)
Labels are integers - need to convert back to strings
Label to ID mapping: {'B-CASE_NUMBER': 0, 'B-COURT': 1, 'B-CRIMINAL_ACT': 2, 'B-DECISION_DATE': 3, 'B-DEFENDANT': 4, 'B-JUDGE': 5, 'B-PROCEDURE_COSTS': 6, 'B-PROSECUTOR': 7, 'B-PROVISION_MATERIAL': 8, 'B-PROVISION_PROCEDURAL': 9, 'B-REGISTRAR': 10, 'B-SANCTION_TYPE': 11, 'B-SANCTION_VALUE': 12, 'B-VERDICT': 13, 'I-COURT': 14, 'I-CRIMINAL_ACT': 15, 'I-DECISION_DATE': 16, 'I-DEFENDANT': 17, 'I-JUDGE': 18, 'I-PROCEDURE_COSTS': 19, 'I-PROSECUTOR': 20, 'I-PROVISION_MATERIAL': 21, 'I-PROVISION_PROCEDURAL': 22, 'I-REGISTRAR': 23, 'I-SAN

In [None]:
# Project-local helpers; their implementation lives in sliding_window_tokenizer.py.
from sliding_window_tokenizer import (
    tokenize_and_align_labels_with_sliding_window,
    print_sequence_analysis
)

# First, let's analyze the sequence lengths in our dataset
print("Analyzing sequence lengths before tokenization...")
print("\nTraining set:")
print_sequence_analysis(train_examples, tokenizer)
print("\nValidation set:")
print_sequence_analysis(val_examples, tokenizer)
print("\nTest set:")
print_sequence_analysis(test_examples, tokenizer)

print("\n" + "="*60)
print("TOKENIZING WITH SLIDING WINDOWS")
print("="*60)

# Use sliding window tokenization
# NOTE(review): the examples already carry integer label IDs (see
# NERDataset.prepare_for_training) and label_to_id is passed as well — confirm
# the helper expects IDs rather than label strings.
# NOTE(review): whether stride=128 means the overlap or the step between
# windows is defined by the helper — confirm it matches the documented
# 128-token overlap.
print("\nTraining set:")
train_tokenized = tokenize_and_align_labels_with_sliding_window(
    train_examples, tokenizer, ner_dataset.label_to_id, max_length=512, stride=128
)

print("\nValidation set:")
val_tokenized = tokenize_and_align_labels_with_sliding_window(
    val_examples, tokenizer, ner_dataset.label_to_id, max_length=512, stride=128
)

print("\nTest set:")
test_tokenized = tokenize_and_align_labels_with_sliding_window(
    test_examples, tokenizer, ner_dataset.label_to_id, max_length=512, stride=128
)

# Each document yields several chunks, so chunk counts exceed example counts.
print(f"\n" + "="*60)
print(f"FINAL TOKENIZED COUNTS:")
print(f"Training chunks: {len(train_tokenized)} (from {len(train_examples)} examples)")
print(f"Validation chunks: {len(val_tokenized)} (from {len(val_examples)} examples)")
print(f"Test chunks: {len(test_tokenized)} (from {len(test_examples)} examples)")
print(f"="*60)

Analyzing sequence lengths before tokenization...

Training set:
=== Sequence Length Analysis ===
Total sequences: 42
Min length: 581
Max length: 4007
Mean length: 942.3
Median length: 722
Sequences > 512 tokens: 42 (100.0%)
Sequences > 256 tokens: 42 (100.0%)

Length percentiles:
  50th percentile: 722 tokens
  75th percentile: 768 tokens
  90th percentile: 1704 tokens
  95th percentile: 1840 tokens
  99th percentile: 4007 tokens

Validation set:
=== Sequence Length Analysis ===
Total sequences: 9
Min length: 623
Max length: 1787
Mean length: 1055.9
Median length: 723
Sequences > 512 tokens: 9 (100.0%)
Sequences > 256 tokens: 9 (100.0%)

Length percentiles:
  50th percentile: 723 tokens
  75th percentile: 1487 tokens
  90th percentile: 1787 tokens
  95th percentile: 1787 tokens
  99th percentile: 1787 tokens

Test set:
=== Sequence Length Analysis ===
Total sequences: 9
Min length: 612
Max length: 3186
Mean length: 1179.7
Median length: 750
Sequences > 512 tokens: 9 (100.0%)
Sequences

## 6. Model Setup and Training Configuration

In [11]:
# Load pre-trained model
# A token-classification head sized to our label vocabulary is attached; the
# id2label/label2id mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=ner_dataset.get_num_labels(),
    id2label=ner_dataset.id_to_label,
    label2id=ner_dataset.label_to_id
)

print(f"Loaded model: {MODEL_NAME}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Number of labels: {ner_dataset.get_num_labels()}")

# Check if CUDA is available
# Fall back to CPU when no GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: classla/bcms-bertic
Model parameters: 110,048,284
Number of labels: 28
Using device: cpu


In [12]:
# Convert to HuggingFace datasets
# Each list holds one dict per tokenized sliding-window chunk.
train_dataset = HFDataset.from_list(train_tokenized)
val_dataset = HFDataset.from_list(val_tokenized)
test_dataset = HFDataset.from_list(test_tokenized)

# Data collator
# padding=True pads per batch and return_tensors="pt" yields PyTorch tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

print("Created HuggingFace datasets and data collator")

Created HuggingFace datasets and data collator


## 7. Evaluation Metrics

In [13]:
def compute_metrics(eval_pred):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the class with the
    highest score along axis 2 is taken as the prediction. Positions whose
    gold label is -100 are dropped from both sequences before scoring.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)
    id_to_label = ner_dataset.id_to_label

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real label (ignored index is -100).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([id_to_label[p] for p, _ in kept])
        true_labels.append([id_to_label[g] for _, g in kept])

    # Compute metrics using seqeval
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

print("Defined evaluation metrics function")

Defined evaluation metrics function


## 8. Training Configuration and Fine-tuning

In [14]:
%pip install transformers[torch]
%pip install accelerate -U

# Training arguments
# The tokenized training set is small: the notebook reports 205 chunks, i.e.
# about 13 optimizer steps per epoch at batch size 8 with 2 accumulation steps,
# roughly 65 steps over 5 epochs. Step-based schedules sized for large corpora
# (500 warm-up steps, evaluation/checkpoint every 200 steps) would therefore
# never complete warm-up and never evaluate, leaving early stopping and
# load_best_model_at_end without effect. Warm-up is expressed as a ratio and
# evaluation/checkpointing happen once per epoch instead.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,  # warm up over the first 10% of the actual training steps
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases name this `eval_strategy`.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    dataloader_pin_memory=False,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA available
    report_to="none"  # Disable wandb/tensorboard logging (None does not disable it)
)

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision: {training_args.fp16}")

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Training configuration:
  Epochs: 5
  Batch size: 8
  Learning rate: 2e-05
  Mixed precision: False


In [15]:
# Initialize trainer
# Wires together the model, the tokenized train/validation chunks, the padding
# collator and the seqeval-based metrics.
# NOTE(review): EarlyStoppingCallback reacts to evaluation results, so it only
# has an effect if the evaluation schedule in training_args actually fires
# during training — confirm against the number of training steps.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Initialized trainer with early stopping")
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Initialized trainer with early stopping
Training dataset size: 205
Validation dataset size: 51


In [None]:

# Fine-tune the model, then persist it.
print("Starting training...")
trainer.train()

print("Training completed!")

# Save the model via the trainer and the tokenizer explicitly to OUTPUT_DIR,
# the directory the inference pipeline is later created from.
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")

## 9. Model Evaluation

In [None]:
# Score the held-out test chunks with the same compute_metrics used in training.
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("\nTest Results:")
for key, value in test_results.items():
    # Only keys carrying the 'eval_' prefix are printed, with the prefix stripped.
    if key.startswith('eval_'):
        metric_name = key.replace('eval_', '')
        print(f"  {metric_name}: {value:.4f}")

In [None]:
# Detailed evaluation with per-entity metrics
def detailed_evaluation(trainer, dataset, dataset_name="Test"):
    """Print a token-level classification report and return the label sequences.

    Predictions are the argmax over axis 2 of ``trainer.predict(dataset)``.
    Positions whose gold label is -100 are skipped. The report is computed on
    the flattened tags, so ``B-`` and ``I-`` tags are listed as separate
    classes.

    Returns:
        ``(true_predictions, true_labels)``: per-sequence lists of predicted
        and gold label strings.
    """
    output = trainer.predict(dataset)
    predicted_ids = np.argmax(output.predictions, axis=2)
    gold_ids = output.label_ids
    id_to_label = ner_dataset.id_to_label

    # Convert to label strings, sequence by sequence.
    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([id_to_label[p] for p, _ in kept])
        true_labels.append([id_to_label[g] for _, g in kept])

    # Flatten for sklearn metrics
    flat_true, flat_pred = [], []
    for gold_seq, pred_seq in zip(true_labels, true_predictions):
        flat_true.extend(gold_seq)
        flat_pred.extend(pred_seq)

    print(f"\n{dataset_name} Set - Detailed Classification Report:")
    print(classification_report(flat_true, flat_pred, zero_division=0))

    return true_predictions, true_labels

# Run detailed evaluation
test_predictions, test_true_labels = detailed_evaluation(trainer, test_dataset, "Test")

## 10. Inference Pipeline

In [None]:
# Import the sliding window inference pipeline
# (project-local module; implementation lives in sliding_window_inference.py)
from sliding_window_inference import (
    SerbianLegalNERPipelineWithSlidingWindow,
    SerbianLegalNERPipeline  # Legacy version that uses sliding windows
)

# Create inference pipeline with sliding window support
# The pipeline is created from OUTPUT_DIR, where the fine-tuned model and
# tokenizer were saved, with the same max_length/stride used for training.
print("Creating inference pipeline with sliding window support...")
ner_pipeline = SerbianLegalNERPipelineWithSlidingWindow(
    OUTPUT_DIR, 
    max_length=512, 
    stride=128
)
print("Created sliding window inference pipeline")
print(f"Max length: {ner_pipeline.max_length}")
print(f"Stride: {ner_pipeline.stride}")
print(f"Device: {ner_pipeline.device}")

### Sliding Window Implementation Benefits

The sliding window approach provides several key advantages:

#### 🔧 **Technical Implementation**
1. **Step 1: Whitespace-based BIO tagging** - Already completed in our data preprocessing
2. **Step 2: WordPiece BIO conversion** - Proper label alignment where:
   - First subword inherits the original label (B- or I-)
   - Remaining subwords get I-<ENTITY> if inside entity, O if outside
3. **Step 3: Sliding window chunking** - Overlapping chunks with configurable stride

#### 📊 **Key Benefits**
- **No Information Loss**: Unlike truncation, we process the entire document
- **Context Preservation**: Overlapping windows maintain entity context across boundaries
- **Scalability**: Can handle documents of any length
- **Robust Inference**: Majority voting resolves conflicts in overlapping regions

#### ⚙️ **Configuration**
- **Max Length**: 512 tokens (BERT's limit)
- **Stride**: 128 tokens (25% overlap for context preservation)
- **Special Tokens**: [CLS] and [SEP] automatically handled

Let's analyze how this works with actual data:

### Sliding Window Analysis

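The next cell counts the WordPiece tokens in a sample judgment excerpt, reports whether it exceeds the model's maximum window (and, if so, roughly how many windows are needed), and then runs the inference pipeline on it: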

In [None]:
# Test with a long text to demonstrate sliding window functionality
long_test_text = """
OSNOVNI SUD U HERCEG NOVOM, po sudiji Leković Branislavu, 
u krivičnom predmetu protiv okrivljenog K.M., zbog krivičnog 
djela iz čl.220 st.1 KZ CG, donio je presudu dana 10.02.2015. godine.
Okrivljeni je kriv što je dana 15.01.2015. godine, oko 14:00 časova,
u Herceg Novom, u ulici Njegoševa br. 25, protivno odredbama 
Zakona o javnom redu i miru, narušio javni red i mir tako što je 
u stanju alkoholisanosti vikao i psovao na javnom mjestu.
Sud je okrivljenog K.M. oglasio krivim za krivično djelo narušavanje 
javnog reda i mira iz čl. 220 st. 1 KZ CG i osudio ga na novčanu 
kaznu u iznosu od 300,00 eura. Ako osuđeni ne plati novčanu kaznu 
u roku od 30 dana od dana pravosnažnosti presude, kazna će se 
zamijeniti kaznom zatvora u trajanju od 30 dana.
""".strip()

print("Testing sliding window inference with long text:")
print(f"Text length: {len(long_test_text)} characters")
print(f"Word count: {len(long_test_text.split())} words")

# Read the window settings from the pipeline instead of repeating the
# literals used to construct it, so this check stays correct if the pipeline
# is created with a different max_length or stride.
window_size = ner_pipeline.max_length
window_stride = ner_pipeline.stride

# Tokenize word by word to see how many WordPiece tokens this creates
wordpiece_tokens = []
for word in long_test_text.split():
    wordpiece_tokens.extend(ner_pipeline.tokenizer.tokenize(word))

# Two extra positions are taken by the [CLS] and [SEP] special tokens.
total_tokens = len(wordpiece_tokens) + 2

print(f"WordPiece tokens: {len(wordpiece_tokens)}")
print(f"With [CLS] and [SEP]: {total_tokens} tokens")

if total_tokens > window_size:
    print("\n✅ This text requires sliding windows!")
    # Rough estimate only: one full window plus one window per stride-sized
    # step of overflow.
    # NOTE(review): this treats stride as the step between windows — confirm
    # against sliding_window_inference.
    chunks_needed = (total_tokens - window_size) // window_stride + 2
    print(f"Estimated chunks needed: {chunks_needed}")
else:
    print("\n❌ This text fits in a single window")

# Run prediction
print("\nRunning prediction...")
entities = ner_pipeline.predict(long_test_text)

print(f"\nFound {len(entities)} entities:")
for entity in entities:
    print(f"  {entity['label']}: '{entity['text']}'")

if not entities:
    print("  No entities detected")

## 11. Testing the Pipeline

In [None]:
# Test with a sample Serbian legal text
# sample_text = """
# OSNOVNI SUD U HERCEG NOVOM, po sudiji Leković Branislavu, 
# u krivičnom predmetu protiv okrivljenog K.M., zbog krivičnog 
# djela iz čl.220 st.1 KZ CG, donio je presudu dana 10.02.2015. godine.
# """

# print("Testing pipeline with sample text:")
# print(f"Text: {sample_text.strip()}")
# print("\nPredicted entities:")

# entities = ner_pipeline.predict(sample_text)
# for entity in entities:
#     print(f"  {entity['label']}: '{entity['text']}'")

# if not entities:
#     print("  No entities detected")

In [None]:
# # Test with an actual judgment file
# judgment_file = "../judgments/judgment_K_4_2015.txt"

# if os.path.exists(judgment_file):
#     print(f"\nTesting with judgment file: {judgment_file}")
    
#     entities = ner_pipeline.predict_from_file(judgment_file)
    
#     print(f"\nFound {len(entities)} entities:")
    
#     # Group entities by type
#     entities_by_type = {}
#     for entity in entities:
#         entity_type = entity['label']
#         if entity_type not in entities_by_type:
#             entities_by_type[entity_type] = []
#         entities_by_type[entity_type].append(entity['text'])
    
#     for entity_type, texts in entities_by_type.items():
#         print(f"\n{entity_type}:")
#         for text in texts[:5]:  # Show first 5 entities of each type
#             print(f"  - {text}")
#         if len(texts) > 5:
#             print(f"  ... and {len(texts) - 5} more")
# else:
#     print(f"Judgment file not found: {judgment_file}")

## 12. Results Visualization

In [None]:
# Plot training history if available
def plot_training_history(trainer):
    """Plot loss and validation F1 curves from the trainer's log history.

    Reads ``trainer.state.log_history`` (an iterable of dicts). Entries that
    contain a ``'loss'`` key are plotted as training-loss points; entries that
    contain an ``'eval_loss'`` key are plotted as validation points, with
    ``'eval_f1'`` defaulting to 0 when absent. Each point is placed at the
    entry's ``'step'`` value.

    Prints "No training history available" and returns without creating a
    figure when the trainer state has no ``log_history`` attribute or the
    history is empty.

    Args:
        trainer: object whose ``state.log_history`` holds the logged metrics.
    """
    logs = getattr(trainer.state, 'log_history', None)
    if not logs:
        print("No training history available")
        return

    # Training and evaluation entries are logged at different steps, so each
    # series keeps its own x-axis values.
    train_steps, train_loss = [], []
    eval_steps, eval_loss, eval_f1 = [], [], []

    for log in logs:
        # A training point without a step cannot be placed on the x-axis.
        if 'loss' in log and 'step' in log:
            train_steps.append(log['step'])
            train_loss.append(log['loss'])
        if 'eval_loss' in log:
            eval_steps.append(log['step'])
            eval_loss.append(log['eval_loss'])
            eval_f1.append(log.get('eval_f1', 0))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Loss plot
    if train_loss:
        ax1.plot(train_steps, train_loss, label='Training Loss', color='blue')
    if eval_loss:
        ax1.plot(eval_steps, eval_loss, label='Validation Loss', color='red')
    ax1.set_title('Training and Validation Loss')
    ax1.set_xlabel('Steps')
    ax1.set_ylabel('Loss')
    # Only request a legend when a curve was drawn; an empty legend just
    # triggers a matplotlib warning.
    if train_loss or eval_loss:
        ax1.legend()
    ax1.grid(True)

    # F1 score plot
    if eval_f1:
        ax2.plot(eval_steps, eval_f1, label='Validation F1', color='green')
        ax2.legend()
    ax2.set_title('F1 Score')
    ax2.set_xlabel('Steps')
    ax2.set_ylabel('F1 Score')
    ax2.grid(True)

    plt.tight_layout()
    plt.show()

# Plot the curves from the metrics the trainer logged during training
plot_training_history(trainer)

In [None]:
# Plot entity distribution
def plot_entity_distribution(entity_counts):
    """Draw a bar chart with one bar per entity type.

    Args:
        entity_counts: mapping from entity type name to its count. When it is
            empty or otherwise falsy, a message is printed and nothing is
            drawn.
    """
    if not entity_counts:
        print("No entity counts available")
        return

    entities = list(entity_counts.keys())
    counts = list(entity_counts.values())

    plt.figure(figsize=(12, 6))
    bars = plt.bar(
        entities,
        counts,
        color='skyblue',
        edgecolor='navy',
        alpha=0.7,
    )

    # Write each count just above the top of its bar, centred horizontally.
    for rect, total in zip(bars, counts):
        label_x = rect.get_x() + rect.get_width()/2
        label_y = rect.get_height() + 0.5
        plt.text(label_x, label_y, str(total), ha='center', va='bottom')

    plt.title(
        'Distribution of Entity Types in Training Data',
        fontsize=14,
        fontweight='bold',
    )
    plt.xlabel('Entity Types', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    # Slanted tick labels keep long entity names from overlapping.
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Plot entity distribution (entity_counts is defined outside this cell)
plot_entity_distribution(entity_counts)

## 13. Model Saving and Export

In [None]:
# Gather the model configuration and run metadata that is stored next to the
# saved model.
model_info = dict(
    model_name=MODEL_NAME,
    num_labels=ner_dataset.get_num_labels(),
    label_to_id=ner_dataset.label_to_id,
    id_to_label=ner_dataset.id_to_label,
    entity_types=sorted(converter.entity_types),
    training_examples=len(train_examples),
    validation_examples=len(val_examples),
    test_examples=len(test_examples),
    # The evaluation cell may not have been run; store None in that case.
    test_results=locals().get('test_results'),
)

# Write the metadata as human-readable UTF-8 JSON.
with open(f"{OUTPUT_DIR}/model_info.json", 'w', encoding='utf-8') as info_file:
    info_file.write(json.dumps(model_info, indent=2, ensure_ascii=False))

# Closing summary, including a short usage hint.
closing_lines = (
    f"Model information saved to {OUTPUT_DIR}/model_info.json",
    "\nModel training and evaluation completed successfully!",
    f"\nFinal model location: {OUTPUT_DIR}",
    "\nTo use the model for inference:",
    f"pipeline = SerbianLegalNERPipeline('{OUTPUT_DIR}')",
    "entities = pipeline.predict('Your Serbian legal text here')",
)
for closing_line in closing_lines:
    print(closing_line)