# GLiNER Evaluation: Zero-Shot vs Few-Shot on Serbian Legal Documents

This notebook evaluates GLiNER (Generalist and Lightweight Named Entity Recognition) on 225 Serbian legal documents.

## 🎯 **Evaluation Flow:**
1. **Setup**: Install dependencies, define the entity labels, and load the ground truth, the manual few-shot examples, and the GLiNER model
2. **Zero-Shot Evaluation**: Run GLiNER without examples
3. **Few-Shot Evaluation**: Run GLiNER with manual examples
4. **Results Analysis**: Compare performance

## 🏷️ **Entity Types (Serbian):**
- `sud ili tribunal` → COURT
- `sudija ili pravosudni službenik` → JUDGE  
- `optuženi ili osoba na suđenju` → DEFENDANT
- `iznos ili trajanje kazne` → SANCTION_VALUE
- `broj predmeta ili identifikator slučaja` → CASE_NUMBER
- Plus 9 more entity types (14 in total; the full list is `LEGAL_ENTITY_TYPES` in the configuration cell)

In [None]:
# Install required packages
!pip install gliner seqeval scikit-learn matplotlib seaborn pandas tqdm

## 1. Setup: Define Everything Needed for GLiNER and Few-Shot

In [None]:
# Import required libraries
import json
import os
from pathlib import Path
from typing import List, Dict, Optional
from collections import Counter
import time

# Data processing
import pandas as pd
import numpy as np
from tqdm import tqdm

# GLiNER
from gliner import GLiNER

# Evaluation metrics
from sklearn.metrics import classification_report
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score
from seqeval.scheme import IOB2

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Fix NumPy's global RNG so NumPy-based sampling is repeatable
# (only NumPy is seeded here)
np.random.seed(42)

# Report which device is available for inference
import torch
if not torch.cuda.is_available():
    print("⚠️ GPU not available - will use CPU (slower)")
else:
    print(f"🚀 GPU available: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print("✅ All dependencies loaded successfully!")

In [None]:
# Input locations - update these paths to match your environment
LABELSTUDIO_JSON_PATH = "/content/drive/MyDrive/NER_Master/annotations.json"
JUDGMENTS_DIR = "/content/drive/MyDrive/NER_Master/judgments"
FEW_SHOT_EXAMPLES_DIR = "/content/drive/MyDrive/NER_Master/few_shot_examples"  # manual examples

# Threshold handed to GLiNER when predicting entities
CONFIDENCE_THRESHOLD = 0.3

# Serbian label given to GLiNER -> label name used in the ground truth.
# Insertion order defines the order of LEGAL_ENTITY_TYPES below.
GLINER_TO_GT_MAPPING = dict(
    [
        ("sud ili tribunal", "COURT"),
        ("datum presude ili odluke", "DECISION_DATE"),
        ("broj predmeta ili identifikator slučaja", "CASE_NUMBER"),
        ("krivično delo ili prestup", "CRIMINAL_ACT"),
        ("tužilac ili javni tužilac", "PROSECUTOR"),
        ("optuženi ili osoba na suđenju", "DEFENDANT"),
        ("sudija ili pravosudni službenik", "JUDGE"),
        ("sudski zapisničar ili službenik", "REGISTRAR"),
        ("sudska presuda ili odluka", "VERDICT"),
        ("vrsta kazne ili sankcije", "SANCTION_TYPE"),
        ("iznos ili trajanje kazne", "SANCTION_VALUE"),
        ("materijalna pravna odredba ili član", "PROVISION_MATERIAL"),
        ("procesna pravna odredba ili član", "PROVISION_PROCEDURAL"),
        ("troskovi ili takse sudskog postupka", "PROCEDURE_COSTS"),
    ]
)

# Serbian legal entity types for GLiNER: exactly the keys of the mapping above
LEGAL_ENTITY_TYPES = list(GLINER_TO_GT_MAPPING)

print(f"🎯 Entity types: {len(LEGAL_ENTITY_TYPES)}")
print(f"⚡ Confidence threshold: {CONFIDENCE_THRESHOLD}")

In [None]:
# Add the working directory to the module search path.
# NOTE(review): nothing is imported from the working directory in the visible
# cells - the manual annotations are defined inline below. This looks like a
# leftover from an earlier import-based version; confirm before removing.
import sys

sys.path.append(".")


# Hand-labelled few-shot examples.
#   outer key : file name of an example text (resolved against
#               FEW_SHOT_EXAMPLES_DIR by load_manual_few_shot_examples)
#   inner key : Serbian GLiNER label (the same strings as LEGAL_ENTITY_TYPES)
#   value     : surface strings annotated for that label; an empty list means
#               the label has no annotated mention in that example
# Several entries list more than one surface form of the same entity
# (upper/lower case, or a differently inflected name).
MANUAL_ANNOTATIONS = {
    "example_01_theft.txt": {
        "sud ili tribunal": ["Osnovni sud u Herceg Novom"],
        "datum presude ili odluke": ["30.12.2019."],
        "broj predmeta ili identifikator slučaja": ["K. br. 245/23"],
        "krivično delo ili prestup": ["krađe"],
        "tužilac ili javni tužilac": ["Dragana Milovanovića"],
        "optuženi ili osoba na suđenju": ["M.P."],
        "sudija ili pravosudni službenik": ["Marija Nikolić"],
        "sudski zapisničar ili službenik": ["Ane Stojanović"],
        "sudska presuda ili odluka": ["USLOVNU OSUDU"],
        "vrsta kazne ili sankcije": ["kaznu zatvora"],
        "iznos ili trajanje kazne": ["6 mjeseci"],
        "materijalna pravna odredba ili član": [
            "člana 344. stav 1. Krivičnog zakonika"
        ],
        "procesna pravna odredba ili član": [
            "članu 434. Zakonika o krivičnom postupku",
            "člana 261. Zakonika o krivičnom postupku",
        ],
        "troskovi ili takse sudskog postupka": ["40€"],
    },
    "example_02_assault.txt": {
        "sud ili tribunal": ["Viši sud u Podgorici"],
        "datum presude ili odluke": ["16.05.2019."],
        "broj predmeta ili identifikator slučaja": ["Kž. 1567/22"],
        "krivično delo ili prestup": ["nasilničko ponašanje"],
        "tužilac ili javni tužilac": ["Milana Đorđevića"],
        "optuženi ili osoba na suđenju": ["S.M."],
        "sudija ili pravosudni službenik": [
            "Aleksandar Jovanović",
            "Milica Radović",
            "Petar Stanković",
        ],
        "sudski zapisničar ili službenik": ["Jovane Mitrović"],
        "sudska presuda ili odluka": ["OSLOBAĐA SE OD OPTUŽBE"],
        "vrsta kazne ili sankcije": [],
        "iznos ili trajanje kazne": [],
        "materijalna pravna odredba ili član": [
            "člana 220. stav 1. Krivičnog zakonika"
        ],
        "procesna pravna odredba ili član": [
            "članu 434. Zakonika o krivičnom postupku"
        ],
        "troskovi ili takse sudskog postupka": [],
    },
    "example_03_fraud.txt": {
        "sud ili tribunal": ["Osnovni sud u Nikšiću", "OSNOVNI SUD U NIKŠIĆU"],
        "datum presude ili odluke": ["28. juna 2023. godine"],
        "broj predmeta ili identifikator slučaja": ["K. br. 567/23"],
        "krivično delo ili prestup": ["prevare"],
        "tužilac ili javni tužilac": ["Srđana Petrovića"],
        "optuženi ili osoba na suđenju": ["A.S."],
        "sudija ili pravosudni službenik": ["Jelena Milosavljević"],
        "sudski zapisničar ili službenik": [
            "Nemanje Stojanovića",
            "Nemanja Stojanović",
        ],
        "sudska presuda ili odluka": ["O S U Đ U J E"],
        "vrsta kazne ili sankcije": ["kaznu zatvora", "uslovnu osudu"],
        "iznos ili trajanje kazne": ["8 mjeseci", "sa rokom kušnje od 2 godine"],
        "materijalna pravna odredba ili član": [
            "člana 208. stav 1. Krivičnog zakonika"
        ],
        "procesna pravna odredba ili član": [],
        "troskovi ili takse sudskog postupka": ["120€"],
    },
    "example_04_traffic.txt": {
        "sud ili tribunal": [
            "PREKRŠAJNI SUD U PODGORICI",
            "Prekršajni sud u Podgorici",
        ],
        "datum presude ili odluke": ["10. maja 2023. godine"],
        "broj predmeta ili identifikator slučaja": ["Pr. br. 3456/23"],
        "krivično delo ili prestup": ["prekršaj"],
        "tužilac ili javni tužilac": [],
        "optuženi ili osoba na suđenju": ["V.M."],
        "sudija ili pravosudni službenik": ["Marko Đorđević"],
        "sudski zapisničar ili službenik": ["Tanje Nikolić"],
        "sudska presuda ili odluka": ["P R E S U D U"],
        "vrsta kazne ili sankcije": [
            "novčanom kaznom",
            "ZAŠTITNA MERA zabrane upravljanja motornim vozilom",
        ],
        "iznos ili trajanje kazne": ["80€", "u trajanju od 8 mjeseci"],
        "materijalna pravna odredba ili član": [
            "člana 330. stav 1. tačka 3. Zakona o bezbjednosti saobraćaja na putevima"
        ],
        "procesna pravna odredba ili član": ["članu 175. Zakona o prekršajima"],
        "troskovi ili takse sudskog postupka": ["30€"],
    },
    "example_05_drug_possession.txt": {
        "sud ili tribunal": ["Osnovni sud u Baru"],
        "datum presude ili odluke": ["20.10.2023."],
        "broj predmeta ili identifikator slučaja": ["K. br. 567/23"],
        "krivično delo ili prestup": [
            "neovlašćena proizvodnja i stavljanje u promet opojnih droga"
        ],
        "tužilac ili javni tužilac": [
            "Marije Stanković"
        ],
        "optuženi ili osoba na suđenju": ["N.R."],
        "sudija ili pravosudni službenik": ["Ana Popović"],
        "sudski zapisničar ili službenik": ["Milan Jovanović"],
        "sudska presuda ili odluka": ["USLOVNU OSUDU"],
        "vrsta kazne ili sankcije": ["kaznu zatvora", "Oduzima se predmet"],
        # NOTE(review): "marihuana" is listed under the sanction amount/duration
        # label - confirm this annotation is intended.
        "iznos ili trajanje kazne": ["6 mjeseci", "marihuana"],
        "materijalna pravna odredba ili član": [
            "člana 246a stav 1. Krivičnog zakonika"
        ],
        "procesna pravna odredba ili član": [
            "članu 423. Zakonika o krivičnom postupku"
        ],
        "troskovi ili takse sudskog postupka": ["80€"],
    },
    "example_06_domestic_violence.txt": {
        "sud ili tribunal": ["OSNOVNI SUD U PLJEVLJIMA", "Osnovni sud u Pljevljima"],
        "datum presude ili odluke": ["25. 01. 2017."],
        "broj predmeta ili identifikator slučaja": ["K. br. 789/23"],
        "krivično delo ili prestup": ["nasilje u porodici"],
        "tužilac ili javni tužilac": ["Dragice Nikolić"],
        "optuženi ili osoba na suđenju": ["Z.M."],
        "sudija ili pravosudni službenik": ["Gordana Milić"],
        "sudski zapisničar ili službenik": ["Jovana Petrovića", "Jovan Petrović"],
        "sudska presuda ili odluka": ["O S U Đ U J E"],
        "vrsta kazne ili sankcije": ["kaznu zatvora", "ZAŠTITNA MERA zabrane prilaska"],
        "iznos ili trajanje kazne": ["10 mjeseci", "u trajanju od 1 godine"],
        "materijalna pravna odredba ili član": [
            "člana 194. stav 2. Krivičnog zakonika CG"
        ],
        "procesna pravna odredba ili član": [],
        "troskovi ili takse sudskog postupka": ["150€"],
    },
    "example_07_embezzlement.txt": {
        "sud ili tribunal": ["VIŠI SUD U PODGORICI", "Viši sud u Podgorici"],
        "datum presude ili odluke": ["09.12.2020"],
        "broj predmeta ili identifikator slučaja": ["K. br. 234/22"],
        "krivično delo ili prestup": ["pronevjere"],
        "tužilac ili javni tužilac": ["Aleksandra Milovanovića"],
        "optuženi ili osoba na suđenju": ["M.P."],
        "sudija ili pravosudni službenik": [
            "Milan Stojanović",
        ],
        "sudski zapisničar ili službenik": ["Milice Jovanović", "Milica Jovanović"],
        "sudska presuda ili odluka": ["O S U Đ U J E"],
        "vrsta kazne ili sankcije": ["kaznu zatvora"],
        "iznos ili trajanje kazne": ["3 godine"],
        "materijalna pravna odredba ili član": [
            "člana 364. stav 3. Krivičnog zakonika"
        ],
        "procesna pravna odredba ili član": ["čl. 363 st. 1 tač. 3 ZKP-a"],
        "troskovi ili takse sudskog postupka": ["450€"],
    },
    "example_08_tax_evasion.txt": {
        "sud ili tribunal": ["OSNOVNI SUD U ROŽAJAMA", "Osnovni sud u Rožajama"],
        "datum presude ili odluke": ["25. 08. 2023."],
        "broj predmeta ili identifikator slučaja": ["K. br. 445/23"],
        "krivično delo ili prestup": ["poreska utaja"],
        "tužilac ili javni tužilac": ["Jovane Petrović"],
        "optuženi ili osoba na suđenju": ["R.Đ."],
        "sudija ili pravosudni službenik": ["Milena Stanković"],
        "sudski zapisničar ili službenik": ["Nemanje Milovanovića", "Nemanja Milovanović"],
        "sudska presuda ili odluka": ["USLOVNU OSUDU"],
        "vrsta kazne ili sankcije": ["kaznu zatvora"],
        "iznos ili trajanje kazne": ["1 godine i 6 mjeseci", "u roku od 3 godine"],
        "materijalna pravna odredba ili član": [
            "člana 229. stav 2. Krivičnog zakonika",
            "čl. 4 st. 2, čl. 5, čl. 13, čl.15, čl. 42 st. 1, čl. 52 st. 2, čl. 53 i čl. 54 Krivičnog zakonika Crne Gore"
        ],
        "procesna pravna odredba ili član": ["čl. 226, čl. 229 i čl. 374 Zakonika o krivičnom postupku"],
        "troskovi ili takse sudskog postupka": ["250€"],
    },
    "example_09_robbery.txt": {
        "sud ili tribunal": ["OSNOVNI SUD U BIJELO POLJU", "Osnovni sud u Bijelo Polju"],
        "datum presude ili odluke": ["06.09.2011."],
        "broj predmeta ili identifikator slučaja": ["K. br. 678/23"],
        "krivično delo ili prestup": ["razbojništva"],
        "tužilac ili javni tužilac": ["Milice Đorđević"],
        "optuženi ili osoba na suđenju": ["M.J.", "S.N."],
        "sudija ili pravosudni službenik": ["Bojan Marković"],
        "sudski zapisničar ili službenik": ["Ane Milenković", "Ana Milenković"],
        "sudska presuda ili odluka": ["O S U Đ U J E"],
        "vrsta kazne ili sankcije": ["kaznu zatvora"],
        "iznos ili trajanje kazne": ["2 godine", "1 godine i 8 mjeseci"],
        "materijalna pravna odredba ili član": [
            "člana 206. stav 1. Krivičnog zakonika",
            "čl.3, 4, 5, 13, 16, 32, 42, 45, 46, 52, 53 i 54 KZ CG"
        ],
        "procesna pravna odredba ili član": ["čl. 226 st.3 i čl.374 ZKP-a"],
        "troskovi ili takse sudskog postupka": ["300€"],
    },
    "example_10_corruption.txt": {
        "sud ili tribunal": ["OSNOVNI SUD U BERANAMA", "Osnovni sud u Beranama"],
        "datum presude ili odluke": ["01.04.2024."],
        "broj predmeta ili identifikator slučaja": ["K. br. 123/22"],
        "krivično delo ili prestup": ["primanja mita"],
        "tužilac ili javni tužilac": ["Nikole Samardžića"],
        "optuženi ili osoba na suđenju": ["V.S."],
        "sudija ili pravosudni službenik": [
            "Vesna Gazdić"
        ],
        "sudski zapisničar ili službenik": ["Ristić Katarina", "Katarine Ristić"],
        "sudska presuda ili odluka": ["O S U Đ U J E"],
        "vrsta kazne ili sankcije": [
            "kaznu zatvora",
            "SPOREDNA KAZNA zabrane vršenja javne funkcije",
        ],
        "iznos ili trajanje kazne": ["4 godine", "u trajanju od 3 godine"],
        "materijalna pravna odredba ili član": [
            "čl.327 st.4 u vezi st.1 Krivičnog Zakonika CG",
            "čl. 2, čl. 3, čl. 4 st. 2, čl. 5, čl. 13 st. 1, čl. 15, čl. 32, čl. 36, čl. 42, čl. 51 st. 1 Krivičnog zakonika Crne Gore"
        ],
        "procesna pravna odredba ili član": ["čl. 226, 229 i 374 Zakonika o krivičnom postupku"],
        "troskovi ili takse sudskog postupka": ["600€"],
    },
}

# Summarise how well the manual examples cover the label set
print(f"📚 Manual annotations loaded for {len(MANUAL_ANNOTATIONS)} examples")

# For each label, the number of examples that contain at least one mention of it
entity_type_coverage = Counter(
    entity_type
    for annotations in MANUAL_ANNOTATIONS.values()
    for entity_type, entities in annotations.items()
    if entities  # empty lists do not count as coverage
)

print(f"\n🏷️ Entity Type Coverage Across {len(MANUAL_ANNOTATIONS)} Examples:")
for entity_type in LEGAL_ENTITY_TYPES:
    count = entity_type_coverage.get(entity_type, 0)
    coverage_pct = (count / len(MANUAL_ANNOTATIONS)) * 100
    print(
        f"  {entity_type}: {count}/{len(MANUAL_ANNOTATIONS)} examples ({coverage_pct:.0f}%)"
    )

# Labels that never appear with a non-empty list in any example
missing_types = [
    et for et in LEGAL_ENTITY_TYPES if not entity_type_coverage.get(et, 0)
]
if not missing_types:
    print(
        f"\n✅ All {len(LEGAL_ENTITY_TYPES)} entity types are covered in the manual examples!"
    )
else:
    print(f"\n⚠️ Missing entity types: {missing_types}")

In [None]:
def load_ground_truth_data():
    """Load ground-truth entity spans from the LabelStudio export.

    Reads the JSON export at ``LABELSTUDIO_JSON_PATH``. For every task, the
    judgment text is read from ``JUDGMENTS_DIR`` (matched on the last path
    component of the task's ``file_upload`` value) and every ``labels`` result
    is turned into an entity dict whose label is the Serbian GLiNER label
    (reverse lookup through ``GLINER_TO_GT_MAPPING``).

    The judgment text is ``strip()``-ed before the export's character offsets
    are applied. NOTE(review): if a judgment file starts with whitespace the
    offsets would be shifted; spans whose slice disagrees with the span text
    stored in the export (when present) are counted and reported so that such
    a misalignment is visible.

    Returns:
        list[dict]: one ``{"text", "entities", "file_path"}`` dict per
        annotation that has at least one mapped entity. Each entity is
        ``{"text", "label", "start", "end", "bio_label"}``. An empty list is
        returned when the export file does not exist.
    """
    print("📂 Loading LabelStudio annotations...")

    # Ground-truth label (e.g. "COURT") -> Serbian label used with GLiNER
    label_to_serbian = {label: serbian_label for serbian_label, label in GLINER_TO_GT_MAPPING.items()}
    print(f"🔄 Created mapping for {len(label_to_serbian)} entity types")

    try:
        with open(LABELSTUDIO_JSON_PATH, 'r', encoding='utf-8') as f:
            labelstudio_data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: {LABELSTUDIO_JSON_PATH} not found!")
        return []
    print(f"✅ Loaded {len(labelstudio_data)} annotated documents")

    ground_truth_examples = []
    entity_types = set()
    bio_labels_found = set()
    unknown_labels = Counter()  # label -> number of skipped spans
    missing_files = []          # tasks whose text file could not be located
    misaligned_spans = 0        # slices that disagree with the exported span text

    for item in tqdm(labelstudio_data, desc="Loading ground truth"):
        # `or ""` also covers an explicit null in the export
        file_path = item.get("file_upload") or ""

        # Keep only the last path component; the texts live flat in JUDGMENTS_DIR
        filename = file_path.split("/")[-1]
        full_path = Path(JUDGMENTS_DIR) / filename

        if not full_path.exists():
            # Remember the miss instead of dropping the document silently
            missing_files.append(filename)
            continue

        try:
            with open(full_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            print(f"❌ Error reading {full_path}: {e}")
            continue

        # Each annotation of a task becomes its own example
        for annotation in item.get("annotations", []):
            entities = []

            for res in annotation.get("result", []):
                if res.get("type") != "labels":
                    continue

                value = res["value"]
                start = value["start"]
                end = value["end"]
                span_text = text_content[start:end]

                # The export may carry the annotated text; use it to detect
                # offsets that do not line up with the file on disk.
                exported_text = value.get("text")
                if exported_text is not None and exported_text != span_text:
                    misaligned_spans += 1

                for bio_label in value["labels"]:
                    bio_labels_found.add(bio_label)

                    serbian_label = label_to_serbian.get(bio_label)
                    if not serbian_label:
                        unknown_labels[bio_label] += 1
                        continue

                    entity_types.add(serbian_label)
                    entities.append({
                        'text': span_text,
                        'label': serbian_label,  # Serbian label, as used for prediction
                        'start': start,
                        'end': end,
                        'bio_label': bio_label  # original export label, kept for reference
                    })

            if entities:
                ground_truth_examples.append({
                    'text': text_content,
                    'entities': entities,
                    'file_path': file_path
                })

    print(f"✅ Loaded {len(ground_truth_examples)} examples with ground truth entities")
    print(f"🏷️ Found Serbian entity types: {sorted(entity_types)}")
    print(f"🔤 Found BIO labels: {sorted(bio_labels_found)}")

    # Data-quality warnings, reported once after the progress bar has finished
    if missing_files:
        print(f"⚠️ Warning: skipped {len(missing_files)} documents with no text file in {JUDGMENTS_DIR}")
    for bio_label, skipped in unknown_labels.most_common():
        print(f"⚠️ Warning: Unknown BIO label '{bio_label}' - skipped {skipped} spans")
    if misaligned_spans:
        print(
            f"⚠️ Warning: {misaligned_spans} spans differ from the text stored in the export - "
            "offsets may be misaligned with the judgment files"
        )

    # Entity distribution over all loaded examples
    entity_counts = Counter(
        entity['label']
        for example in ground_truth_examples
        for entity in example['entities']
    )

    print(f"\n📊 Ground Truth Statistics:")
    print(f"  📄 Total examples: {len(ground_truth_examples)}")
    print(f"  🏷️ Entity types: {len(entity_types)}")
    print(f"\n📈 Entity Distribution:")
    for entity_type, count in entity_counts.most_common():
        print(f"  {entity_type}: {count}")

    return ground_truth_examples

# Load the ground truth when this cell runs; the result is an empty list
# when the LabelStudio export file is not found
ground_truth_examples = load_ground_truth_data()

In [None]:
def load_manual_few_shot_examples():
    """Build few-shot examples from ``MANUAL_ANNOTATIONS`` and the example files.

    For every annotated file name, the text is read from
    ``FEW_SHOT_EXAMPLES_DIR`` and each annotated string is located with
    ``str.find`` (first occurrence only). Files that are missing or unreadable
    are skipped with a printed message; strings that are not found in the text
    are reported and left out.

    Returns:
        list[dict]: ``{"text": str, "entities": [{"start", "end", "label"}]}``
        per loaded file. ``label`` is the value that ``GLINER_TO_GT_MAPPING``
        holds for the Serbian annotation key.
    """
    loaded = []

    for filename, annotations in MANUAL_ANNOTATIONS.items():
        source = Path(FEW_SHOT_EXAMPLES_DIR) / filename

        if not source.exists():
            print(f"⚠️ Warning: {source} not found, skipping...")
            continue

        try:
            with open(source, 'r', encoding='utf-8') as handle:
                text = handle.read().strip()
        except Exception as e:
            print(f"❌ Error reading {source}: {e}")
            continue

        # Turn every annotated surface string into a character span
        spans = []
        for label, entity_texts in annotations.items():
            mapped_label = GLINER_TO_GT_MAPPING.get(label)

            # filter(None, ...) drops empty strings
            for surface in filter(None, entity_texts):
                offset = text.find(surface)
                if offset == -1:
                    print(f"⚠️ Warning: Entity '{surface}' not found in {filename}")
                    continue
                spans.append({
                    "start": offset,
                    "end": offset + len(surface),
                    "label": mapped_label
                })

        loaded.append({"text": text, "entities": spans})
        print(f"✅ {filename}: {len(spans)} entities found")

    print(f"📚 Loaded {len(loaded)} manual few-shot examples in GLiNER format")
    return loaded

# Build the few-shot examples when this cell runs; files that are missing from
# FEW_SHOT_EXAMPLES_DIR are skipped, so the list may be shorter than MANUAL_ANNOTATIONS
manual_examples = load_manual_few_shot_examples()

In [None]:
class GLiNEREvaluator:
    """Thin wrapper around a GLiNER model for Serbian legal documents.

    Attributes (always set, even when loading the model fails):
        model: the loaded GLiNER model, or ``None`` if loading failed.
        model_name: the model identifier passed to the constructor.
        confidence_threshold: threshold forwarded to every prediction call.
    """

    def __init__(self, model_name: str = "knowledgator/gliner-bi-large-v1.0", confidence_threshold: float = 0.3):
        """Load ``model_name`` and move it to the GPU when one is available.

        A loading failure is reported with a printed message and leaves
        ``self.model`` as ``None`` instead of raising.
        """
        print(f"🌟 Initializing GLiNER Evaluator: {model_name}")

        # Record the configuration first so these attributes exist even when
        # loading fails (callers read them for reporting).
        self.model_name = model_name
        self.confidence_threshold = confidence_threshold
        self.model = None

        try:
            model = GLiNER.from_pretrained(model_name)

            # Enable GPU if available
            if torch.cuda.is_available():
                model = model.cuda()
                print(f"🚀 GPU enabled: {torch.cuda.get_device_name(0)}")
            else:
                print("⚠️ GPU not available, using CPU")
        except Exception as e:
            print(f"❌ Error loading GLiNER model {model_name}: {e}")
            return

        self.model = model
        print(f"✅ GLiNER model loaded successfully")
        print(f"🎯 Confidence threshold: {confidence_threshold}")

    def predict_entities(self, text: str, entity_types: List[str], examples: Optional[List] = None) -> List[Dict]:
        """Predict entities in ``text`` for the given labels.

        Args:
            text: document text.
            entity_types: labels handed to the model.
            examples: optional few-shot examples; when non-empty they are
                forwarded as ``few_shot_examples`` (dicts with ``text`` and
                ``entities`` as built by ``load_manual_few_shot_examples``).

        Returns:
            Entity dicts (``text``, ``label``, ``start``, ``end``,
            ``confidence``, ``method``, ``model``) sorted by ``start``.
            An empty list when no model is loaded or the prediction call
            raises (the error is printed).
        """
        if self.model is None:
            return []

        method = "gliner_few_shot" if examples else "gliner_zero_shot"

        # Only the few-shot path passes the extra keyword.
        # NOTE(review): confirm that the installed gliner version accepts
        # `few_shot_examples`; if it does not, the call raises, the error
        # below is printed and [] is returned for every document.
        extra_kwargs = {"few_shot_examples": examples} if examples else {}

        try:
            entities = self.model.predict_entities(
                text,
                labels=entity_types,
                threshold=self.confidence_threshold,
                **extra_kwargs
            )

            # Convert to our format
            formatted_entities = [
                {
                    "text": entity["text"],
                    "label": entity["label"],
                    "start": entity["start"],
                    "end": entity["end"],
                    "confidence": entity["score"],
                    "method": method,
                    "model": self.model_name
                }
                for entity in entities
            ]

            return sorted(formatted_entities, key=lambda x: x["start"])

        except Exception as e:
            print(f"❌ Error in GLiNER prediction: {e}")
            return []

# Create the shared evaluator (default model name, threshold from the configuration cell).
# If loading fails, the constructor prints the error and leaves `model` as None.
gliner_evaluator = GLiNEREvaluator(confidence_threshold=CONFIDENCE_THRESHOLD)

## 2. Zero-Shot Evaluation

In [None]:
def evaluate_gliner(examples, entity_types, few_shot_examples=None, method_name="GLiNER"):
    """Run the shared ``gliner_evaluator`` over ``examples`` and collect statistics.

    Args:
        examples: ground-truth dicts with ``text``, ``entities`` and ``file_path``.
        entity_types: labels handed to the model.
        few_shot_examples: optional examples forwarded to every prediction call.
        method_name: name used in progress output and stored in the result.

    Returns:
        ``{"error": ...}`` when no model is loaded; otherwise a dict with the
        per-document predictions, entity counts per label, average confidence
        and timing. Only counts are gathered here; predictions are not scored
        against the ground truth in this function.
    """
    print(f"\n🧪 Starting {method_name} Evaluation")
    print("=" * 60)

    evaluator = gliner_evaluator
    if evaluator.model is None:
        return {"error": "GLiNER model not loaded"}

    per_document = []
    label_tally = Counter()
    scores = []

    print(f"📊 Evaluating on {len(examples)} examples...")
    started = time.time()

    for idx, doc in enumerate(tqdm(examples, desc=f"{method_name} Evaluation")):
        full_text = doc["text"]
        gold = doc["entities"]

        predicted = evaluator.predict_entities(full_text, entity_types, examples=few_shot_examples)

        # Tally labels and confidences of everything the model returned
        label_tally.update(p["label"] for p in predicted)
        scores.extend(p["confidence"] for p in predicted)

        # Keep only a short preview of long documents in the stored results
        if len(full_text) > 200:
            preview = full_text[:200] + "..."
        else:
            preview = full_text

        per_document.append({
            "example_id": idx,
            "text": preview,
            "file_path": doc["file_path"],
            "true_entities": gold,
            "pred_entities": predicted,
            "true_count": len(gold),
            "pred_count": len(predicted)
        })

    elapsed = time.time() - started

    # Aggregate counts over all documents
    total_true = sum(entry["true_count"] for entry in per_document)
    total_pred = sum(entry["pred_count"] for entry in per_document)
    avg_confidence = np.mean(scores) if scores else 0.0

    print(f"\n📊 {method_name} Prediction Statistics:")
    for label, count in label_tally.most_common():
        print(f"  {label}: {count}")

    throughput = total_pred / elapsed if elapsed > 0 else 0
    results = {
        "method": method_name,
        "model_name": evaluator.model_name,
        "confidence_threshold": evaluator.confidence_threshold,
        "detailed_results": per_document,
        "total_true_entities": total_true,
        "total_pred_entities": total_pred,
        "prediction_counts": dict(label_tally),
        "examples_evaluated": len(examples),
        "avg_confidence": avg_confidence,
        "evaluation_time": elapsed,
        "entities_per_second": throughput
    }

    print(f"✅ {method_name} evaluation complete!")
    print(f"  📊 True entities: {total_true}")
    print(f"  🤖 Predicted entities: {total_pred}")
    print(f"  ⚡ Average confidence: {avg_confidence:.3f}")
    print(f"  ⏱️ Evaluation time: {elapsed:.2f}s")
    print(f"  🚀 Entities/second: {results['entities_per_second']:.2f}")

    return results

# Baseline run: no few-shot examples are supplied (few_shot_examples keeps its default of None)
zero_shot_results = evaluate_gliner(
    examples=ground_truth_examples,
    entity_types=LEGAL_ENTITY_TYPES,
    method_name="Zero-Shot GLiNER",
)

## 3. Few-Shot Evaluation

In [None]:
# Sanity check: verify that supplying few-shot examples changes the predictions
if len(manual_examples) > 0 and len(ground_truth_examples) > 0:
    test_text = ground_truth_examples[0]["text"]

    # Zero-shot prediction
    zero_shot_pred = gliner_evaluator.predict_entities(test_text, LEGAL_ENTITY_TYPES)

    # Few-shot prediction
    few_shot_pred = gliner_evaluator.predict_entities(test_text, LEGAL_ENTITY_TYPES, examples=manual_examples)

    # Every prediction dict carries a "method" tag that always differs between
    # the two calls, so comparing the raw lists would report a difference even
    # for identical predictions. Compare everything except that tag.
    zero_shot_core = [{k: v for k, v in e.items() if k != "method"} for e in zero_shot_pred]
    few_shot_core = [{k: v for k, v in e.items() if k != "method"} for e in few_shot_pred]

    if zero_shot_pred and not few_shot_pred:
        # predict_entities returns [] when the underlying call raises, so an
        # empty few-shot result next to a non-empty zero-shot one is suspicious.
        print(
            "⚠️ Warning: Few-shot prediction returned no entities while zero-shot found "
            f"{len(zero_shot_pred)}. The few-shot call may have failed - check any error printed above."
        )
    elif zero_shot_core != few_shot_core:
        print("✅ Few-shot examples are working! Predictions differ.")
        print(f"Zero-shot found {len(zero_shot_pred)} entities")
        print(f"Few-shot found {len(few_shot_pred)} entities")
    else:
        print("⚠️ Warning: Few-shot predictions identical to zero-shot. Check examples format.")
else:
    print("⚠️ Cannot run sanity check - missing examples or ground truth data")

In [None]:
# Same documents and labels as the zero-shot run; only the manual examples are added
few_shot_results = evaluate_gliner(
    examples=ground_truth_examples,
    entity_types=LEGAL_ENTITY_TYPES,
    few_shot_examples=manual_examples,
    method_name="Few-Shot GLiNER",
)

## 4. Results Analysis

In [None]:
def compare_results(zero_shot_results, few_shot_results):
    """Print a side-by-side comparison of two ``evaluate_gliner`` result dicts.

    The comparison covers prediction volume, average confidence and run time
    only; it does not measure accuracy against the ground truth.

    Args:
        zero_shot_results: result dict of the zero-shot run.
        few_shot_results: result dict of the few-shot run.

    Either argument may be the ``{"error": ...}`` dict that ``evaluate_gliner``
    returns when the model is not loaded; in that case a warning is printed
    and no comparison is made.

    Returns:
        None. All output is printed.
    """
    runs = (("Zero-Shot", zero_shot_results), ("Few-Shot", few_shot_results))

    # A failed run has none of the keys read below; report it instead of
    # raising KeyError.
    failed_runs = [(name, run["error"]) for name, run in runs if "error" in run]
    if failed_runs:
        for name, message in failed_runs:
            print(f"⚠️ {name} results not available for comparison: {message}")
        return

    print("\n📊 COMPARISON: Zero-Shot vs Few-Shot GLiNER")
    print("=" * 60)

    # Overall statistics
    print(f"\n📈 Overall Performance:")
    for name, run in runs:
        print(f"  {name}:")
        print(f"    🤖 Predicted entities: {run['total_pred_entities']}")
        print(f"    ⚡ Avg confidence: {run['avg_confidence']:.3f}")
        print(f"    ⏱️ Time: {run['evaluation_time']:.2f}s")

    # Entity type comparison over every label predicted by either run
    print(f"\n🏷️ Entity Type Predictions:")
    zero_counts = zero_shot_results['prediction_counts']
    few_counts = few_shot_results['prediction_counts']

    for label in sorted(set(zero_counts) | set(few_counts)):
        zero_count = zero_counts.get(label, 0)
        few_count = few_counts.get(label, 0)
        diff = few_count - zero_count
        diff_str = f"({diff:+d})" if diff != 0 else ""
        print(f"  {label}: {zero_count} → {few_count} {diff_str}")

    # Differences in prediction volume and confidence (few-shot minus zero-shot)
    total_improvement = few_shot_results['total_pred_entities'] - zero_shot_results['total_pred_entities']
    confidence_improvement = few_shot_results['avg_confidence'] - zero_shot_results['avg_confidence']

    print(f"\n📊 Summary:")
    print(f"  📈 Entity prediction change: {total_improvement:+d}")
    print(f"  ⚡ Confidence change: {confidence_improvement:+.3f}")

    if total_improvement > 0:
        print(f"  ✅ Few-shot found {total_improvement} more entities than zero-shot")
    elif total_improvement < 0:
        print(f"  ⚠️ Few-shot found {abs(total_improvement)} fewer entities than zero-shot")
    else:
        print(f"  ➡️ Few-shot and zero-shot found the same number of entities")

# Only compare when both evaluation cells have been run in this session
if 'zero_shot_results' not in locals() or 'few_shot_results' not in locals():
    print("⚠️ Results not available for comparison")
else:
    compare_results(zero_shot_results, few_shot_results)

In [None]:
def create_results_visualization(zero_shot_results, few_shot_results):
    """Render a 2x2 dashboard contrasting the zero-shot and few-shot runs.

    Panels (row-major): total predicted entities, average confidence,
    predictions per entity type, and throughput / wall-clock time.

    Args:
        zero_shot_results: Result dict of the zero-shot run. The keys read
            here are 'total_pred_entities', 'avg_confidence',
            'prediction_counts', 'entities_per_second', 'evaluation_time'.
        few_shot_results: Result dict of the few-shot run (same keys).

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    run_names = ['Zero-Shot', 'Few-Shot']
    run_colors = ['skyblue', 'lightcoral']
    runs = (zero_shot_results, few_shot_results)
    bar_width = 0.35

    def annotate(ax, bars, captions, lift):
        # Write each caption centred just above its bar.
        for bar, caption in zip(bars, captions):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + lift,
                    caption, ha='center', va='bottom', fontweight='bold')

    def paired_bars(ax, positions, zero_values, few_values):
        # Draw the zero-shot / few-shot bars side by side around each position.
        ax.bar(positions - bar_width/2, zero_values, bar_width, label='Zero-Shot', color='skyblue')
        ax.bar(positions + bar_width/2, few_values, bar_width, label='Few-Shot', color='lightcoral')

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('GLiNER Zero-Shot vs Few-Shot Comparison', fontsize=16, fontweight='bold')
    (ax_total, ax_conf), (ax_types, ax_perf) = axes

    # Panel 1: total number of predicted entities per run
    totals = [run['total_pred_entities'] for run in runs]
    total_bars = ax_total.bar(run_names, totals, color=run_colors)
    ax_total.set_title('Total Predicted Entities')
    ax_total.set_ylabel('Number of Entities')
    annotate(ax_total, total_bars, [str(total) for total in totals], 10)

    # Panel 2: mean confidence per run, on a fixed [0, 1] axis
    confidences = [run['avg_confidence'] for run in runs]
    conf_bars = ax_conf.bar(run_names, confidences, color=run_colors)
    ax_conf.set_title('Average Confidence Score')
    ax_conf.set_ylabel('Confidence')
    ax_conf.set_ylim(0, 1)
    annotate(ax_conf, conf_bars, [f'{value:.3f}' for value in confidences], 0.01)

    # Panel 3: per-entity-type prediction counts (union of both runs' labels)
    zero_by_type = zero_shot_results['prediction_counts']
    few_by_type = few_shot_results['prediction_counts']
    type_names = sorted(set(zero_by_type.keys()).union(few_by_type.keys()))
    type_positions = np.arange(len(type_names))
    paired_bars(
        ax_types,
        type_positions,
        [zero_by_type.get(name, 0) for name in type_names],
        [few_by_type.get(name, 0) for name in type_names],
    )
    ax_types.set_title('Predictions by Entity Type')
    ax_types.set_ylabel('Number of Predictions')
    ax_types.set_xticks(type_positions)
    # Break "X ili Y" labels onto two lines so the tick labels stay narrow
    ax_types.set_xticklabels(
        [name.replace(' ili ', '\n') for name in type_names],
        rotation=45, ha='right',
    )
    ax_types.legend()

    # Panel 4: throughput and runtime
    perf_names = ['Entities/sec', 'Time (s)']
    perf_keys = ('entities_per_second', 'evaluation_time')
    perf_positions = np.arange(len(perf_names))
    paired_bars(
        ax_perf,
        perf_positions,
        [zero_shot_results[key] for key in perf_keys],
        [few_shot_results[key] for key in perf_keys],
    )
    ax_perf.set_title('Performance Metrics')
    ax_perf.set_xticks(perf_positions)
    ax_perf.set_xticklabels(perf_names)
    ax_perf.legend()

    plt.tight_layout()
    plt.show()

# Draw the comparison figure only once both evaluation cells have produced results
if not {'zero_shot_results', 'few_shot_results'}.issubset(locals()):
    print("⚠️ Results not available for visualization")
else:
    create_results_visualization(
        zero_shot_results=zero_shot_results,
        few_shot_results=few_shot_results,
    )

## 🔍 Debug: Few-Shot Examples Format

In [None]:
# Debug: inspect the structure of the few-shot examples and smoke-test them
# against the evaluator
print("🔍 DEBUG: Few-Shot Examples Format")
print("=" * 50)

if 'manual_examples' not in locals() or len(manual_examples) == 0:
    print("❌ No manual_examples found!")
else:
    print(f"📚 Total few-shot examples: {len(manual_examples)}")
    print(f"📝 Type of manual_examples: {type(manual_examples)}")

    # Only the first example is inspected in detail
    print("\n🔍 First example structure:")
    first_example = manual_examples[0]
    print(f"Type: {type(first_example)}")

    if not isinstance(first_example, tuple):
        print(f"❌ Unexpected format: {first_example}")
    else:
        print(f"Tuple length: {len(first_example)}")
        print(f"First element type: {type(first_example[0])}")
        print(f"Second element type: {type(first_example[1])}")

        # First tuple element: the text, truncated for display
        text = first_example[0]
        print(f"\n📄 Text (first 200 chars): {text[:200]}...")

        # Second tuple element: the entity annotations
        entities = first_example[1]
        print("\n🏷️ Entities structure:")
        print(f"Type: {type(entities)}")

        if isinstance(entities, dict):
            print(f"Keys: {list(entities.keys())}")
            # Limit the dump to the first 3 entity types
            for key, values in list(entities.items())[:3]:
                print(f"  {key}: {values} (type: {type(values)})")

        print("\n✅ GLiNER Expected Format:")
        print("   (text, {label: [entity_texts]})")
        print("\n✅ Current Format:")
        print(f"   {type(first_example)} with {type(first_example[1])}")

    # Smoke test: pass a single example to the evaluator and report any error
    print("\n🧪 Testing GLiNER compatibility...")
    try:
        test_text = "Test text for GLiNER."
        test_pred = gliner_evaluator.predict_entities(
            test_text,
            ["sud ili tribunal"],
            examples=manual_examples[:1]  # Use just first example
        )
        print("✅ GLiNER accepted the format!")
        print(f"🔍 Test prediction: {test_pred}")
    except Exception as e:
        print(f"❌ GLiNER format error: {e}")

## 📊 Detailed Character-Level Classification Report (BIO Tagging)

In [None]:
# Generate detailed classification report
from sklearn.metrics import classification_report
import numpy as np

def _char_bio_tags(text_length, entities):
    """Return one BIO tag per character position for the given entity spans.

    Each entity dict must provide 'start', 'end' and 'label'. Labels are
    normalized through the module-level GLINER_TO_GT_MAPPING (unknown labels
    are kept as-is). Later entities overwrite earlier ones where they overlap.
    """
    tags = ['O'] * text_length
    for entity in entities:
        start, end = entity['start'], entity['end']
        # Skip spans that are empty, inverted, negative or past the end of the
        # text: they would otherwise write a stray B- tag or wrap around via
        # negative indexing.
        if not 0 <= start < end <= text_length:
            continue
        bio_label = GLINER_TO_GT_MAPPING.get(entity['label'], entity['label'])
        tags[start] = f'B-{bio_label}'
        tags[start + 1:end] = [f'I-{bio_label}'] * (end - start - 1)
    return tags


def generate_classification_report(results, method_name):
    """Print a character-level BIO classification report for one evaluation run.

    Every character of every document is tagged 'O', 'B-<label>' or
    'I-<label>' for both the ground truth and the predictions, and the two tag
    sequences are scored with sklearn's ``classification_report``. The numbers
    are therefore per character tag, not per entity.

    Args:
        results: Result dict of an evaluation run. Must contain
            'detailed_results', a list of dicts with the keys 'text',
            'true_entities' and 'pred_entities' (entity dicts carrying
            'start', 'end', 'label').
        method_name: Display name used in the report header.

    Returns:
        None. The report is printed to stdout.
    """
    print(f"\n📊 {method_name} - Detailed Entity-Level Classification Report:")
    print("=" * 80)

    if 'detailed_results' not in results:
        print("❌ No detailed results available")
        return

    # Concatenate the per-character tags of all documents
    all_true_labels = []
    all_pred_labels = []
    for example_result in results['detailed_results']:
        text_length = len(example_result['text'])
        all_true_labels.extend(_char_bio_tags(text_length, example_result['true_entities']))
        all_pred_labels.extend(_char_bio_tags(text_length, example_result['pred_entities']))

    # Both sequences always have the same length, so one emptiness check suffices
    if not all_true_labels:
        print("❌ No labels found for classification report")
        return

    report = classification_report(
        all_true_labels,
        all_pred_labels,
        zero_division=0,
        digits=2
    )
    print(report)

    # Count every tag in a single pass instead of one list.count() per label
    true_counts = Counter(all_true_labels)
    pred_counts = Counter(all_pred_labels)
    tag_names = (set(true_counts) | set(pred_counts)) - {'O'}

    print(f"\n📈 Entity-Level Statistics:")
    for label in sorted(tag_names):
        print(f"  {label:20} True: {true_counts[label]:4d}  Pred: {pred_counts[label]:4d}")

# Print the report for each evaluation run whose results exist
if 'zero_shot_results' in locals():
    generate_classification_report(results=zero_shot_results, method_name="Zero-Shot GLiNER")

if 'few_shot_results' in locals():
    generate_classification_report(results=few_shot_results, method_name="Few-Shot GLiNER")

## 🎯 Proper Entity-Level Classification Report

In [None]:
def generate_entity_classification_report(results, method_name):
    """Print exact-match precision/recall/F1 per entity type and overall.

    A prediction counts as correct only when its (start, end, label) triple is
    identical to a ground-truth entity of the same document. Matching is
    one-to-one: each predicted entity can satisfy at most one ground-truth
    entity, so duplicated ground-truth spans cannot push precision above 1.0.

    Args:
        results: Result dict of an evaluation run. Must contain
            'detailed_results', a list of dicts with the keys 'true_entities'
            and 'pred_entities' (entity dicts carrying 'start', 'end',
            'label').
        method_name: Display name used in the report header.

    Returns:
        None. The report is printed to stdout.
    """
    print(f"\n📊 {method_name} - Entity-Level Classification Report:")
    print("=" * 80)

    if 'detailed_results' not in results:
        print("❌ No detailed results available")
        return

    true_by_label = Counter()
    pred_by_label = Counter()
    correct_by_label = Counter()

    # NOTE(review): labels are compared verbatim here, whereas the BIO report
    # in this notebook normalizes them through GLINER_TO_GT_MAPPING. Confirm
    # that true and predicted entities already share one label set.
    for example_result in results['detailed_results']:
        true_spans = Counter(
            (entity['start'], entity['end'], entity['label'])
            for entity in example_result['true_entities']
        )
        pred_spans = Counter(
            (entity['start'], entity['end'], entity['label'])
            for entity in example_result['pred_entities']
        )

        for (_, _, label), count in true_spans.items():
            true_by_label[label] += count
        for (_, _, label), count in pred_spans.items():
            pred_by_label[label] += count

        # Multiset intersection keeps min(true, pred) copies of each span,
        # i.e. a one-to-one exact match between gold and predicted entities.
        for (_, _, label), count in (true_spans & pred_spans).items():
            correct_by_label[label] += count

    total_true = sum(true_by_label.values())
    total_pred = sum(pred_by_label.values())
    exact_matches = sum(correct_by_label.values())

    # Per-type table
    print(f"{'Entity Type':<30} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    print("=" * 80)

    total_precision = 0
    total_recall = 0
    total_f1 = 0
    num_types = 0

    for label in sorted(set(true_by_label) | set(pred_by_label)):
        true_count = true_by_label[label]
        pred_count = pred_by_label[label]
        correct_count = correct_by_label[label]

        precision = correct_count / pred_count if pred_count > 0 else 0.0
        recall = correct_count / true_count if true_count > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        print(f"{label:<30} {precision:<10.2f} {recall:<10.2f} {f1:<10.2f} {true_count:<10d}")

        total_precision += precision
        total_recall += recall
        total_f1 += f1
        num_types += 1

    # Micro average: pooled over all entities regardless of type
    print("=" * 80)
    overall_precision = exact_matches / total_pred if total_pred > 0 else 0.0
    overall_recall = exact_matches / total_true if total_true > 0 else 0.0
    overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0

    print(f"{'Micro Avg':<30} {overall_precision:<10.2f} {overall_recall:<10.2f} {overall_f1:<10.2f} {total_true:<10d}")

    # Macro average: unweighted mean of the per-type scores
    if num_types > 0:
        macro_precision = total_precision / num_types
        macro_recall = total_recall / num_types
        macro_f1 = total_f1 / num_types
        print(f"{'Macro Avg':<30} {macro_precision:<10.2f} {macro_recall:<10.2f} {macro_f1:<10.2f} {total_true:<10d}")

    print(f"\n📊 Summary:")
    print(f"  Total True Entities: {total_true}")
    print(f"  Total Predicted Entities: {total_pred}")
    print(f"  Exact Matches: {exact_matches}")
    accuracy = exact_matches/max(total_true, total_pred) if max(total_true, total_pred) > 0 else 0.0
    print(f"  Overall Accuracy: {accuracy:.2f}")

# Print the exact-match entity report for each evaluation run whose results exist
if 'zero_shot_results' in locals():
    generate_entity_classification_report(results=zero_shot_results, method_name="Zero-Shot GLiNER")

if 'few_shot_results' in locals():
    generate_entity_classification_report(results=few_shot_results, method_name="Few-Shot GLiNER")

In [None]:
# Save results for further analysis
# Collect run metadata plus whichever result dicts exist (None when a run was skipped)
results_summary = {
    "evaluation_date": time.strftime("%Y-%m-%d %H:%M:%S"),
    "model_name": gliner_evaluator.model_name,
    "confidence_threshold": CONFIDENCE_THRESHOLD,
    "entity_types": LEGAL_ENTITY_TYPES,
    "manual_examples_count": len(manual_examples),
    "ground_truth_examples_count": len(ground_truth_examples),
    "zero_shot": zero_shot_results if 'zero_shot_results' in locals() else None,
    "few_shot": few_shot_results if 'few_shot_results' in locals() else None
}

# Serialize BEFORE opening the output file: if any value is not
# JSON-serializable, json.dumps raises here and an existing results file is
# neither truncated nor left half-written.
output_file = "gliner_evaluation_results.json"
serialized_summary = json.dumps(results_summary, ensure_ascii=False, indent=2)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(serialized_summary)

print(f"\n💾 Results saved to {output_file}")
print(f"\n🎉 GLiNER evaluation complete!")
print(f"📊 Evaluated {len(ground_truth_examples)} documents")
print(f"🏷️ Tested {len(LEGAL_ENTITY_TYPES)} entity types")
print(f"📚 Used {len(manual_examples)} manual few-shot examples")