# ═══════════════════════════════════════════════════════════════════
# 🏛️ TLA Dataset Preparation for Egyptian Transliteration RAG System
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 1: Install & Import Libraries

In [25]:
# 🔹 Upgrade pip first
!pip install --upgrade pip --quiet

# 🔹 Core Dependencies
!pip install datasets>=2.18.0 --quiet
!pip install transformers>=4.38.0 --quiet
!pip install torch>=2.2.0 --quiet
!pip install pandas>=2.2.0 --quiet
!pip install numpy>=1.26.0 --quiet

# 🔹 Translation
!pip install sentencepiece>=0.2.0 --quiet

# 🔹 Vector Database
!pip install qdrant-client>=1.7.0 --quiet

# 🔹 Ollama Cloud API
!pip install httpx>=0.25.2,<0.26.0 --quiet
!pip install ollama>=0.1.7 --quiet

# 🔹 BM25 for Hybrid Search
!pip install rank-bm25>=0.2.2 --quiet

# 🔹 Utilities
!pip install tqdm>=4.66.0 --quiet
!pip install python-dotenv>=1.0.0 --quiet
!pip install jupyter>=1.0.0 --quiet
!pip install ipywidgets>=8.1.0 --quiet
!pip install scikit-learn --quiet
!pip install matplotlib --quiet
!pip install sentence-transformers --quiet
!pip install nltk==3.9.2 --quiet
!pip install rouge-score==0.1.2 --quiet
!pip install sacrebleu==2.6.0 --quiet


/bin/bash: line 1: 0.26.0: No such file or directory


In [26]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
import pandas as pd
import numpy as np
import re
import unicodedata
from tqdm.auto import tqdm
from datasets import load_dataset
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import subprocess
import json
import ollama

print("✅ All imports successful!")


✅ All imports successful!


In [4]:
## configuration
# Models
EMBEDDING_MODEL = "BAAI/bge-m3"  # Ollama local

# Settings
VECTOR_DIM = 1024
TRAIN_SPLIT = 0.99  # 99% for training, 1% for testing

# Egyptian character mapping (uniliteral signs)
EGYPTIAN_CHAR_MAP = {
    # Traditional → Normalized
    'ꜣ': 'a',      # vulture (aleph)
    'ꞽ': 'i',      # reed (yodh)
    'y': 'y',      # double yodh
    'ꜥ': 'a',      # arm (ayin)
    'w': 'w',      # quail
    'b': 'b',      # leg
    'p': 'p',      # stool
    'f': 'f',      # viper
    'm': 'm',      # owl
    'n': 'n',      # water
    'r': 'r',      # mouth
    'h': 'h',      # shelter
    'ḥ': 'h',      # wick
    'ḫ': 'kh',     # placenta
    'ẖ': 'kh',     # belly
    's': 's',      # cloth
    'š': 'sh',     # pool
    'ḳ': 'q',      # hill
    'q': 'q',      # hill
    'k': 'k',      # basket
    'g': 'g',      # stand
    't': 't',      # bun
    'ṯ': 'tj',     # rope
    'd': 'd',      # hand
    'ḏ': 'dj',     # cobra

    # Additional special characters
    'ṭ': 't',
    'ḍ': 'd',
    'ṣ': 's',
    'ẓ': 'z',
    'ḥ': 'h',
}

# Suffixes to remove (pronouns and particles)
SUFFIXES_TO_REMOVE = [
    '=f',   # his/him
    '=k',   # your/you (masc)
    '=ṯ',   # your/you (fem)
    '=s',   # her/it
    '=sn',  # their/them
    '=ꞽ',   # my/me
    '=n',   # our/us
    '=tn',  # your/you (pl)
    '=fꞽ',  # variant
]

print(f"🔧 Configuration loaded")
print(f"   Training split: {TRAIN_SPLIT*100}%")
print(f"   Embedding model: {EMBEDDING_MODEL}")

🔧 Configuration loaded
   Training split: 99.0%
   Embedding model: BAAI/bge-m3


In [5]:
## part 3: Load dataset
print("📥 Loading TLA dataset from HuggingFace...")

dataset = load_dataset(
    "thesaurus-linguae-aegyptiae/tla-Earlier_Egyptian_original-v18-premium",
    split="train"
)

df = pd.DataFrame(dataset)

print(f"✅ Loaded {len(df)} records")
print(f"\nColumns: {list(df.columns)}")
print(f"\nSample record:")
print(df.iloc[0][['transliteration', 'translation', 'UPOS']].to_dict())

📥 Loading TLA dataset from HuggingFace...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/5.92M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

✅ Loaded 12773 records

Columns: ['hieroglyphs', 'transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation', 'dateNotBefore', 'dateNotAfter']

Sample record:
{'transliteration': 'nḏ (w)di̯ r =s', 'translation': '(es) werde zerrieben, (es) werde darauf gelegt.', 'UPOS': 'VERB VERB ADP PRON'}


In [6]:
## part 4 : Data Cleaning
print("\n" + "="*70)
print("🧹 STEP 1: Removing unwanted columns")
print("="*70)

# Remove unwanted columns
columns_to_drop = ['hieroglyphs', 'dateNotBefore', 'dateNotAfter']
df_clean = df.drop(columns=columns_to_drop)

print(f"✅ Removed columns: {columns_to_drop}")
print(f"   Remaining columns: {list(df_clean.columns)}")

# Remove rows with missing critical data
print("\n🧹 STEP 2: Removing rows with missing data")
initial_count = len(df_clean)

df_clean = df_clean.dropna(subset=['transliteration', 'translation'])
df_clean = df_clean[df_clean['transliteration'].str.strip() != '']
df_clean = df_clean[df_clean['translation'].str.strip() != '']

print(f"✅ Removed {initial_count - len(df_clean)} rows with missing data")
print(f"   Records remaining: {len(df_clean)}")

# Remove duplicates
print("\n🧹 STEP 3: Removing duplicates")
initial_count = len(df_clean)

df_clean = df_clean.drop_duplicates(subset=['transliteration'], keep='first')

print(f"✅ Removed {initial_count - len(df_clean)} duplicate records")
print(f"   Unique records: {len(df_clean)}")

df_clean = df_clean.reset_index(drop=True)


🧹 STEP 1: Removing unwanted columns
✅ Removed columns: ['hieroglyphs', 'dateNotBefore', 'dateNotAfter']
   Remaining columns: ['transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation']

🧹 STEP 2: Removing rows with missing data
✅ Removed 0 rows with missing data
   Records remaining: 12773

🧹 STEP 3: Removing duplicates
✅ Removed 3685 duplicate records
   Unique records: 9088


In [7]:
## part 5: Transliteration Normalization
def normalize_transliteration(text):
    """
    Normalize Egyptian transliteration:
    1. Remove brackets
    2. Lowercase
    3. Map special characters
    4. Remove suffixes
    5. Clean spaces
    """
    if not isinstance(text, str) or text.strip() == '':
        return ""

    # Step 1: Remove brackets (but keep content)
    text = re.sub(r'[()]', '', text)

    # Step 2: Normalize Unicode (NFC form)
    text = unicodedata.normalize('NFC', text)

    # REMOVE combining marks (important for di̯, etc.)
    text = ''.join(c for c in text if not unicodedata.combining(c))

    # Step 3: Lowercase
    text = text.lower()

    # Step 4: Map Egyptian characters
    for egy_char, normalized in EGYPTIAN_CHAR_MAP.items():
        text = text.replace(egy_char.lower(), normalized)

    # Step 5: Remove suffixes (pronouns/particles)
    for suffix in SUFFIXES_TO_REMOVE:
        # Match suffix at word boundaries or before spaces/dots
        pattern = re.escape(suffix) + r'(?=[\s\.]|$)'
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Step 6: Clean up extra spaces and dots
    text = re.sub(r'\.+', '.', text)  # Multiple dots to single
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
    text = text.strip('. ')  # Remove leading/trailing dots and spaces

    return text

print("\n" + "="*70)
print("🔤 STEP 4: Normalizing transliterations")
print("="*70)

# Test normalization on sample
sample_text = df_clean.iloc[0]['transliteration']
normalized_sample = normalize_transliteration(sample_text)

print(f"\n📝 Sample normalization:")
print(f"   Original:   {sample_text}")
print(f"   Normalized: {normalized_sample}")

# Apply normalization to entire dataset
print(f"\n🔄 Normalizing {len(df_clean)} transliterations...")

df_clean['transliteration_normalized'] = df_clean['transliteration'].apply(
    normalize_transliteration
)


# Remove empty normalizations
df_clean = df_clean[df_clean['transliteration_normalized'].str.len() > 0]
df_clean = df_clean.reset_index(drop=True)

print(f"✅ Normalization complete!")
print(f"   Valid records: {len(df_clean)}")

# Show more examples
print(f"\n📋 Sample normalizations:")
for i in range(min(5, len(df_clean))):
    orig = df_clean.iloc[i]['transliteration']
    norm = df_clean.iloc[i]['transliteration_normalized']
    print(f"   {i+1}. {orig[:40]:40} → {norm[:40]}")



🔤 STEP 4: Normalizing transliterations

📝 Sample normalization:
   Original:   nḏ (w)di̯ r =s
   Normalized: ndj wdi r

🔄 Normalizing 9088 transliterations...
✅ Normalization complete!
   Valid records: 9088

📋 Sample normalizations:
   1. nḏ (w)di̯ r =s                           → ndj wdi r
   2. n ṯw ꞽm =sn                              → n tjw im
   3. ḫꜣ m tʾ ḥnq.t kꜣ(.PL) ꜣpd(.PL) n ꞽmꜣḫ ꞽm → kha m tʾ hnq.t ka.pl apd.pl n imakh im.i
   4. ꜥḥꜥ                                      → aha
   5. (w)sꞽr wnꞽs m n =k ꞽr.t-ḥr.w ꞽꜥb n =k s( → wsir wnis m n ir.t-hr.w iab n si ir rʾ


In [8]:
## part 6: Train/ test split

print("\n" + "="*70)
print(f"📊 STEP 5: Creating train/test split ({TRAIN_SPLIT*100}%/{(1-TRAIN_SPLIT)*100}%)")
print("="*70)

# Shuffle dataset
df_clean = df_clean.sample(frac=1, random_state=42).reset_index(drop=True)

# Split
split_idx = int(len(df_clean) * TRAIN_SPLIT)
df_train = df_clean.iloc[:split_idx].copy()
df_test = df_clean.iloc[split_idx:].copy()

print(f"✅ Split complete!")
print(f"   Training set: {len(df_train)} records ({len(df_train)/len(df_clean)*100:.1f}%)")
print(f"   Test set:     {len(df_test)} records ({len(df_test)/len(df_clean)*100:.1f}%)")

# Save test set for later evaluation
df_test.to_csv('tla_test_set.csv', index=False)
print(f"\n💾 Test set saved to: tla_test_set.csv")


📊 STEP 5: Creating train/test split (99.0%/1.0000000000000009%)
✅ Split complete!
   Training set: 8997 records (99.0%)
   Test set:     91 records (1.0%)

💾 Test set saved to: tla_test_set.csv


In [9]:
## part 7: Generate Embedding

from sentence_transformers import SentenceTransformer

print("\n📥 Loading embedding model...")
# Load model (do this ONCE before the loop)
embedding_model = SentenceTransformer('BAAI/bge-m3')
print(f"✅ Model loaded: BAAI/bge-m3")

def get_embedding_fast(text):
    """Generate embedding using sentence-transformers"""
    try:
        # Generate embedding
        embedding = embedding_model.encode(text, normalize_embeddings=True)
        return embedding.tolist()
    except Exception as e:
        print(f"❌ Error: {e}")
        return np.random.randn(VECTOR_DIM).tolist()

print("\n" + "="*70)
print(f"🔢 STEP 6: Generating embeddings for {len(df_train)} records")
print("="*70)

print(f"\n⚙️ Using model: BAAI/bge-m3")
print(f"   Vector dimension: {VECTOR_DIM}")

# Generate embeddings in batches (MUCH faster!)
batch_size = 32
all_embeddings = []

for i in tqdm(range(0, len(df_train), batch_size), desc="Generating embeddings"):
    batch_end = min(i + batch_size, len(df_train))
    batch_texts = df_train.iloc[i:batch_end]['transliteration_normalized'].tolist()

    try:
        # Process entire batch at once (FAST!)
        batch_embeddings = embedding_model.encode(
            batch_texts,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        all_embeddings.extend(batch_embeddings.tolist())
    except Exception as e:
        print(f"❌ Batch error at {i}: {e}")
        # Fallback: process individually
        for text in batch_texts:
            all_embeddings.append(get_embedding_fast(text))

df_train['embedding'] = all_embeddings

print(f"\n✅ Embedding generation complete!")
print(f"   Total: {len(all_embeddings)} embeddings")
print(f"   Dimension: {len(all_embeddings[0])}")

# Verify
sample_embedding = all_embeddings[0]
print(f"\n📊 Sample embedding (first 10 values):")
print(f"   {sample_embedding[:10]}")


📥 Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ Model loaded: BAAI/bge-m3

🔢 STEP 6: Generating embeddings for 8997 records

⚙️ Using model: BAAI/bge-m3
   Vector dimension: 1024


Generating embeddings:   0%|          | 0/282 [00:00<?, ?it/s]


✅ Embedding generation complete!
   Total: 8997 embeddings
   Dimension: 1024

📊 Sample embedding (first 10 values):
   [0.009475680999457836, 0.012296928092837334, -0.03066054731607437, 0.0029091022443026304, -0.038571588695049286, -0.0011071120388805866, -0.002732239430770278, -0.013377217575907707, 0.02956884168088436, -0.00481629790738225]


In [10]:
## part 8: Exract Lemmas from Lemmettizaton

def extract_lemmas(lemmatization_text):
    """Extract lemma words from lemmatization field"""
    if not isinstance(lemmatization_text, str):
        return []

    lemmas = []
    parts = lemmatization_text.split()

    for part in parts:
        if '|' in part:
            lemma_id, lemma_word = part.split('|', 1)
            # Skip suffixes/particles
            if not lemma_word.startswith('='):
                lemmas.append(lemma_word)

    return lemmas

print("\n" + "="*70)
print("📝 STEP 7: Extracting lemmas")
print("="*70)

df_train['lemmas'] = df_train['lemmatization'].apply(extract_lemmas)

print(f"✅ Lemma extraction complete!")
print(f"\n📋 Sample lemmas:")
for i in range(min(3, len(df_train))):
    lemmas = df_train.iloc[i]['lemmas']
    print(f"   {i+1}. {lemmas[:5]}")


📝 STEP 7: Extracting lemmas
✅ Lemma extraction complete!

📋 Sample lemmas:
   1. ['ḥm-nṯr-Ḫwi̯=f-wꞽ', 'ḥr.ꞽ-sštꜣ']
   2. ['zꜣ', 'sms.w', 'ꞽm.ꞽ-rʾ-zẖꜣ.ww-ꜥ-n-nswt', 'Sšm-nfr']
   3. ['zbi̯', 'ṯw', 'm', 'ꜥḥꜥ.w', 'nfr']


In [11]:
## part 9: Setup Qdrant Vector Database
print("\n" + "="*70)
print("Setting up Qdrant database")
print("="*70)

# Initialize Qdrant (in-memory for development)
# For production, use: QdrantClient(host="localhost", port=6333)

# Initialize persistent Qdrant (local)
#qdrant = QdrantClient(path="qdrant_db")

# Initialize Qdrant (in-memory for development)
qdrant = QdrantClient(":memory:")



print(f"✅ Qdrant client initialized (in-memory)")

# Create collection
collection_name = "egyptian_transliterations"

qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=VECTOR_DIM,
        distance=Distance.COSINE
    )
)

print(f"✅ Collection created: {collection_name}")
print(f"   Vector size: {VECTOR_DIM}")
print(f"   Distance metric: COSINE")


Setting up Qdrant database
✅ Qdrant client initialized (in-memory)
✅ Collection created: egyptian_transliterations
   Vector size: 1024
   Distance metric: COSINE


In [12]:
## part 10: upload data to qdrant


print("\n" + "="*70)
print(f" Uploading {len(df_train)} records to Qdrant")
print("="*70)

# Prepare points
points = []

for idx, row in tqdm(df_train.iterrows(), total=len(df_train), desc="Preparing points"):
    point = PointStruct(
        id=idx,
        vector=row['embedding'],
        payload={
            "transliteration_original": row['transliteration'],
            "transliteration_normalized": row['transliteration_normalized'],
            "lemmas": row['lemmas'],
            "UPOS": row.get('UPOS', ''),
            "glossing": row.get('glossing', ''),
            "translation_de": row['translation']
        }
    )
    points.append(point)

# Upload in batches
batch_size = 100
print(f"\n📦 Uploading in batches of {batch_size}...")

for i in tqdm(range(0, len(points), batch_size), desc="Uploading batches"):
    batch = points[i:i+batch_size]
    qdrant.upsert(
        collection_name=collection_name,
        points=batch
    )

print(f"\n✅ Upload complete!")
print(f"   Total records in database: {len(points)}")





 Uploading 8997 records to Qdrant


Preparing points:   0%|          | 0/8997 [00:00<?, ?it/s]


📦 Uploading in batches of 100...


Uploading batches:   0%|          | 0/90 [00:00<?, ?it/s]


✅ Upload complete!
   Total records in database: 8997


In [13]:
## part 11 : verify Database

print("\n" + "="*70)
print("✅ STEP 10: Verifying database")
print("="*70)

count_info = qdrant.count(
    collection_name=collection_name,
    exact=True
)

print(f"📊 Collection statistics:")
print(f"   Name: {collection_name}")
print(f"   Points count: {count_info.count}")

# Test search
print(f"\n🔍 Testing search functionality...")

test_query = df_train.iloc[0]['transliteration_normalized']
test_embedding = df_train.iloc[0]['embedding']

search_results = qdrant.query_points(
    collection_name=collection_name,
    query=test_embedding,
    limit=3
).points

print(f"\n📝 Test query: {test_query}")
print(f"\n🎯 Top 3 search results:")

for i, result in enumerate(search_results, 1):
    print(f"\n   {i}. Score: {result.score:.4f}")
    print(f"      Transliteration: {result.payload['transliteration_normalized']}")
    print(f"      Translation: {result.payload['translation_de'][:60]}...")



✅ STEP 10: Verifying database
📊 Collection statistics:
   Name: egyptian_transliterations
   Points count: 8997

🔍 Testing search functionality...

📝 Test query: hm-ntjr-khwi=f-wi hr.i-sshta

🎯 Top 3 search results:

   1. Score: 1.0000
      Transliteration: hm-ntjr-khwi=f-wi hr.i-sshta
      Translation: Priester des Cheops und Hüter des Geheimnisses....

   2. Score: 0.9444
      Transliteration: hm-ntjr-khwi=f-wi hr.i-sshta ka=i-n.i-nswt
      Translation: Priester des Cheops und Hüter des Geheimnisses Kai-ni-nisut....

   3. Score: 0.8495
      Transliteration: wt.i hr.i-sshta
      Translation: Balsamierer, Hüter des Geheimnisses...


# ═══════════════════════════════════════════════════════════════════
# 🔮 PART 2: RAG Translation Pipeline
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 12: Install Additional Libraries

In [14]:
import os
from google.colab import userdata

# Load Ollama API Key securely from Colab Secrets
OLLAMA_API_KEY = userdata.get('OLLAMA_API_KEY')

if OLLAMA_API_KEY is None:
    raise ValueError("❌ OLLAMA_API_KEY not found in Colab Secrets")

# Set env var for libraries that expect it
os.environ['OLLAMA_API_KEY'] = OLLAMA_API_KEY

# Configuration
LLM_MODEL = "qwen3-vl:235b-instruct-cloud" #"gpt-oss:120b-cloud" #"qwen3-next:80b-cloud" #"qwen3-vl:235b-cloud"
TOP_K_RESULTS = 55

print("🔧 RAG Pipeline Configuration:")
print(f"   LLM Model: {LLM_MODEL}")
print(f"   Top-K Results: {TOP_K_RESULTS}")
print(f"   API Key: ✅ Loaded securely from Colab Secrets")


🔧 RAG Pipeline Configuration:
   LLM Model: qwen3-vl:235b-instruct-cloud
   Top-K Results: 55
   API Key: ✅ Loaded securely from Colab Secrets


In [15]:
!pip install rank-bm25



In [16]:
## part 14: prepare BM25 index for Sparce Search

from rank_bm25 import BM25Okapi

print("\n" + "="*70)
print(" Building BM25 index for sparse search")
print("="*70)

# Tokenize corpus for BM25
corpus_texts = df_train['transliteration_normalized'].tolist()
tokenized_corpus = [text.split() for text in corpus_texts]

# Build BM25 index
bm25 = BM25Okapi(tokenized_corpus)

print(f"✅ BM25 index built!")
print(f"   Documents indexed: {len(tokenized_corpus)}")


 Building BM25 index for sparse search
✅ BM25 index built!
   Documents indexed: 8997


In [17]:
## part 15: Hybrid Search Function
def hybrid_search(query_text, query_embedding, top_k=10, alpha=0.5):
    """
    Perform hybrid search: Dense (Vector) + Sparse (BM25)

    Args:
        query_text: Normalized transliteration query
        query_embedding: Embedding vector of query
        top_k: Number of results to return
        alpha: Weight for dense search (1-alpha for sparse)

    Returns:
        List of search results with scores
    """

    # 1. Dense Search (Vector Similarity)
    dense_results = qdrant.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=top_k * 2
    ).points


    # 2. Sparse Search (BM25)
    query_tokens = query_text.split()
    bm25_scores = bm25.get_scores(query_tokens)

    # Get top BM25 indices
    top_bm25_indices = np.argsort(bm25_scores)[-top_k*2:][::-1]

    # 3. Reciprocal Rank Fusion (RRF)
    combined_scores = {}

    # Add dense scores
    for rank, result in enumerate(dense_results):
        doc_id = result.id
        rrf_score = 1 / (rank + 60)  # RRF formula
        combined_scores[doc_id] = {
            'rrf_score': rrf_score,
            'dense_score': result.score,
            'sparse_score': 0,
            'payload': result.payload
        }

    # Add sparse scores
    for rank, idx in enumerate(top_bm25_indices):
        if idx in combined_scores:
            combined_scores[idx]['rrf_score'] += 1 / (rank + 60)
            combined_scores[idx]['sparse_score'] = bm25_scores[idx]
        else:
            # Retrieve payload from Qdrant
            point = qdrant.retrieve(
                collection_name=collection_name,
                ids=[int(idx)]
            )
            if point:
                combined_scores[idx] = {
                    'rrf_score': 1 / (rank + 60),
                    'dense_score': 0,
                    'sparse_score': bm25_scores[idx],
                    'payload': point[0].payload
                }

    # 4. Sort by combined RRF score
    sorted_results = sorted(
        combined_scores.items(),
        key=lambda x: x[1]['rrf_score'],
        reverse=True
    )[:top_k]

    # 5. Format results
    final_results = []
    for doc_id, scores in sorted_results:
        final_results.append({
            'id': doc_id,
            'rrf_score': scores['rrf_score'],
            'dense_score': scores['dense_score'],
            'sparse_score': scores['sparse_score'],
            'payload': scores['payload']
        })

    return final_results

print("✅ Hybrid search function ready!")


✅ Hybrid search function ready!


In [18]:
## part 16: LLM Translation Function
import requests
OLLAMA_API_URL = "https://ollama.com/api/chat"

def translate_with_llm(query_original, query_normalized, retrieved_examples):
    """
    Use LLM to translate Egyptian to German based on retrieved examples
    """

    # Build examples context (same as before)
    examples_text = ""
    for i, example in enumerate(retrieved_examples, 1):
        payload = example['payload']
        examples_text += f"""
Example {i}:
- Original: {payload['transliteration_original']}
- Normalized: {payload['transliteration_normalized']}
- Lemmas: {', '.join(payload['lemmas'][:5]) if payload['lemmas'] else 'N/A'}
- POS Tags: {payload['UPOS']}
- Glossing: {payload['glossing']}
- German: {payload['translation_de']}
---
"""

    # Build prompt (same as before)
    prompt = f"""You are a senior linguist specializing in Earlier Egyptian (Old Egyptian & Early Middle Egyptian),
with strong expertise in morphology, syntax, and historical semantics.

Your task is to translate an Earlier Egyptian transliteration into German
using retrieved linguistic examples ONLY as structural and semantic guidance.

=====================================
QUERY TO TRANSLATE
=====================================

Normalized Transliteration:
{query_normalized}

=====================================
RETRIEVED DATABASE EXAMPLES
=====================================
{examples_text}

=====================================
INSTRUCTIONS
=====================================
Follow these steps carefully:

1. Linguistic Analysis
   - Identify the grammatical category of each word (verb, noun, particle, suffix, etc.)
   - Detect verb tense/aspect, suffix pronouns, and syntactic order (VSO, SVO, nominal clause).

2. Morphological Alignment
   - Compare suffixes, verb forms, and particles with the retrieved examples.
   - Use lemma meanings as semantic hints, not literal translations.

3. Translation Construction
   - Produce a fluent and historically plausible German translation.
   - Adapt word order to correct German syntax.
   - Prefer linguistically conservative interpretations over speculative ones.

4. Uncertainty Handling
   - If multiple readings are possible, choose the most likely one.
   - Briefly mention ambiguity only if it materially affects meaning.

=====================================
STRICT RULES
=====================================
- DO NOT copy any German translation from the examples.
- DO NOT mention example numbers or quote them.
- DO NOT add explanations unless uncertainty exists.
- DO NOT hallucinate missing words.
- Base your output strictly on Earlier Egyptian grammar.

=====================================
OUTPUT FORMAT (STRICT)
=====================================
German Translation: <one clear German sentence>
Confidence: High | Medium | Low
Notes: <only if confidence is Medium or Low>

"""

    # Call Ollama Cloud API with CORRECT endpoint
    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OLLAMA_API_KEY}"
        }

        # Use Ollama's native format (not OpenAI format)
        payload = {
            "model": LLM_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert Ancient Egyptian linguist specializing in translating Earlier Egyptian to German."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": False
        }

        response = requests.post(
            OLLAMA_API_URL,
            headers=headers,
            json=payload,
            timeout=240
        )

        if response.status_code == 200:
            result = response.json()
            # Ollama format uses 'message' -> 'content'
            llm_output = result['message']['content']

            # Extract German translation
            import re
            match = re.search(r'German Translation:\s*(.+?)(?:\n|$)', llm_output, re.IGNORECASE)
            if match:
                german_translation = match.group(1).strip()
                return german_translation, llm_output
            else:
                return llm_output.split('\n')[0].strip(), llm_output
        else:
            print(f"❌ API Error: {response.status_code}")
            print(f"   Response: {response.text}")
            return None, None

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return None, None


In [19]:
## part 17: German to English Tranlslation
from transformers import MarianMTModel, MarianTokenizer

print("\n" + "="*70)
print("Loading German→English translation model")
print("="*70)

# Load MarianMT model
print("📥 Loading MarianMT model...")
de_en_model_name = "Helsinki-NLP/opus-mt-de-en"
de_en_tokenizer = MarianTokenizer.from_pretrained(de_en_model_name)
de_en_model = MarianMTModel.from_pretrained(de_en_model_name)

print(f"✅ Model loaded: {de_en_model_name}")

def translate_german_to_english(german_text):
    """Translate German to English using MarianMT"""
    try:
        # Tokenize
        inputs = de_en_tokenizer(
            german_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        # Generate translation
        outputs = de_en_model.generate(**inputs)

        # Decode
        english_text = de_en_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        return english_text

    except Exception as e:
        print(f"❌ Translation error: {e}")
        return None

print("✅ German→English translation ready!")




Loading German→English translation model
📥 Loading MarianMT model...


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/258 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

✅ Model loaded: Helsinki-NLP/opus-mt-de-en
✅ German→English translation ready!


In [20]:
## part 18: Complete Translation pipeline

def translate_egyptian_to_english(query_original, show_details=True):
    """
    Complete pipeline: Egyptian → German → English

    Args:
        query_original: Original Egyptian transliteration
        show_details: Print intermediate steps

    Returns:
        dict with results
    """

    if show_details:
        print("\n" + "="*70)
        print(f"📝 TRANSLATING: {query_original}")
        print("="*70)

    # Step 1: Normalize query
    query_normalized = normalize_transliteration(query_original)

    if show_details:
        print(f"\n1️⃣ Normalization:")
        print(f"   Original:   {query_original}")
        print(f"   Normalized: {query_normalized}")

    # Step 2: Generate embedding
    query_embedding = embedding_model.encode(
        query_normalized,
        normalize_embeddings=True
    ).tolist()

    if show_details:
        print(f"\n2️⃣ Embedding generated (dim={len(query_embedding)})")

    # Step 3: Hybrid search
    if show_details:
        print(f"\n3️⃣ Hybrid search (Dense + BM25)...")

    search_results = hybrid_search(
        query_text=query_normalized,
        query_embedding=query_embedding,
        top_k=TOP_K_RESULTS
    )

    if show_details:
        print(f"   ✅ Found {len(search_results)} results")
        print(f"\n   📊 Top 3 matches:")
        for i, result in enumerate(search_results[:3], 1):
            print(f"\n   {i}. RRF Score: {result['rrf_score']:.4f}")
            print(f"      Transliteration: {result['payload']['transliteration_normalized']}")
            print(f"      German: {result['payload']['translation_de'][:50]}...")

    # Step 4: LLM Translation (German)
    if show_details:
        print(f"\n4️⃣ LLM Translation (Egyptian → German)...")

    german_translation, llm_full_output = translate_with_llm(
        query_original=query_original,
        query_normalized=query_normalized,
        retrieved_examples=search_results
    )

    if not german_translation:
        return {
            'success': False,
            'error': 'LLM translation failed'
        }

    if show_details:
        print(f"   🇩🇪 German: {german_translation}")

    # Step 5: German → English
    if show_details:
        print(f"\n5️⃣ Translation (German → English)...")

    english_translation = translate_german_to_english(german_translation)

    if not english_translation:
        return {
            'success': False,
            'error': 'German→English translation failed'
        }

    # Final result
    if show_details:
        print("\n" + "="*70)
        print("✅ TRANSLATION COMPLETE")
        print("="*70)
        print(f"🏛️ Egyptian:  {query_original}")
        print(f"🔤 Normalized: {query_normalized}")
        print(f"🇩🇪 German:    {german_translation}")
        print(f"🇬🇧 English:   {english_translation}")
        print("="*70 + "\n")

    return {
        'success': True,
        'query_original': query_original,
        'query_normalized': query_normalized,
        'german': german_translation,
        'english': english_translation,
        'llm_output': llm_full_output,
        'top_matches': search_results[:3]
    }

print("✅ Complete translation pipeline ready!")


✅ Complete translation pipeline ready!


# ═══════════════════════════════════════════════════════════════════
# 📊 EVALUATION METRICS FOR EGYPTIAN TRANSLITERATION RAG SYSTEM
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 24: Install Evaluation Libraries

In [21]:
print(len(df_test))
print(df_test.columns.tolist())
df_test.head()

91
['transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation', 'transliteration_normalized']


Unnamed: 0,transliteration,lemmatization,UPOS,glossing,translation,transliteration_normalized
8997,smr-wꜥ.tꞽ ẖr(.ꞽ)-ḥ(ꜣ)b(.t) (ꞽ)m(.ꞽ)-r(ʾ)-(ꞽ)ꜥw...,400142|smr-wꜥ.tꞽ 124340|ẖr.ꞽ-ḥꜣb.t 400011|ꞽm.ꞽ...,NOUN NOUN NOUN PROPN,TITL TITL TITL PERSN,"Der Einzige Freund, der Vorlesepriester, der V...",smr-wa.ti khr.i-hab.t im.i-rʾ-iaw hr.w-khwi
8998,ḫꜣ ꜣpd ḫꜣ ꜣpd ḫꜣ mnḫ.t ḫꜣ ꞽḥ ḫꜣ tʾ ḥnq.t pꜣ.t ...,113110|ḫꜣ 107|ꜣpd 113110|ḫꜣ 107|ꜣpd 113110|ḫꜣ ...,NOUN NOUN NOUN NOUN NOUN NOUN NOUN NOUN NOUN N...,N.m N.m N.m N.m N.m N.f N.m N.m N.m N.m N.f N....,"Tausend an Geflügel, tausend an Geflügel, taus...",kha apd kha apd kha mnkh.t kha ih kha tʾ hnq.t...
8999,ꞽ:nḏ ḥr =k mnw m pr.t.PL =f qꜣ šw.tꞽ zꜣ wsꞽr m...,91190|nḏ+(ḥr) 107510|ḥr 10110|=k 70530|Mnw 643...,VERB NOUN PRON PROPN ADP NOUN PRON ADJ NOUN NO...,V N.m:stpr -2sg.m DIVN PREP N.f:pl:stpr -3sg.m...,"Sei gegrüßt, Min bei seinen Prozessionen, mit ...",i:ndj hr mnw m pr.t.pl qa shw.ti za wsir msi.n...
9000,šdi̯.t kꜣ rḫ.yt,158710|šdi̯ 162890|kꜣ 95820|rḫ.yt,VERB NOUN NOUN,V\inf N.m N.f,Darbringen der Speisen (für die) Untertanen,shdi.t ka rkh.yt
9001,(w)sr(.w) n(ꞽ).t m n =k ꞽr(.t)-ḥr.w šmi̯.t =f ...,49461|Wsꞽr 702960|Nꞽ.t 67780|mꞽ 400055|n 10110...,NOUN PROPN VERB ADP PRON NOUN VERB PRON ADP PRON,TITL PERSN V\imp.sg PREP:stpr -2sg.m N.f V\rel...,"Osiris Neith, nimm dir das Horusauge, zu dem e...",wsr.w ni.t m n ir.t-hr.w shmi.t ir


In [22]:
## 📦 Part 19: Install Evaluation Libraries
import sys
print("\n📦 Installing all evaluation libraries...")
eval_packages = [
    'nltk',
    'rouge-score',
    'sacrebleu',
]

for package in eval_packages:
    subprocess.run([sys.executable, '-m', 'pip', 'install', package, '--break-system-packages', '-q'])

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from sacrebleu.metrics import CHRF
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("📥 Downloading NLTK data...")
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    print("✅ NLTK data ready!")
except:
    print("⚠️ NLTK download warning (may still work)")


📦 Installing all evaluation libraries...
📥 Downloading NLTK data...
✅ NLTK data ready!


In [23]:
## 📊 Part 20: Define Evaluation Metrics

# ============================================================================
# TRANSLATION QUALITY METRICS
# ============================================================================

def calculate_bleu(reference, hypothesis):
    """Calculate BLEU score (0-100)"""
    try:
        reference_tokens = reference.lower().split()
        hypothesis_tokens = hypothesis.lower().split()
        smoothing = SmoothingFunction()
        bleu_score = sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            smoothing_function=smoothing.method1
        )
        return bleu_score * 100
    except:
        return 0.0

def calculate_rouge(reference, hypothesis):
    """Calculate ROUGE scores"""
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, hypothesis)
        return {
            'rouge1': scores['rouge1'].fmeasure * 100,
            'rouge2': scores['rouge2'].fmeasure * 100,
            'rougeL': scores['rougeL'].fmeasure * 100
        }
    except:
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

def calculate_meteor(reference, hypothesis):
    """Calculate METEOR score (0-100)"""
    try:
        reference_tokens = reference.lower().split()
        hypothesis_tokens = hypothesis.lower().split()
        meteor = meteor_score([reference_tokens], hypothesis_tokens)
        return meteor * 100
    except:
        return 0.0

def calculate_chrf(reference, hypothesis):
    """Calculate chrF score (0-100)"""
    try:
        chrf = CHRF()
        score = chrf.sentence_score(hypothesis, [reference])
        return score.score  # Already 0-100
    except:
        return 0.0

def calculate_exact_match(reference, hypothesis):
    """Calculate exact match"""
    return 100.0 if reference.strip().lower() == hypothesis.strip().lower() else 0.0

def calculate_word_overlap(reference, hypothesis):
    """Calculate word-level overlap percentage"""
    try:
        ref_words = set(reference.lower().split())
        hyp_words = set(hypothesis.lower().split())
        if len(ref_words) == 0:
            return 0.0
        overlap = len(ref_words.intersection(hyp_words))
        return (overlap / len(ref_words)) * 100
    except:
        return 0.0

# ============================================================================
# RETRIEVAL QUALITY METRICS
# ============================================================================

def calculate_recall_at_k(reference_german, retrieved_examples, k_values=[1, 3, 5, 10]):
    """
    Calculate Recall@K - checks if reference appears in top K results

    Args:
        reference_german: The ground truth German translation
        retrieved_examples: List of retrieved examples from RAG
        k_values: List of K values to calculate recall for

    Returns:
        Dict with Recall@K for each K
    """
    recalls = {}

    for k in k_values:
        found = False
        for i, example in enumerate(retrieved_examples[:k]):
            if i >= len(retrieved_examples):
                break
            retrieved_german = example['payload']['translation_de']
            # Check if the reference matches (exact or high similarity)
            if reference_german.strip().lower() == retrieved_german.strip().lower():
                found = True
                break

        recalls[f'recall@{k}'] = 100.0 if found else 0.0

    return recalls

def calculate_mrr(reference_german, retrieved_examples):
    """
    Calculate Mean Reciprocal Rank (MRR)

    MRR = 1 / rank of first relevant result
    If no relevant result found, MRR = 0

    Args:
        reference_german: The ground truth German translation
        retrieved_examples: List of retrieved examples from RAG

    Returns:
        MRR score (0-100)
    """
    for i, example in enumerate(retrieved_examples):
        retrieved_german = example['payload']['translation_de']
        # Check if this is a relevant result
        if reference_german.strip().lower() == retrieved_german.strip().lower():
            # Rank starts at 1, not 0
            mrr = 1.0 / (i + 1)
            return mrr * 100  # Convert to percentage

    # No relevant result found
    return 0.0

def calculate_average_retrieval_score(retrieved_examples, top_k=10):
    """
    Calculate average retrieval score from top K results

    Args:
        retrieved_examples: List of retrieved examples with scores
        top_k: Number of top results to consider

    Returns:
        Average RRF score (0-100)
    """
    if not retrieved_examples:
        return 0.0

    scores = [example['rrf_score'] for example in retrieved_examples[:top_k]]
    avg_score = np.mean(scores) if scores else 0.0
    return avg_score * 100  # Convert to percentage

print("✅ All evaluation metrics defined!")


✅ All evaluation metrics defined!


In [27]:
## 🔄 Part 21: Process RAG Test Set with ALL Metrics

print("\n" + "="*70)
print("🚀 PROCESSING RAG TEST SET WITH ALL METRICS")
print("="*70)

# Load test set
print("\n📥 Loading test set...")
df_test = pd.read_csv('tla_test_set.csv')
print(f"✅ Loaded {len(df_test)} test records")

# Initialize results storage
test_results = []
failed_translations = []

print(f"\n🔄 Processing {len(df_test)} test samples...")
print("⏱️ Estimated time: ~{:.1f} minutes\n".format(len(df_test) * 3 / 60))

# Process each test sample
for idx in tqdm(range(len(df_test)), desc="RAG Translation"):
    try:
        # Get query
        query_original = df_test.iloc[idx]['transliteration']
        reference_german = df_test.iloc[idx]['translation']

        # Translate Egyptian → German → English using RAG
        result = translate_egyptian_to_english(
            query_original=query_original,
            show_details=False
        )

        if result['success']:
            # Translate reference German → English
            reference_english = translate_german_to_english(reference_german)

            if reference_english:
                # Store results (including retrieval info)
                test_results.append({
                    'sample_id': idx,
                    'transliteration': query_original,
                    'transliteration_normalized': result['query_normalized'],
                    'reference_german': reference_german,
                    'reference_english': reference_english,
                    'predicted_german': result['german'],
                    'predicted_english': result['english'],
                    'top_matches': result['top_matches']  # Store retrieval results
                })
            else:
                failed_translations.append({
                    'sample_id': idx,
                    'reason': 'Reference translation to English failed'
                })
        else:
            failed_translations.append({
                'sample_id': idx,
                'reason': result.get('error', 'RAG translation failed')
            })

    except Exception as e:
        failed_translations.append({
            'sample_id': idx,
            'reason': f'Exception: {str(e)}'
        })
        continue

# Create results DataFrame
df_test_results = pd.DataFrame(test_results)

print(f"\n✅ RAG Processing complete!")
print(f"   Successful: {len(test_results)}")
print(f"   Failed: {len(failed_translations)}")


🚀 PROCESSING RAG TEST SET WITH ALL METRICS

📥 Loading test set...
✅ Loaded 91 test records

🔄 Processing 91 test samples...
⏱️ Estimated time: ~4.5 minutes



RAG Translation:   0%|          | 0/91 [00:00<?, ?it/s]

❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=240)

✅ RAG Processing complete!
   Successful: 90
   Failed: 1


In [28]:
## 📊 Part 22: Calculate ALL Metrics (Translation + Retrieval)

print("\n" + "="*70)
print("📊 CALCULATING ALL METRICS FOR RAG SYSTEM")
print("="*70)

metrics_list = []

print(f"\n🔄 Computing all metrics for {len(df_test_results)} translations...\n")

for idx, row in tqdm(df_test_results.iterrows(), total=len(df_test_results), desc="Computing all metrics"):
    reference_english = row['reference_english']
    hypothesis_english = row['predicted_english']
    reference_german = row['reference_german']
    retrieved_examples = row['top_matches']

    # Calculate translation quality metrics
    rouge_scores = calculate_rouge(reference_english, hypothesis_english)

    # Calculate retrieval quality metrics
    recall_scores = calculate_recall_at_k(reference_german, retrieved_examples, k_values=[1, 3, 5, 10, 20])
    mrr_score = calculate_mrr(reference_german, retrieved_examples)
    avg_retrieval_score = calculate_average_retrieval_score(retrieved_examples, top_k=10)

    # Combine all metrics
    metrics = {
        'sample_id': row['sample_id'],
        # Translation Quality Metrics
        'bleu': calculate_bleu(reference_english, hypothesis_english),
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'meteor': calculate_meteor(reference_english, hypothesis_english),
        'chrf': calculate_chrf(reference_english, hypothesis_english),
        'exact_match': calculate_exact_match(reference_english, hypothesis_english),
        'word_overlap': calculate_word_overlap(reference_english, hypothesis_english),
        # Retrieval Quality Metrics
        'recall@1': recall_scores['recall@1'],
        'recall@3': recall_scores['recall@3'],
        'recall@5': recall_scores['recall@5'],
        'recall@10': recall_scores['recall@10'],
        'recall@20': recall_scores['recall@20'],
        'mrr': mrr_score,
        'avg_retrieval_score': avg_retrieval_score
    }

    metrics_list.append(metrics)

# Create metrics DataFrame
df_metrics = pd.DataFrame(metrics_list)

print("✅ All metrics calculation complete!")


📊 CALCULATING ALL METRICS FOR RAG SYSTEM

🔄 Computing all metrics for 90 translations...



Computing all metrics:   0%|          | 0/90 [00:00<?, ?it/s]

✅ All metrics calculation complete!


In [29]:

## 📈 Part 23: Display Complete RAG System Summary

print("\n" + "="*70)
print("📈 COMPLETE RAG SYSTEM EVALUATION SUMMARY")
print("="*70)

# Calculate averages for all metrics
avg_metrics = {
    # Translation Quality
    'BLEU': df_metrics['bleu'].mean(),
    'ROUGE-1': df_metrics['rouge1'].mean(),
    'ROUGE-2': df_metrics['rouge2'].mean(),
    'ROUGE-L': df_metrics['rougeL'].mean(),
    'METEOR': df_metrics['meteor'].mean(),
    'chrF': df_metrics['chrf'].mean(),
    'Exact Match': df_metrics['exact_match'].mean(),
    'Word Overlap': df_metrics['word_overlap'].mean(),
    # Retrieval Quality
    'Recall@1': df_metrics['recall@1'].mean(),
    'Recall@3': df_metrics['recall@3'].mean(),
    'Recall@5': df_metrics['recall@5'].mean(),
    'Recall@10': df_metrics['recall@10'].mean(),
    'Recall@20': df_metrics['recall@20'].mean(),
    'MRR': df_metrics['mrr'].mean(),
    'Avg Retrieval Score': df_metrics['avg_retrieval_score'].mean()
}

# Quality emoji function
def get_quality_emoji(metric_name, score):
    """Get quality emoji based on metric and score"""
    if metric_name in ['Recall@1', 'Recall@3', 'Exact Match']:
        return '🟢' if score > 20 else '🟡' if score > 5 else '🔴'
    elif 'Recall' in metric_name:
        return '🟢' if score > 40 else '🟡' if score > 20 else '🔴'
    elif metric_name == 'MRR':
        return '🟢' if score > 30 else '🟡' if score > 15 else '🔴'
    else:
        return '🟢' if score > 50 else '🟡' if score > 30 else '🔴'

print("\n📊 TRANSLATION QUALITY METRICS:")
print("─" * 70)
translation_metrics = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR', 'chrF', 'Exact Match', 'Word Overlap']
for metric in translation_metrics:
    score = avg_metrics[metric]
    emoji = get_quality_emoji(metric, score)
    print(f"   {emoji} {metric:20s}: {score:6.2f}%")

print("\n📊 RETRIEVAL QUALITY METRICS:")
print("─" * 70)
retrieval_metrics = ['Recall@1', 'Recall@3', 'Recall@5', 'Recall@10', 'Recall@20', 'MRR', 'Avg Retrieval Score']
for metric in retrieval_metrics:
    score = avg_metrics[metric]
    emoji = get_quality_emoji(metric, score)
    print(f"   {emoji} {metric:20s}: {score:6.2f}%")

# Distribution statistics for key metrics
print("\n📈 KEY METRIC DISTRIBUTIONS:")
print("─" * 70)

for metric_name, metric_col in [('BLEU', 'bleu'), ('METEOR', 'meteor'), ('Recall@10', 'recall@10'), ('MRR', 'mrr')]:
    scores = df_metrics[metric_col]
    print(f"\n{metric_name}:")
    print(f"   Min:    {scores.min():6.2f}%")
    print(f"   25%:    {scores.quantile(0.25):6.2f}%")
    print(f"   Median: {scores.median():6.2f}%")
    print(f"   75%:    {scores.quantile(0.75):6.2f}%")
    print(f"   Max:    {scores.max():6.2f}%")
    print(f"   Std:    {scores.std():6.2f}%")

print("\n" + "="*70)



📈 COMPLETE RAG SYSTEM EVALUATION SUMMARY

📊 TRANSLATION QUALITY METRICS:
──────────────────────────────────────────────────────────────────────
   🔴 BLEU                :  23.36%
   🟢 ROUGE-1             :  53.35%
   🟡 ROUGE-2             :  36.24%
   🟢 ROUGE-L             :  51.65%
   🟡 METEOR              :  39.27%
   🟡 chrF                :  45.43%
   🟡 Exact Match         :   8.89%
   🟡 Word Overlap        :  44.00%

📊 RETRIEVAL QUALITY METRICS:
──────────────────────────────────────────────────────────────────────
   🔴 Recall@1            :   2.22%
   🔴 Recall@3            :   2.22%
   🔴 Recall@5            :   2.22%
   🔴 Recall@10           :   2.22%
   🔴 Recall@20           :   2.22%
   🔴 MRR                 :   2.22%
   🔴 Avg Retrieval Score :   2.93%

📈 KEY METRIC DISTRIBUTIONS:
──────────────────────────────────────────────────────────────────────

BLEU:
   Min:      0.00%
   25%:      2.44%
   Median:   8.02%
   75%:     36.31%
   Max:    100.00%
   Std:     29.02%

METEOR:

In [30]:
## 📊 Part 24: Visual Comparison Charts

print("\n" + "="*70)
print("📊 VISUAL METRIC COMPARISON")
print("="*70)

# Translation metrics
print("\n🎯 Translation Quality:")
print("─" * 70)
for metric, score in list(avg_metrics.items())[:8]:
    bar_length = int((score / 100) * 50)
    bar = '█' * bar_length
    print(f"{metric:20s} {bar} {score:6.2f}%")

# Retrieval metrics
print("\n🔍 Retrieval Quality:")
print("─" * 70)
for metric, score in list(avg_metrics.items())[8:]:
    bar_length = int((score / 100) * 50)
    bar = '█' * bar_length
    print(f"{metric:20s} {bar} {score:6.2f}%")

print("\n" + "="*70)


📊 VISUAL METRIC COMPARISON

🎯 Translation Quality:
──────────────────────────────────────────────────────────────────────
BLEU                 ███████████  23.36%
ROUGE-1              ██████████████████████████  53.35%
ROUGE-2              ██████████████████  36.24%
ROUGE-L              █████████████████████████  51.65%
METEOR               ███████████████████  39.27%
chrF                 ██████████████████████  45.43%
Exact Match          ████   8.89%
Word Overlap         ██████████████████████  44.00%

🔍 Retrieval Quality:
──────────────────────────────────────────────────────────────────────
Recall@1             █   2.22%
Recall@3             █   2.22%
Recall@5             █   2.22%
Recall@10            █   2.22%
Recall@20            █   2.22%
MRR                  █   2.22%
Avg Retrieval Score  █   2.93%



In [31]:
## 📊 Part 25: Process LLM-Only Test Set

print("\n" + "="*70)
print("🚀 PROCESSING LLM-ONLY TEST SET")
print("="*70)

# LLM-only translation function (simplified)

def translate_with_llm_only(query_original, query_normalized):
    """
    Direct LLM translation WITHOUT RAG retrieval
    Only uses the LLM's knowledge
    """

    # Simple prompt without retrieved examples
    prompt = f"""You are an expert linguist specialized in Earlier Egyptian grammar and historical translation.

Translate the following Earlier Egyptian transliteration into German.
Use a conservative, grammar-based interpretation.
Do not modernize meanings or add implied words.

Egyptian Transliteration:
{query_original}

Output ONLY the German translation.
Do not add explanations, comments, or alternative readings.

German Translation:
"""

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OLLAMA_API_KEY}"
        }

        payload = {
            "model": LLM_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert Ancient Egyptian linguist. Translate Earlier Egyptian to German."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": False
        }

        response = requests.post(
            OLLAMA_API_URL,
            headers=headers,
            json=payload,
            timeout=240
        )

        if response.status_code == 200:
            result = response.json()
            llm_output = result['message']['content']

            # Clean the response
            german_translation = llm_output.strip()
            # Remove "German Translation:" prefix if present
            german_translation = re.sub(r'^German Translation:\s*', '', german_translation, flags=re.IGNORECASE)

            return german_translation.strip()
        else:
            print(f"❌ API Error: {response.status_code}")
            return None

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return None

print("✅ LLM-only translation function ready!")

# Process LLM-only
llm_only_results = []
llm_only_failed = []

print(f"\n🔄 Processing {len(df_test)} samples with LLM-only...")

for idx in tqdm(range(len(df_test)), desc="LLM-only translation"):
    try:
        query_original = df_test.iloc[idx]['transliteration']
        query_normalized = normalize_transliteration(query_original)
        reference_german = df_test.iloc[idx]['translation']

        german_translation = translate_with_llm_only(query_original, query_normalized)

        if german_translation:
            english_translation = translate_german_to_english(german_translation)

            if english_translation:
                # Get reference English from RAG results
                if idx < len(df_test_results):
                    reference_english = df_test_results.iloc[idx]['reference_english']
                else:
                    reference_english = translate_german_to_english(reference_german)

                llm_only_results.append({
                    'sample_id': idx,
                    'transliteration': query_original,
                    'reference_german': reference_german,
                    'reference_english': reference_english,
                    'predicted_german_llm': german_translation,
                    'predicted_english_llm': english_translation
                })
            else:
                llm_only_failed.append({'sample_id': idx, 'reason': 'English translation failed'})
        else:
            llm_only_failed.append({'sample_id': idx, 'reason': 'LLM translation failed'})

    except Exception as e:
        llm_only_failed.append({'sample_id': idx, 'reason': f'Exception: {str(e)}'})
        continue

df_llm_only = pd.DataFrame(llm_only_results)

print(f"\n✅ LLM-only processing complete!")
print(f"   Successful: {len(llm_only_results)}")
print(f"   Failed: {len(llm_only_failed)}")


🚀 PROCESSING LLM-ONLY TEST SET
✅ LLM-only translation function ready!

🔄 Processing 91 samples with LLM-only...


LLM-only translation:   0%|          | 0/91 [00:00<?, ?it/s]

❌ API Error: 503
❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=240)

✅ LLM-only processing complete!
   Successful: 89
   Failed: 2


In [32]:
## 🏆 Part 26: Calculate Metrics for LLM-Only

print("\n" + "="*70)
print("📊 CALCULATING METRICS FOR LLM-ONLY")
print("="*70)

llm_only_metrics = []

print(f"\n🔄 Computing metrics for {len(df_llm_only)} LLM-only translations...\n")

for idx, row in tqdm(df_llm_only.iterrows(), total=len(df_llm_only), desc="Computing LLM-only metrics"):
    reference = row['reference_english']
    hypothesis = row['predicted_english_llm']

    rouge_scores = calculate_rouge(reference, hypothesis)

    # LLM-only has NO retrieval metrics (all 0)
    metrics = {
        'sample_id': row['sample_id'],
        # Translation Quality Metrics
        'bleu': calculate_bleu(reference, hypothesis),
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'meteor': calculate_meteor(reference, hypothesis),
        'chrf': calculate_chrf(reference, hypothesis),
        'exact_match': calculate_exact_match(reference, hypothesis),
        'word_overlap': calculate_word_overlap(reference, hypothesis),
        # Retrieval metrics = 0 (no retrieval)
        'recall@1': 0.0,
        'recall@3': 0.0,
        'recall@5': 0.0,
        'recall@10': 0.0,
        'recall@20': 0.0,
        'mrr': 0.0,
        'avg_retrieval_score': 0.0
    }

    llm_only_metrics.append(metrics)

df_llm_only_metrics = pd.DataFrame(llm_only_metrics)

print("✅ LLM-only metrics calculation complete!")


📊 CALCULATING METRICS FOR LLM-ONLY

🔄 Computing metrics for 89 LLM-only translations...



Computing LLM-only metrics:   0%|          | 0/89 [00:00<?, ?it/s]

✅ LLM-only metrics calculation complete!


In [33]:
##  Part 27: COMPREHENSIVE COMPARISON

print("\n" + "="*70)
print("🆚 COMPREHENSIVE RAG vs LLM-ONLY COMPARISON")
print("="*70)

# Calculate averages for LLM-only
llm_only_averages = {
    'BLEU': df_llm_only_metrics['bleu'].mean(),
    'ROUGE-1': df_llm_only_metrics['rouge1'].mean(),
    'ROUGE-2': df_llm_only_metrics['rouge2'].mean(),
    'ROUGE-L': df_llm_only_metrics['rougeL'].mean(),
    'METEOR': df_llm_only_metrics['meteor'].mean(),
    'chrF': df_llm_only_metrics['chrf'].mean(),
    'Exact Match': df_llm_only_metrics['exact_match'].mean(),
    'Word Overlap': df_llm_only_metrics['word_overlap'].mean(),
    'Recall@1': 0.0,
    'Recall@3': 0.0,
    'Recall@5': 0.0,
    'Recall@10': 0.0,
    'Recall@20': 0.0,
    'MRR': 0.0,
    'Avg Retrieval Score': 0.0
}

print("\n📊 COMPLETE METRICS COMPARISON:")
print("─" * 100)
print(f"{'Metric':<25} {'RAG System':<15} {'LLM-Only':<15} {'Difference':<15} {'Winner':<15}")
print("─" * 100)

for metric in avg_metrics.keys():
    rag_score = avg_metrics[metric]
    llm_score = llm_only_averages[metric]
    diff = rag_score - llm_score

    if 'Recall' in metric or metric == 'MRR' or metric == 'Avg Retrieval Score':
        winner = "🏆 RAG (only)" if diff > 0 else "N/A"
    else:
        winner = "🏆 RAG" if diff > 0 else "🏆 LLM" if diff < 0 else "🤝 Tie"

    print(f"{metric:<25} {rag_score:>6.2f}%        {llm_score:>6.2f}%        {diff:>+6.2f}%       {winner:<15}")

print("─" * 100)



🆚 COMPREHENSIVE RAG vs LLM-ONLY COMPARISON

📊 COMPLETE METRICS COMPARISON:
────────────────────────────────────────────────────────────────────────────────────────────────────
Metric                    RAG System      LLM-Only        Difference      Winner         
────────────────────────────────────────────────────────────────────────────────────────────────────
BLEU                       23.36%          2.78%        +20.57%       🏆 RAG          
ROUGE-1                    53.35%         19.46%        +33.88%       🏆 RAG          
ROUGE-2                    36.24%          5.07%        +31.17%       🏆 RAG          
ROUGE-L                    51.65%         17.93%        +33.72%       🏆 RAG          
METEOR                     39.27%         11.54%        +27.73%       🏆 RAG          
chrF                       45.43%         15.32%        +30.11%       🏆 RAG          
Exact Match                 8.89%          0.00%         +8.89%       🏆 RAG          
Word Overlap               44.

In [34]:
## 📦 Part 29: Detailed Comparison Analysis

print("\n" + "="*70)
print("📈 DETAILED COMPARISON ANALYSIS")
print("="*70)

# Statistical comparison
comparison_stats = []

for metric_col in ['bleu', 'rouge1', 'meteor', 'chrf', 'recall@10', 'mrr']:
    rag_scores = df_metrics[metric_col].values
    llm_scores = df_llm_only_metrics[metric_col].values

    stats = {
        'Metric': metric_col.upper(),
        'RAG_Mean': rag_scores.mean(),
        'RAG_Median': np.median(rag_scores),
        'RAG_Std': rag_scores.std(),
        'LLM_Mean': llm_scores.mean(),
        'LLM_Median': np.median(llm_scores),
        'LLM_Std': llm_scores.std(),
        'Mean_Diff': rag_scores.mean() - llm_scores.mean()
    }

    comparison_stats.append(stats)

df_comparison_stats = pd.DataFrame(comparison_stats)

print("\n📊 Statistical Summary:")
print("─" * 100)
print(df_comparison_stats.to_string(index=False))
print("─" * 100)


📈 DETAILED COMPARISON ANALYSIS

📊 Statistical Summary:
────────────────────────────────────────────────────────────────────────────────────────────────────
   Metric  RAG_Mean  RAG_Median   RAG_Std  LLM_Mean  LLM_Median   LLM_Std  Mean_Diff
     BLEU 23.356350    8.018785 28.859505  2.781497    1.636204  5.018270  20.574853
   ROUGE1 53.345616   50.000000 28.841781 19.461966   16.000000 16.980133  33.883649
   METEOR 39.271824   32.159388 31.958111 11.537178    8.064516 12.845902  27.734646
     CHRF 45.432234   38.671498 30.373582 15.323039   12.463133 11.417125  30.109196
RECALL@10  2.222222    0.000000 14.740555  0.000000    0.000000  0.000000   2.222222
      MRR  2.222222    0.000000 14.740555  0.000000    0.000000  0.000000   2.222222
────────────────────────────────────────────────────────────────────────────────────────────────────


In [35]:
## 📊 Part 30: Win/Loss Analysis

print("\n" + "="*70)
print("🏆 SAMPLE-BY-SAMPLE WIN/LOSS ANALYSIS")
print("="*70)

wins = {'RAG': 0, 'LLM': 0, 'Tie': 0}

for metric in ['bleu', 'meteor', 'chrf']:
    metric_wins = {'RAG': 0, 'LLM': 0, 'Tie': 0}

    for i in range(min(len(df_metrics), len(df_llm_only_metrics))):
        rag_score = df_metrics.iloc[i][metric]
        llm_score = df_llm_only_metrics.iloc[i][metric]

        if rag_score > llm_score:
            metric_wins['RAG'] += 1
            wins['RAG'] += 1
        elif llm_score > rag_score:
            metric_wins['LLM'] += 1
            wins['LLM'] += 1
        else:
            metric_wins['Tie'] += 1
            wins['Tie'] += 1

    total = sum(metric_wins.values())
    print(f"\n{metric.upper()} Wins:")
    print(f"  RAG:      {metric_wins['RAG']:3d} ({metric_wins['RAG']/total*100:5.1f}%)")
    print(f"  LLM-Only: {metric_wins['LLM']:3d} ({metric_wins['LLM']/total*100:5.1f}%)")
    print(f"  Tie:      {metric_wins['Tie']:3d} ({metric_wins['Tie']/total*100:5.1f}%)")

total_comparisons = sum(wins.values())
print(f"\n{'─' * 70}")
print(f"Overall Wins (across all metrics):")
print(f"  🏆 RAG:      {wins['RAG']:3d} ({wins['RAG']/total_comparisons*100:5.1f}%)")
print(f"  🏆 LLM-Only: {wins['LLM']:3d} ({wins['LLM']/total_comparisons*100:5.1f}%)")
print(f"  🤝 Tie:      {wins['Tie']:3d} ({wins['Tie']/total_comparisons*100:5.1f}%)")
print("=" * 70)

## 💾 Part 32: Save All Results

print("\n" + "="*70)
print("💾 SAVING ALL RESULTS")
print("="*70)

# Save RAG comprehensive results
df_test_results_full = df_test_results.copy()
for metric in df_metrics.columns:
    if metric != 'sample_id':
        df_test_results_full[metric] = df_metrics[metric].values

df_test_results_full.to_csv('rag_comprehensive_results.csv', index=False)
print(f"✅ RAG comprehensive results saved to: rag_comprehensive_results.csv")

# Save LLM-only comprehensive results
df_llm_only_full = df_llm_only.copy()
for metric in df_llm_only_metrics.columns:
    if metric != 'sample_id':
        df_llm_only_full[metric] = df_llm_only_metrics[metric].values

df_llm_only_full.to_csv('llm_only_comprehensive_results.csv', index=False)
print(f"✅ LLM-only comprehensive results saved to: llm_only_comprehensive_results.csv")

# Save comparison summary
comparison_summary = pd.DataFrame([
    {'System': 'RAG', **{f'{k}': v for k, v in avg_metrics.items()}},
    {'System': 'LLM-Only', **{f'{k}': v for k, v in llm_only_averages.items()}},
    {'System': 'Difference (RAG - LLM)', **{f'{k}': avg_metrics[k] - llm_only_averages[k] for k in avg_metrics.keys()}}
])

comparison_summary.to_csv('complete_comparison_summary.csv', index=False)
print(f"✅ Comparison summary saved to: complete_comparison_summary.csv")

# Save statistical comparison
df_comparison_stats.to_csv('comparison_statistics.csv', index=False)
print(f"✅ Statistics saved to: comparison_statistics.csv")

print("\n" + "="*70)

## ✅ Part 33: FINAL COMPREHENSIVE REPORT

print("\n" + "="*70)
print("✅ FINAL COMPREHENSIVE EVALUATION REPORT")
print("="*70)

# Determine overall winner
rag_total = sum(avg_metrics.values())
llm_total = sum(llm_only_averages.values())
overall_winner = "RAG System 🏆" if rag_total > llm_total else "LLM-Only 🏆" if llm_total > rag_total else "Tie 🤝"

print(f"""
📊 System Performance Summary:

════════════════════════════════════════════════════════════════
RAG SYSTEM (Hybrid Search + LLM):
════════════════════════════════════════════════════════════════
Total samples: {len(df_metrics)}

Translation Quality:
  • BLEU:         {avg_metrics['BLEU']:.2f}%
  • METEOR:       {avg_metrics['METEOR']:.2f}%
  • chrF:         {avg_metrics['chrF']:.2f}%
  • ROUGE-1:      {avg_metrics['ROUGE-1']:.2f}%
  • Exact Match:  {avg_metrics['Exact Match']:.2f}%

Retrieval Quality:
  • Recall@1:     {avg_metrics['Recall@1']:.2f}%
  • Recall@10:    {avg_metrics['Recall@10']:.2f}%
  • MRR:          {avg_metrics['MRR']:.2f}%
  • Avg Score:    {avg_metrics['Avg Retrieval Score']:.2f}%

════════════════════════════════════════════════════════════════
LLM-ONLY (No RAG):
════════════════════════════════════════════════════════════════
Total samples: {len(df_llm_only_metrics)}

Translation Quality:
  • BLEU:         {llm_only_averages['BLEU']:.2f}%
  • METEOR:       {llm_only_averages['METEOR']:.2f}%
  • chrF:         {llm_only_averages['chrF']:.2f}%
  • ROUGE-1:      {llm_only_averages['ROUGE-1']:.2f}%
  • Exact Match:  {llm_only_averages['Exact Match']:.2f}%

════════════════════════════════════════════════════════════════
COMPARISON RESULTS:
════════════════════════════════════════════════════════════════
🏆 Overall Winner: {overall_winner}

Translation Improvement (RAG - LLM):
  • BLEU:    {avg_metrics['BLEU'] - llm_only_averages['BLEU']:+.2f}%
  • METEOR:  {avg_metrics['METEOR'] - llm_only_averages['METEOR']:+.2f}%
  • chrF:    {avg_metrics['chrF'] - llm_only_averages['chrF']:+.2f}%

Sample-wise Wins:
  • RAG wins:      {wins['RAG']} ({wins['RAG']/total_comparisons*100:.1f}%)
  • LLM-only wins: {wins['LLM']} ({wins['LLM']/total_comparisons*100:.1f}%)

Retrieval Effectiveness:
  • Recall@10:     {avg_metrics['Recall@10']:.1f}% (RAG finds relevant in top 10)
  • MRR:           {avg_metrics['MRR']:.1f}% (Average rank quality)

════════════════════════════════════════════════════════════════
CONCLUSIONS:
════════════════════════════════════════════════════════════════
{"✅ The RAG system significantly outperforms LLM-only approach." if avg_metrics['BLEU'] > llm_only_averages['BLEU'] + 5 else
 "🟡 The RAG system shows moderate improvement over LLM-only." if avg_metrics['BLEU'] > llm_only_averages['BLEU'] else
 "❌ The LLM-only approach performs comparably or better than RAG."}

{"✅ Retrieval quality is good (Recall@10 > 40%)." if avg_metrics['Recall@10'] > 40 else
 "🟡 Retrieval quality is moderate (Recall@10 20-40%)." if avg_metrics['Recall@10'] > 20 else
 "❌ Retrieval quality needs improvement (Recall@10 < 20%)."}

📁 Output Files:
1. rag_comprehensive_results.csv - RAG results with all metrics
2. llm_only_comprehensive_results.csv - LLM-only results with all metrics
3. complete_comparison_summary.csv - Full comparison table
4. comparison_statistics.csv - Detailed statistical analysis
""")

print("="*70)
print("\n🎉 Complete evaluation finished!")
print("="*70 + "\n")



🏆 SAMPLE-BY-SAMPLE WIN/LOSS ANALYSIS

BLEU Wins:
  RAG:       67 ( 75.3%)
  LLM-Only:  16 ( 18.0%)
  Tie:        6 (  6.7%)

METEOR Wins:
  RAG:       66 ( 74.2%)
  LLM-Only:  17 ( 19.1%)
  Tie:        6 (  6.7%)

CHRF Wins:
  RAG:       78 ( 87.6%)
  LLM-Only:  10 ( 11.2%)
  Tie:        1 (  1.1%)

──────────────────────────────────────────────────────────────────────
Overall Wins (across all metrics):
  🏆 RAG:      211 ( 79.0%)
  🏆 LLM-Only:  43 ( 16.1%)
  🤝 Tie:       13 (  4.9%)

💾 SAVING ALL RESULTS
✅ RAG comprehensive results saved to: rag_comprehensive_results.csv
✅ LLM-only comprehensive results saved to: llm_only_comprehensive_results.csv
✅ Comparison summary saved to: complete_comparison_summary.csv
✅ Statistics saved to: comparison_statistics.csv


✅ FINAL COMPREHENSIVE EVALUATION REPORT

📊 System Performance Summary:

════════════════════════════════════════════════════════════════
RAG SYSTEM (Hybrid Search + LLM):
══════════════════════════════════════════════════════════

# ═══════════════════════════════════════════════════════════════════
# 🔬 K-VALUE OPTIMIZATION ANALYSIS
# Find the optimal TOP_K_RESULTS value by testing a single sentence
# ═══════════════════════════════════════════════════════════════════

In [None]:
# ═══════════════════════════════════════════════════════════════════
# 🔬 K-VALUE OPTIMIZATION ANALYSIS
# Find the optimal TOP_K_RESULTS value by testing a single sentence
# ═══════════════════════════════════════════════════════════════════

## 🎯 Part XX: K-Value Optimization for Single Sample

print("\n" + "="*70)
print("🔬 K-VALUE OPTIMIZATION ANALYSIS")
print("="*70)

# Select a test sample
print("\n📝 Selecting test sample...")
test_sample_idx = 0  # You can change this to test different samples
test_query_original = df_test.iloc[test_sample_idx]['transliteration']
test_reference_german = df_test.iloc[test_sample_idx]['translation']

print(f"✅ Test sample selected:")
print(f"   Egyptian: {test_query_original}")
print(f"   Reference (DE): {test_reference_german}")

# Translate reference to English (do this once)
print(f"\n📖 Translating reference to English...")
test_reference_english = translate_german_to_english(test_reference_german)
print(f"   Reference (EN): {test_reference_english}")

# Normalize query
test_query_normalized = normalize_transliteration(test_query_original)
print(f"   Normalized: {test_query_normalized}")

# Generate embedding (do this once)
print(f"\n🔢 Generating embedding...")
test_query_embedding = embedding_model.encode(
    test_query_normalized,
    normalize_embeddings=True
).tolist()
print(f"   ✅ Embedding generated (dim={len(test_query_embedding)})")

# Define K values to test
k_values = list(range(5, 205, 5))  # K = 5, 10, 15, 20, ..., 200
print(f"\n📊 Testing K values: {k_values[0]} to {k_values[-1]} (step=5)")
print(f"   Total tests: {len(k_values)}")
print(f"   Estimated time: ~{len(k_values) * 2 / 60:.1f} minutes\n")

# Storage for results
k_optimization_results = []

# Test each K value
for k in tqdm(k_values, desc="Testing K values"):
    try:
        # Step 1: Retrieve top-K examples
        search_results = hybrid_search(
            query_text=test_query_normalized,
            query_embedding=test_query_embedding,
            top_k=k
        )

        # Step 2: Translate with LLM using these K examples
        german_translation, llm_output = translate_with_llm(
            query_original=test_query_original,
            query_normalized=test_query_normalized,
            retrieved_examples=search_results
        )

        if not german_translation:
            print(f"⚠️ Translation failed for K={k}")
            continue

        # Step 3: Translate German to English
        english_translation = translate_german_to_english(german_translation)

        if not english_translation:
            print(f"⚠️ English translation failed for K={k}")
            continue

        # Step 4: Calculate all metrics
        rouge_scores = calculate_rouge(test_reference_english, english_translation)
        recall_scores = calculate_recall_at_k(
            test_reference_german,
            search_results,
            k_values=[1, 3, 5, 10, 20, 50, 100] if k >= 100 else [1, 3, 5, 10, 20, 50]
        )

        # Store results
        result = {
            'k': k,
            # Translation Quality Metrics
            'bleu': calculate_bleu(test_reference_english, english_translation),
            'rouge1': rouge_scores['rouge1'],
            'rouge2': rouge_scores['rouge2'],
            'rougeL': rouge_scores['rougeL'],
            'meteor': calculate_meteor(test_reference_english, english_translation),
            'chrf': calculate_chrf(test_reference_english, english_translation),
            'exact_match': calculate_exact_match(test_reference_english, english_translation),
            'word_overlap': calculate_word_overlap(test_reference_english, english_translation),
            # Retrieval Quality Metrics
            'recall@1': recall_scores.get('recall@1', 0.0),
            'recall@3': recall_scores.get('recall@3', 0.0),
            'recall@5': recall_scores.get('recall@5', 0.0),
            'recall@10': recall_scores.get('recall@10', 0.0),
            'recall@20': recall_scores.get('recall@20', 0.0),
            'recall@50': recall_scores.get('recall@50', 0.0),
            'recall@100': recall_scores.get('recall@100', 0.0) if k >= 100 else 0.0,
            'mrr': calculate_mrr(test_reference_german, search_results),
            'avg_retrieval_score': calculate_average_retrieval_score(search_results, top_k=min(k, 10)),
            # Translations
            'predicted_german': german_translation,
            'predicted_english': english_translation
        }

        k_optimization_results.append(result)

    except Exception as e:
        print(f"❌ Error at K={k}: {e}")
        continue

# Create results DataFrame
df_k_optimization = pd.DataFrame(k_optimization_results)

print(f"\n✅ K-value testing complete!")
print(f"   Successful tests: {len(df_k_optimization)}/{len(k_values)}")

## 📊 Part XX+1: Analysis and Visualization

print("\n" + "="*70)
print("📊 K-VALUE OPTIMIZATION RESULTS")
print("="*70)

# Find best K for each metric
best_k_results = {}
metrics_to_analyze = ['bleu', 'rouge1', 'meteor', 'chrf', 'recall@10', 'mrr']

print("\n🏆 BEST K VALUE FOR EACH METRIC:")
print("─" * 70)

for metric in metrics_to_analyze:
    best_idx = df_k_optimization[metric].idxmax()
    best_k = df_k_optimization.iloc[best_idx]['k']
    best_score = df_k_optimization.iloc[best_idx][metric]

    best_k_results[metric] = {
        'k': best_k,
        'score': best_score
    }

    print(f"   {metric.upper():15s}: K={best_k:3.0f} → Score={best_score:6.2f}%")

# Overall best K (average across all metrics)
print("\n🎯 OVERALL BEST K (Average Performance):")
print("─" * 70)

# Normalize metrics to 0-100 scale and average
df_k_optimization['avg_score'] = df_k_optimization[metrics_to_analyze].mean(axis=1)
best_overall_idx = df_k_optimization['avg_score'].idxmax()
best_overall_k = df_k_optimization.iloc[best_overall_idx]['k']
best_overall_score = df_k_optimization.iloc[best_overall_idx]['avg_score']

print(f"   Best K: {best_overall_k}")
print(f"   Average Score: {best_overall_score:.2f}%")
print(f"\n   Individual scores at K={best_overall_k}:")
for metric in metrics_to_analyze:
    score = df_k_optimization.iloc[best_overall_idx][metric]
    print(f"      {metric.upper():15s}: {score:6.2f}%")

## 📈 Part XX+2: Detailed Performance Analysis

print("\n" + "="*70)
print("📈 PERFORMANCE TRENDS ACROSS K VALUES")
print("="*70)

# Calculate statistics for each metric
print("\nMetric Performance Summary:")
print("─" * 90)
print(f"{'Metric':<15} {'Min K':<10} {'Max K':<10} {'Best K':<10} {'Min Score':<12} {'Max Score':<12} {'Variance':<10}")
print("─" * 90)

for metric in metrics_to_analyze:
    min_score = df_k_optimization[metric].min()
    max_score = df_k_optimization[metric].max()
    min_k = df_k_optimization.loc[df_k_optimization[metric].idxmin(), 'k']
    max_k = df_k_optimization.loc[df_k_optimization[metric].idxmax(), 'k']
    variance = df_k_optimization[metric].std()

    print(f"{metric.upper():<15} {min_k:<10.0f} {max_k:<10.0f} {best_k_results[metric]['k']:<10.0f} "
          f"{min_score:<12.2f} {max_score:<12.2f} {variance:<10.2f}")

print("─" * 90)

## 📉 Part XX+3: Visual Trend Charts

print("\n" + "="*70)
print("📉 METRIC TRENDS (Visual)")
print("="*70)

# Show trends for key metrics
for metric in ['bleu', 'meteor', 'chrf']:
    print(f"\n{metric.upper()} Score by K:")
    print("─" * 70)

    # Sample every 10 K values for readability
    sample_indices = df_k_optimization[df_k_optimization['k'] % 20 == 0].index

    for idx in sample_indices:
        k = df_k_optimization.loc[idx, 'k']
        score = df_k_optimization.loc[idx, metric]
        bar_length = int((score / 100) * 40)
        bar = '█' * bar_length

        # Mark if this is the best K
        marker = " 🏆 BEST" if k == best_k_results[metric]['k'] else ""

        print(f"  K={k:3.0f}: {bar} {score:5.2f}%{marker}")

## 🔍 Part XX+4: Translation Quality at Different K Values

print("\n" + "="*70)
print("🔍 TRANSLATION EXAMPLES AT KEY K VALUES")
print("="*70)

# Show translations at K = 5, 30, 100, 200 (if available)
example_k_values = [5, 30, 100, 200]

for k in example_k_values:
    if k in df_k_optimization['k'].values:
        row = df_k_optimization[df_k_optimization['k'] == k].iloc[0]

        print(f"\n{'='*70}")
        print(f"K = {k}")
        print(f"{'='*70}")
        print(f"Egyptian:     {test_query_original}")
        print(f"Reference:    {test_reference_english}")
        print(f"Predicted:    {row['predicted_english']}")
        print(f"\nScores:")
        print(f"  BLEU:   {row['bleu']:6.2f}%")
        print(f"  METEOR: {row['meteor']:6.2f}%")
        print(f"  chrF:   {row['chrf']:6.2f}%")
        print(f"  ROUGE-1:{row['rouge1']:6.2f}%")

## 💾 Part XX+5: Save K Optimization Results

print("\n" + "="*70)
print("💾 SAVING K OPTIMIZATION RESULTS")
print("="*70)

# Save detailed results
df_k_optimization.to_csv('k_optimization_detailed.csv', index=False)
print(f"✅ Detailed results saved to: k_optimization_detailed.csv")

# Save summary
summary_data = {
    'Metric': metrics_to_analyze,
    'Best_K': [best_k_results[m]['k'] for m in metrics_to_analyze],
    'Best_Score': [best_k_results[m]['score'] for m in metrics_to_analyze],
    'Min_Score': [df_k_optimization[m].min() for m in metrics_to_analyze],
    'Max_Score': [df_k_optimization[m].max() for m in metrics_to_analyze],
    'Avg_Score': [df_k_optimization[m].mean() for m in metrics_to_analyze],
    'Std_Score': [df_k_optimization[m].std() for m in metrics_to_analyze]
}

df_k_summary = pd.DataFrame(summary_data)
df_k_summary.to_csv('k_optimization_summary.csv', index=False)
print(f"✅ Summary saved to: k_optimization_summary.csv")

## ✅ Part XX+6: Recommendations

print("\n" + "="*70)
print("✅ K-VALUE OPTIMIZATION RECOMMENDATIONS")
print("="*70)

print(f"""
📊 Analysis Summary:
   • Test Sample: {test_query_original}
   • K Values Tested: {k_values[0]} to {k_values[-1]} (step=5)
   • Successful Tests: {len(df_k_optimization)}/{len(k_values)}

🏆 Optimal K Values by Metric:
   • BLEU:       K = {best_k_results['bleu']['k']} (Score: {best_k_results['bleu']['score']:.2f}%)
   • METEOR:     K = {best_k_results['meteor']['k']} (Score: {best_k_results['meteor']['score']:.2f}%)
   • chrF:       K = {best_k_results['chrf']['k']} (Score: {best_k_results['chrf']['score']:.2f}%)
   • ROUGE-1:    K = {best_k_results['rouge1']['k']} (Score: {best_k_results['rouge1']['score']:.2f}%)
   • Recall@10:  K = {best_k_results['recall@10']['k']} (Score: {best_k_results['recall@10']['score']:.2f}%)
   • MRR:        K = {best_k_results['mrr']['k']} (Score: {best_k_results['mrr']['score']:.2f}%)

🎯 OVERALL RECOMMENDATION:
   • Best Overall K: {best_overall_k}
   • Average Performance: {best_overall_score:.2f}%

💡 Interpretation:
""")

# Analyze trends
k_range = df_k_optimization['k'].max() - df_k_optimization['k'].min()
bleu_variance = df_k_optimization['bleu'].std()

if bleu_variance < 5:
    print("   ✅ Performance is STABLE across different K values")
    print(f"   → You can use any K between {k_values[0]} and {k_values[-1]}")
    print(f"   → Recommended: K = {best_overall_k} (best overall performance)")
elif bleu_variance < 15:
    print("   🟡 Performance VARIES moderately with K")
    print(f"   → Recommended: K = {best_overall_k} for best results")
    print(f"   → Alternative: K = {best_k_results['bleu']['k']} for highest BLEU")
else:
    print("   🔴 Performance is HIGHLY SENSITIVE to K")
    print(f"   → CRITICAL: Use K = {best_overall_k}")
    print(f"   → Avoid K < {k_values[int(len(k_values)*0.2)]} (low performance)")

# Performance trend analysis
if df_k_optimization['bleu'].iloc[-1] > df_k_optimization['bleu'].iloc[0]:
    print("\n   📈 Trend: Performance IMPROVES with larger K")
    print(f"   → Consider testing K > {k_values[-1]} for potential gains")
elif df_k_optimization['bleu'].iloc[-1] < df_k_optimization['bleu'].iloc[0]:
    print("\n   📉 Trend: Performance DECREASES with larger K")
    print(f"   → Use smaller K values (K < {best_overall_k})")
else:
    print("\n   ➡️ Trend: Performance is FLAT across K range")
    print(f"   → Use K = {k_values[int(len(k_values)*0.3)]} for efficiency")

print(f"""
📁 Output Files:
   1. k_optimization_detailed.csv - All K values with all metrics
   2. k_optimization_summary.csv - Summary statistics per metric

🚀 Next Steps:
   1. Update TOP_K_RESULTS = {best_overall_k} in your main code
   2. Re-run evaluation with optimized K value
   3. Compare performance improvement
""")

print("="*70)
print("\n🎉 K-value optimization analysis complete!")
print("="*70 + "\n")

## 📊 Part XX+7: Quick Performance Comparison

print("\n" + "="*70)
print("📊 PERFORMANCE COMPARISON: Current vs Optimal K")
print("="*70)

# Get current K performance
current_k = TOP_K_RESULTS
if current_k in df_k_optimization['k'].values:
    current_row = df_k_optimization[df_k_optimization['k'] == current_k].iloc[0]
    optimal_row = df_k_optimization.iloc[best_overall_idx]

    print(f"\nCurrent Configuration (K={current_k}):")
    print("─" * 70)
    for metric in metrics_to_analyze:
        current_score = current_row[metric]
        print(f"   {metric.upper():15s}: {current_score:6.2f}%")

    print(f"\nOptimal Configuration (K={best_overall_k}):")
    print("─" * 70)
    for metric in metrics_to_analyze:
        optimal_score = optimal_row[metric]
        diff = optimal_score - current_row[metric]
        arrow = "📈" if diff > 0 else "📉" if diff < 0 else "➡️"
        print(f"   {metric.upper():15s}: {optimal_score:6.2f}% {arrow} ({diff:+.2f}%)")

    improvement = optimal_row['avg_score'] - current_row['avg_score']
    if improvement > 5:
        print(f"\n✅ Switching to K={best_overall_k} will improve performance by {improvement:.2f}%")
    elif improvement > 0:
        print(f"\n🟡 Switching to K={best_overall_k} will slightly improve performance by {improvement:.2f}%")
    else:
        print(f"\n✅ Current K={current_k} is already optimal!")
else:
    print(f"\n⚠️ Current K={current_k} not tested in this analysis")
    print(f"   Recommended K: {best_overall_k}")

print("\n" + "="*70)


🔬 K-VALUE OPTIMIZATION ANALYSIS

📝 Selecting test sample...
✅ Test sample selected:
   Egyptian: smr-wꜥ.tꞽ ẖr(.ꞽ)-ḥ(ꜣ)b(.t) (ꞽ)m(.ꞽ)-r(ʾ)-(ꞽ)ꜥw ḥr(.w)-ḫwi̯=f
   Reference (DE): Der Einzige Freund, der Vorlesepriester, der Vorsteher der fremdsprachigen Truppe, Harchuf.

📖 Translating reference to English...
   Reference (EN): The only friend, the presiding priest, the leader of the foreign language troupe, Harchuf.
   Normalized: smr-wa.ti khr.i-hab.t im.i-rʾ-iaw hr.w-khwi

🔢 Generating embedding...
   ✅ Embedding generated (dim=1024)

📊 Testing K values: 5 to 200 (step=5)
   Total tests: 40
   Estimated time: ~1.3 minutes



Testing K values:   0%|          | 0/40 [00:00<?, ?it/s]


✅ K-value testing complete!
   Successful tests: 40/40

📊 K-VALUE OPTIMIZATION RESULTS

🏆 BEST K VALUE FOR EACH METRIC:
──────────────────────────────────────────────────────────────────────
   BLEU           : K= 15 → Score=100.00%
   ROUGE1         : K= 15 → Score=100.00%
   METEOR         : K= 15 → Score= 99.98%
   CHRF           : K= 15 → Score=100.00%
   RECALL@10      : K=  5 → Score=  0.00%
   MRR            : K=  5 → Score=  0.00%

🎯 OVERALL BEST K (Average Performance):
──────────────────────────────────────────────────────────────────────
   Best K: 15
   Average Score: 66.66%

   Individual scores at K=15:
      BLEU           : 100.00%
      ROUGE1         : 100.00%
      METEOR         :  99.98%
      CHRF           : 100.00%
      RECALL@10      :   0.00%
      MRR            :   0.00%

📈 PERFORMANCE TRENDS ACROSS K VALUES

Metric Performance Summary:
──────────────────────────────────────────────────────────────────────────────────────────
Metric          Min K      Max

# ═══════════════════════════════════════════════════════════════════
# 🔬 K-VALUE OPTIMIZATION ANALYSIS (10 SENTENCES)
# Find the optimal TOP_K_RESULTS value by testing on first 10 sentences
# ═══════════════════════════════════════════════════════════════════

In [None]:

## 🎯 Part XX: K-Value Optimization for Multiple Samples

print("\n" + "="*70)
print("🔬 K-VALUE OPTIMIZATION ANALYSIS (10 SENTENCES)")
print("="*70)

# Select first 10 test samples
num_test_samples = 10
print(f"\n📝 Selecting first {num_test_samples} test samples...")

test_samples = []
for idx in range(min(num_test_samples, len(df_test))):
    sample = {
        'idx': idx,
        'query_original': df_test.iloc[idx]['transliteration'],
        'reference_german': df_test.iloc[idx]['translation'],
        'query_normalized': normalize_transliteration(df_test.iloc[idx]['transliteration'])
    }

    # Translate reference to English once
    sample['reference_english'] = translate_german_to_english(sample['reference_german'])

    # Generate embedding once
    sample['query_embedding'] = embedding_model.encode(
        sample['query_normalized'],
        normalize_embeddings=True
    ).tolist()

    test_samples.append(sample)
    print(f"   {idx+1}. {sample['query_original'][:50]}...")

print(f"\n✅ Prepared {len(test_samples)} test samples")

# Define K values to test
k_values = list(range(5, 205, 5))  # K = 5, 10, 15, 20, ..., 200
print(f"\n📊 Testing K values: {k_values[0]} to {k_values[-1]} (step=5)")
print(f"   Total K values: {len(k_values)}")
print(f"   Test samples: {len(test_samples)}")
print(f"   Total translations: {len(k_values) * len(test_samples)}")
print(f"   Estimated time: ~{(len(k_values) * len(test_samples) * 2) / 60:.1f} minutes\n")

# Storage for results (organized by K value)
k_results_by_k = {}  # {k_value: [results for each sample]}

# Test each K value across all samples
for k in tqdm(k_values, desc="Testing K values"):
    k_results_by_k[k] = []

    for sample in test_samples:
        try:
            # Step 1: Retrieve top-K examples
            search_results = hybrid_search(
                query_text=sample['query_normalized'],
                query_embedding=sample['query_embedding'],
                top_k=k
            )

            # Step 2: Translate with LLM using these K examples
            german_translation, llm_output = translate_with_llm(
                query_original=sample['query_original'],
                query_normalized=sample['query_normalized'],
                retrieved_examples=search_results
            )

            if not german_translation:
                continue

            # Step 3: Translate German to English
            english_translation = translate_german_to_english(german_translation)

            if not english_translation:
                continue

            # Step 4: Calculate all metrics
            rouge_scores = calculate_rouge(sample['reference_english'], english_translation)
            recall_scores = calculate_recall_at_k(
                sample['reference_german'],
                search_results,
                k_values=[1, 3, 5, 10, 20, 50, 100] if k >= 100 else [1, 3, 5, 10, 20, 50]
            )

            # Store results for this sample at this K
            result = {
                'sample_idx': sample['idx'],
                'k': k,
                # Translation Quality Metrics
                'bleu': calculate_bleu(sample['reference_english'], english_translation),
                'rouge1': rouge_scores['rouge1'],
                'rouge2': rouge_scores['rouge2'],
                'rougeL': rouge_scores['rougeL'],
                'meteor': calculate_meteor(sample['reference_english'], english_translation),
                'chrf': calculate_chrf(sample['reference_english'], english_translation),
                'exact_match': calculate_exact_match(sample['reference_english'], english_translation),
                'word_overlap': calculate_word_overlap(sample['reference_english'], english_translation),
                # Retrieval Quality Metrics
                'recall@1': recall_scores.get('recall@1', 0.0),
                'recall@3': recall_scores.get('recall@3', 0.0),
                'recall@5': recall_scores.get('recall@5', 0.0),
                'recall@10': recall_scores.get('recall@10', 0.0),
                'recall@20': recall_scores.get('recall@20', 0.0),
                'recall@50': recall_scores.get('recall@50', 0.0),
                'recall@100': recall_scores.get('recall@100', 0.0) if k >= 100 else 0.0,
                'mrr': calculate_mrr(sample['reference_german'], search_results),
                'avg_retrieval_score': calculate_average_retrieval_score(search_results, top_k=min(k, 10)),
                # Translations
                'query_original': sample['query_original'],
                'reference_english': sample['reference_english'],
                'predicted_german': german_translation,
                'predicted_english': english_translation
            }

            k_results_by_k[k].append(result)

        except Exception as e:
            print(f"\n❌ Error at K={k}, Sample={sample['idx']}: {e}")
            continue

print(f"\n✅ K-value testing complete!")

## 📊 Part XX+1: Aggregate Results Across All Samples

print("\n" + "="*70)
print("📊 AGGREGATING RESULTS ACROSS ALL SAMPLES")
print("="*70)

# Calculate average metrics for each K value
k_aggregated_results = []
metrics_to_analyze = ['bleu', 'rouge1', 'meteor', 'chrf', 'recall@10', 'mrr']

for k in k_values:
    if k not in k_results_by_k or len(k_results_by_k[k]) == 0:
        continue

    # Get all results for this K value
    k_results = k_results_by_k[k]

    # Calculate average for each metric
    aggregated = {'k': k, 'num_samples': len(k_results)}

    for metric in metrics_to_analyze:
        scores = [r[metric] for r in k_results]
        aggregated[f'{metric}_mean'] = np.mean(scores)
        aggregated[f'{metric}_std'] = np.std(scores)
        aggregated[f'{metric}_min'] = np.min(scores)
        aggregated[f'{metric}_max'] = np.max(scores)

    # Calculate overall average score
    avg_scores = [aggregated[f'{m}_mean'] for m in metrics_to_analyze]
    aggregated['overall_mean'] = np.mean(avg_scores)

    k_aggregated_results.append(aggregated)

# Create DataFrame
df_k_aggregated = pd.DataFrame(k_aggregated_results)

print(f"✅ Aggregated results for {len(df_k_aggregated)} K values")
print(f"   Average samples per K: {df_k_aggregated['num_samples'].mean():.1f}")

## 📊 Part XX+2: Find Best K Values

print("\n" + "="*70)
print("📊 BEST K VALUES (Averaged Across 10 Samples)")
print("="*70)

# Find best K for each metric
best_k_results = {}

print("\n🏆 BEST K VALUE FOR EACH METRIC:")
print("─" * 90)
print(f"{'Metric':<15} {'Best K':<10} {'Mean Score':<12} {'Std Dev':<12} {'Min':<10} {'Max':<10}")
print("─" * 90)

for metric in metrics_to_analyze:
    best_idx = df_k_aggregated[f'{metric}_mean'].idxmax()
    best_k = df_k_aggregated.iloc[best_idx]['k']
    best_mean = df_k_aggregated.iloc[best_idx][f'{metric}_mean']
    best_std = df_k_aggregated.iloc[best_idx][f'{metric}_std']
    best_min = df_k_aggregated.iloc[best_idx][f'{metric}_min']
    best_max = df_k_aggregated.iloc[best_idx][f'{metric}_max']

    best_k_results[metric] = {
        'k': best_k,
        'mean': best_mean,
        'std': best_std,
        'min': best_min,
        'max': best_max
    }

    print(f"{metric.upper():<15} {best_k:<10.0f} {best_mean:<12.2f} {best_std:<12.2f} {best_min:<10.2f} {best_max:<10.2f}")

# Find overall best K (based on average of all metrics)
print("\n🎯 OVERALL BEST K (Average Performance Across All Metrics):")
print("─" * 90)

best_overall_idx = df_k_aggregated['overall_mean'].idxmax()
best_overall_k = df_k_aggregated.iloc[best_overall_idx]['k']
best_overall_score = df_k_aggregated.iloc[best_overall_idx]['overall_mean']

print(f"   Best K: {best_overall_k}")
print(f"   Overall Mean Score: {best_overall_score:.2f}%")
print(f"   Number of Samples: {df_k_aggregated.iloc[best_overall_idx]['num_samples']}")

print(f"\n   Individual metric scores at K={best_overall_k}:")
for metric in metrics_to_analyze:
    mean_score = df_k_aggregated.iloc[best_overall_idx][f'{metric}_mean']
    std_score = df_k_aggregated.iloc[best_overall_idx][f'{metric}_std']
    print(f"      {metric.upper():15s}: {mean_score:6.2f}% (±{std_score:.2f}%)")

## 📈 Part XX+3: Performance Trends Analysis

print("\n" + "="*70)
print("📈 PERFORMANCE TRENDS ACROSS K VALUES")
print("="*70)

print("\nMetric Performance Summary:")
print("─" * 110)
print(f"{'Metric':<15} {'Best K':<10} {'Best Mean':<12} {'Worst K':<10} {'Worst Mean':<12} {'Range':<10} {'Std Dev':<12}")
print("─" * 110)

for metric in metrics_to_analyze:
    best_idx = df_k_aggregated[f'{metric}_mean'].idxmax()
    worst_idx = df_k_aggregated[f'{metric}_mean'].idxmin()

    best_k = df_k_aggregated.iloc[best_idx]['k']
    best_mean = df_k_aggregated.iloc[best_idx][f'{metric}_mean']
    worst_k = df_k_aggregated.iloc[worst_idx]['k']
    worst_mean = df_k_aggregated.iloc[worst_idx][f'{metric}_mean']

    score_range = best_mean - worst_mean
    score_std = df_k_aggregated[f'{metric}_mean'].std()

    print(f"{metric.upper():<15} {best_k:<10.0f} {best_mean:<12.2f} {worst_k:<10.0f} {worst_mean:<12.2f} "
          f"{score_range:<10.2f} {score_std:<12.2f}")

print("─" * 110)

## 📉 Part XX+4: Visual Trends

print("\n" + "="*70)
print("📉 METRIC TRENDS (Averaged Across 10 Samples)")
print("="*70)

# Show trends for key metrics
for metric in ['bleu', 'meteor', 'chrf']:
    print(f"\n{metric.upper()} Score by K (Mean ± Std):")
    print("─" * 80)

    # Sample every 20 K values for readability
    sample_ks = [k for k in k_values if k % 20 == 0]

    for k in sample_ks:
        if k in df_k_aggregated['k'].values:
            row = df_k_aggregated[df_k_aggregated['k'] == k].iloc[0]
            mean_score = row[f'{metric}_mean']
            std_score = row[f'{metric}_std']

            bar_length = int((mean_score / 100) * 40)
            bar = '█' * bar_length

            # Mark if this is the best K
            marker = " 🏆 BEST" if k == best_k_results[metric]['k'] else ""

            print(f"  K={k:3.0f}: {bar} {mean_score:5.2f}% (±{std_score:4.2f}%){marker}")

## 📊 Part XX+5: Sample-by-Sample Breakdown

print("\n" + "="*70)
print("📊 SAMPLE-BY-SAMPLE BREAKDOWN AT BEST K")
print("="*70)

print(f"\nPerformance at K={best_overall_k} for each of the 10 samples:")
print("─" * 100)
print(f"{'Sample':<8} {'Egyptian':<30} {'BLEU':<10} {'METEOR':<10} {'chrF':<10} {'ROUGE-1':<10}")
print("─" * 100)

if best_overall_k in k_results_by_k:
    for result in k_results_by_k[best_overall_k]:
        sample_idx = result['sample_idx']
        query = result['query_original'][:25] + "..." if len(result['query_original']) > 25 else result['query_original']
        bleu = result['bleu']
        meteor = result['meteor']
        chrf = result['chrf']
        rouge1 = result['rouge1']

        print(f"{sample_idx:<8} {query:<30} {bleu:<10.2f} {meteor:<10.2f} {chrf:<10.2f} {rouge1:<10.2f}")

print("─" * 100)

## 💾 Part XX+6: Save Results

print("\n" + "="*70)
print("💾 SAVING K OPTIMIZATION RESULTS")
print("="*70)

# Save aggregated results (average across 10 samples)
df_k_aggregated.to_csv('k_optimization_aggregated.csv', index=False)
print(f"✅ Aggregated results saved to: k_optimization_aggregated.csv")

# Save detailed results (all samples, all K values)
all_detailed_results = []
for k, results in k_results_by_k.items():
    all_detailed_results.extend(results)

df_k_detailed = pd.DataFrame(all_detailed_results)
df_k_detailed.to_csv('k_optimization_detailed_10samples.csv', index=False)
print(f"✅ Detailed results saved to: k_optimization_detailed_10samples.csv")

# Save summary
summary_data = {
    'Metric': metrics_to_analyze,
    'Best_K': [best_k_results[m]['k'] for m in metrics_to_analyze],
    'Best_Mean_Score': [best_k_results[m]['mean'] for m in metrics_to_analyze],
    'Std_Dev': [best_k_results[m]['std'] for m in metrics_to_analyze],
    'Min_Score': [best_k_results[m]['min'] for m in metrics_to_analyze],
    'Max_Score': [best_k_results[m]['max'] for m in metrics_to_analyze]
}

df_k_summary = pd.DataFrame(summary_data)
df_k_summary.to_csv('k_optimization_summary_10samples.csv', index=False)
print(f"✅ Summary saved to: k_optimization_summary_10samples.csv")

## ✅ Part XX+7: Final Recommendations

print("\n" + "="*70)
print("✅ K-VALUE OPTIMIZATION RECOMMENDATIONS (10 SAMPLES)")
print("="*70)

print(f"""
📊 Analysis Summary:
   • Test Samples: {len(test_samples)} sentences
   • K Values Tested: {k_values[0]} to {k_values[-1]} (step=5)
   • Total Translations: {len(all_detailed_results)}
   • Average Success Rate: {len(all_detailed_results)/(len(k_values)*len(test_samples))*100:.1f}%

🏆 Optimal K Values by Metric (Averaged Across 10 Samples):
──────────────────────────────────────────────────────────────────────
   • BLEU:       K = {best_k_results['bleu']['k']} (Mean: {best_k_results['bleu']['mean']:.2f}%, Std: {best_k_results['bleu']['std']:.2f}%)
   • METEOR:     K = {best_k_results['meteor']['k']} (Mean: {best_k_results['meteor']['mean']:.2f}%, Std: {best_k_results['meteor']['std']:.2f}%)
   • chrF:       K = {best_k_results['chrf']['k']} (Mean: {best_k_results['chrf']['mean']:.2f}%, Std: {best_k_results['chrf']['std']:.2f}%)
   • ROUGE-1:    K = {best_k_results['rouge1']['k']} (Mean: {best_k_results['rouge1']['mean']:.2f}%, Std: {best_k_results['rouge1']['std']:.2f}%)
   • Recall@10:  K = {best_k_results['recall@10']['k']} (Mean: {best_k_results['recall@10']['mean']:.2f}%, Std: {best_k_results['recall@10']['std']:.2f}%)
   • MRR:        K = {best_k_results['mrr']['k']} (Mean: {best_k_results['mrr']['mean']:.2f}%, Std: {best_k_results['mrr']['std']:.2f}%)

🎯 OVERALL RECOMMENDATION:
──────────────────────────────────────────────────────────────────────
   • Best Overall K: {best_overall_k}
   • Overall Mean Performance: {best_overall_score:.2f}%
   • Consistency: {"High (low std)" if df_k_aggregated.iloc[best_overall_idx]['bleu_std'] < 10 else "Moderate" if df_k_aggregated.iloc[best_overall_idx]['bleu_std'] < 20 else "Variable (high std)"}

💡 Interpretation:
""")

# Analyze performance stability
bleu_variance = df_k_aggregated['bleu_mean'].std()

if bleu_variance < 5:
    print("   ✅ Performance is VERY STABLE across different K values")
    print(f"   → Any K between {k_values[5]} and {k_values[-5]} will give good results")
    print(f"   → Recommended: K = {best_overall_k} for optimal performance")
elif bleu_variance < 10:
    print("   🟡 Performance is MODERATELY STABLE across K values")
    print(f"   → Recommended: K = {best_overall_k} for best results")
    print(f"   → Alternative: K = {best_k_results['bleu']['k']} for highest BLEU")
else:
    print("   🔴 Performance is HIGHLY VARIABLE with K")
    print(f"   → CRITICAL: Use K = {best_overall_k}")
    print(f"   → Avoid K < {k_values[int(len(k_values)*0.2)]} (poor performance)")

# Check if performance improves with K
first_k_score = df_k_aggregated[df_k_aggregated['k'] == k_values[0]]['bleu_mean'].values[0]
last_k_score = df_k_aggregated[df_k_aggregated['k'] == k_values[-1]]['bleu_mean'].values[0]

if last_k_score > first_k_score + 5:
    print(f"\n   📈 Trend: Performance IMPROVES significantly with larger K")
    print(f"   → Consider testing K > {k_values[-1]} for potential gains")
elif last_k_score > first_k_score:
    print(f"\n   📈 Trend: Performance SLIGHTLY IMPROVES with larger K")
    print(f"   → K = {best_overall_k} is a good balance")
elif last_k_score < first_k_score - 5:
    print(f"\n   📉 Trend: Performance DECREASES with larger K")
    print(f"   → Use smaller K values (K < {best_overall_k})")
else:
    print(f"\n   ➡️ Trend: Performance is RELATIVELY FLAT across K range")
    print(f"   → K = {best_overall_k} provides best results with good consistency")

print(f"""
📁 Output Files:
──────────────────────────────────────────────────────────────────────
   1. k_optimization_aggregated.csv - Average metrics for each K
   2. k_optimization_detailed_10samples.csv - All samples, all K values
   3. k_optimization_summary_10samples.csv - Summary statistics

🚀 Next Steps:
──────────────────────────────────────────────────────────────────────
   1. Update TOP_K_RESULTS = {best_overall_k} in your main code
   2. Re-run evaluation on full test set with optimized K
   3. Expected improvement: {best_overall_score - df_k_aggregated[df_k_aggregated['k'] == TOP_K_RESULTS]['overall_mean'].values[0] if TOP_K_RESULTS in df_k_aggregated['k'].values else 0:.2f}%
""")

print("="*70)
print("\n🎉 K-value optimization analysis complete!")
print("="*70 + "\n")

## 📊 Part XX+8: Comparison with Current K

print("\n" + "="*70)
print("📊 PERFORMANCE COMPARISON: Current K vs Optimal K")
print("="*70)

current_k = TOP_K_RESULTS

if current_k in df_k_aggregated['k'].values:
    current_row = df_k_aggregated[df_k_aggregated['k'] == current_k].iloc[0]
    optimal_row = df_k_aggregated.iloc[best_overall_idx]

    print(f"\nCurrent Configuration (K={current_k}):")
    print("─" * 90)
    print(f"{'Metric':<15} {'Mean':<12} {'Std Dev':<12} {'Min':<10} {'Max':<10}")
    print("─" * 90)
    for metric in metrics_to_analyze:
        mean = current_row[f'{metric}_mean']
        std = current_row[f'{metric}_std']
        min_val = current_row[f'{metric}_min']
        max_val = current_row[f'{metric}_max']
        print(f"{metric.upper():<15} {mean:<12.2f} {std:<12.2f} {min_val:<10.2f} {max_val:<10.2f}")

    print(f"\nOptimal Configuration (K={best_overall_k}):")
    print("─" * 90)
    print(f"{'Metric':<15} {'Mean':<12} {'Std Dev':<12} {'Min':<10} {'Max':<10} {'Improvement':<15}")
    print("─" * 90)
    for metric in metrics_to_analyze:
        mean = optimal_row[f'{metric}_mean']
        std = optimal_row[f'{metric}_std']
        min_val = optimal_row[f'{metric}_min']
        max_val = optimal_row[f'{metric}_max']
        improvement = mean - current_row[f'{metric}_mean']
        arrow = "📈" if improvement > 0 else "📉" if improvement < 0 else "➡️"
        print(f"{metric.upper():<15} {mean:<12.2f} {std:<12.2f} {min_val:<10.2f} {max_val:<10.2f} {arrow} {improvement:+.2f}%")

    overall_improvement = optimal_row['overall_mean'] - current_row['overall_mean']

    if overall_improvement > 5:
        print(f"\n✅ Switching to K={best_overall_k} will SIGNIFICANTLY improve performance by {overall_improvement:.2f}%")
    elif overall_improvement > 0:
        print(f"\n🟡 Switching to K={best_overall_k} will slightly improve performance by {overall_improvement:.2f}%")
    else:
        print(f"\n✅ Current K={current_k} is already optimal or near-optimal!")
else:
    print(f"\n⚠️ Current K={current_k} was not tested in this analysis")
    print(f"   Recommended K based on 10-sample analysis: {best_overall_k}")
    print(f"   Expected performance at K={best_overall_k}: {best_overall_score:.2f}%")

print("\n" + "="*70)


🔬 K-VALUE OPTIMIZATION ANALYSIS (10 SENTENCES)

📝 Selecting first 10 test samples...
   1. smr-wꜥ.tꞽ ẖr(.ꞽ)-ḥ(ꜣ)b(.t) (ꞽ)m(.ꞽ)-r(ʾ)-(ꞽ)ꜥw ḥr(...
   2. ḫꜣ ꜣpd ḫꜣ ꜣpd ḫꜣ mnḫ.t ḫꜣ ꞽḥ ḫꜣ tʾ ḥnq.t pꜣ.t ḫꜣ ꜣ...
   3. ꞽ:nḏ ḥr =k mnw m pr.t.PL =f qꜣ šw.tꞽ zꜣ wsꞽr msi̯....
   4. šdi̯.t kꜣ rḫ.yt...
   5. (w)sr(.w) n(ꞽ).t m n =k ꞽr(.t)-ḥr.w šmi̯.t =f ꞽr =...
   6. sdꜣ.n n =sn psḏ.t.DU ꞽn.y ꜣ pw mri̯ ppy nfr-kꜣ-rꜥw...
   7. sṯꜣ.ꞽn.tw n =f ḏdꞽ...
   8. ꜥḥꜥ.n rḏi̯.n sꞽ ꜣs.t ḫft-ḥr =s nb.t-ḥw.t ḥꜣ =s ḥq....
   9. ꞽw mdw =f ḏi̯ =f ṯꜣm n =f ḥr...
   10. mꞽtr(.t) n.ꞽ(t)-ꜥnḫ-ḥw.t-ḥr.w...

✅ Prepared 10 test samples

📊 Testing K values: 5 to 200 (step=5)
   Total K values: 40
   Test samples: 10
   Total translations: 400
   Estimated time: ~13.3 minutes



Testing K values:   0%|          | 0/40 [00:00<?, ?it/s]

❌ API Error: 429
   Response: {"error": "you've reached your session usage limit, please wait or upgrade to continue"}
❌ API Error: 429
   Response: {"error": "you've reached your session usage limit, please wait or upgrade to continue"}

✅ K-value testing complete!

📊 AGGREGATING RESULTS ACROSS ALL SAMPLES
✅ Aggregated results for 40 K values
   Average samples per K: 9.9

📊 BEST K VALUES (Averaged Across 10 Samples)

🏆 BEST K VALUE FOR EACH METRIC:
──────────────────────────────────────────────────────────────────────────────────────────
Metric          Best K     Mean Score   Std Dev      Min        Max       
──────────────────────────────────────────────────────────────────────────────────────────
BLEU            55         32.82        35.44        1.62       100.00    
ROUGE1          55         60.74        30.63        10.53      100.00    
METEOR          55         47.33        31.34        4.35       99.90     
CHRF            55         55.42        29.36        11.26     