# ═══════════════════════════════════════════════════════════════════
# 🏛️ TLA Dataset Preparation for Egyptian Transliteration RAG System
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 1: Install & Import Libraries

In [5]:
# 🔹 Upgrade pip first
!pip install --upgrade pip --quiet

# 🔹 Core Dependencies
!pip install datasets>=2.18.0 --quiet
!pip install transformers>=4.38.0 --quiet
!pip install torch>=2.2.0 --quiet
!pip install pandas>=2.2.0 --quiet
!pip install numpy>=1.26.0 --quiet

# 🔹 Translation
!pip install sentencepiece>=0.2.0 --quiet

# 🔹 Vector Database
!pip install qdrant-client>=1.7.0 --quiet

# 🔹 Ollama Cloud API
!pip install httpx>=0.25.2,<0.26.0 --quiet
!pip install ollama>=0.1.7 --quiet

# 🔹 BM25 for Hybrid Search
!pip install rank-bm25>=0.2.2 --quiet

# 🔹 Utilities
!pip install tqdm>=4.66.0 --quiet
!pip install python-dotenv>=1.0.0 --quiet
!pip install jupyter>=1.0.0 --quiet
!pip install ipywidgets>=8.1.0 --quiet
!pip install scikit-learn --quiet
!pip install matplotlib --quiet
!pip install sentence-transformers --quiet
!pip install nltk==3.9.2 --quiet
!pip install rouge-score==0.1.2 --quiet
!pip install sacrebleu==2.6.0 --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h/bin/bash: line 1: 0.26.0: No such file or directory
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for rouge-score (pyproject.toml) ... [?25l[?25hdone


In [6]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [7]:
import pandas as pd
import numpy as np
import re
import unicodedata
from tqdm.auto import tqdm
from datasets import load_dataset
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import subprocess
import json
import ollama

print("✅ All imports successful!")


✅ All imports successful!


In [8]:
## configuration
# Models
EMBEDDING_MODEL = "bge-m3:latest"  # Ollama local

# Settings
VECTOR_DIM = 1024
TRAIN_SPLIT = 0.95  # 95% for training, 5% for testing

# Egyptian character mapping (uniliteral signs)
EGYPTIAN_CHAR_MAP = {
    # Traditional → Normalized
    'ꜣ': 'a',      # vulture (aleph)
    'ꞽ': 'i',      # reed (yodh)
    'y': 'y',      # double yodh
    'ʾ': 'a',      # arm (ayin)
    'w': 'w',      # quail
    'b': 'b',      # leg
    'p': 'p',      # stool
    'f': 'f',      # viper
    'm': 'm',      # owl
    'n': 'n',      # water
    'r': 'r',      # mouth
    'h': 'h',      # shelter
    'ḥ': 'h',      # wick
    'ḫ': 'kh',     # placenta
    'ẖ': 'kh',     # belly
    's': 's',      # cloth
    'š': 'sh',     # pool
    'ḳ': 'q',      # hill
    'q': 'q',      # hill
    'k': 'k',      # basket
    'g': 'g',      # stand
    't': 't',      # bun
    'ṯ': 'tj',     # rope
    'd': 'd',      # hand
    'ḏ': 'dj',     # cobra

    # Additional special characters
    'ṭ': 't',
    'ḍ': 'd',
    'ṣ': 's',
    'ẓ': 'z',
    'ḥ': 'h',
}

# Suffixes to remove (pronouns and particles)
SUFFIXES_TO_REMOVE = [
    '=f',   # his/him
    '=k',   # your/you (masc)
    '=ṯ',   # your/you (fem)
    '=s',   # her/it
    '=sn',  # their/them
    '=ꞽ',   # my/me
    '=n',   # our/us
    '=tn',  # your/you (pl)
    '=fꞽ',  # variant
]

print(f"🔧 Configuration loaded")
print(f"   Training split: {TRAIN_SPLIT*100}%")
print(f"   Embedding model: {EMBEDDING_MODEL}")

🔧 Configuration loaded
   Training split: 95.0%
   Embedding model: bge-m3:latest


In [9]:
## part 3: Load dataset
print("📥 Loading TLA dataset from HuggingFace...")

dataset = load_dataset(
    "thesaurus-linguae-aegyptiae/tla-Earlier_Egyptian_original-v18-premium",
    split="train"
)

df = pd.DataFrame(dataset)

print(f"✅ Loaded {len(df)} records")
print(f"\nColumns: {list(df.columns)}")
print(f"\nSample record:")
print(df.iloc[0][['transliteration', 'translation', 'UPOS']].to_dict())

📥 Loading TLA dataset from HuggingFace...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/5.92M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

✅ Loaded 12773 records

Columns: ['hieroglyphs', 'transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation', 'dateNotBefore', 'dateNotAfter']

Sample record:
{'transliteration': 'nḏ (w)di̯ r =s', 'translation': '(es) werde zerrieben, (es) werde darauf gelegt.', 'UPOS': 'VERB VERB ADP PRON'}


In [10]:
## part 4 : Data Cleaning
print("\n" + "="*70)
print("🧹 STEP 1: Removing unwanted columns")
print("="*70)

# Remove unwanted columns
columns_to_drop = ['hieroglyphs', 'dateNotBefore', 'dateNotAfter']
df_clean = df.drop(columns=columns_to_drop)

print(f"✅ Removed columns: {columns_to_drop}")
print(f"   Remaining columns: {list(df_clean.columns)}")

# Remove rows with missing critical data
print("\n🧹 STEP 2: Removing rows with missing data")
initial_count = len(df_clean)

df_clean = df_clean.dropna(subset=['transliteration', 'translation'])
df_clean = df_clean[df_clean['transliteration'].str.strip() != '']
df_clean = df_clean[df_clean['translation'].str.strip() != '']

print(f"✅ Removed {initial_count - len(df_clean)} rows with missing data")
print(f"   Records remaining: {len(df_clean)}")

# Remove duplicates
print("\n🧹 STEP 3: Removing duplicates")
initial_count = len(df_clean)

df_clean = df_clean.drop_duplicates(subset=['transliteration'], keep='first')

print(f"✅ Removed {initial_count - len(df_clean)} duplicate records")
print(f"   Unique records: {len(df_clean)}")

df_clean = df_clean.reset_index(drop=True)


🧹 STEP 1: Removing unwanted columns
✅ Removed columns: ['hieroglyphs', 'dateNotBefore', 'dateNotAfter']
   Remaining columns: ['transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation']

🧹 STEP 2: Removing rows with missing data
✅ Removed 0 rows with missing data
   Records remaining: 12773

🧹 STEP 3: Removing duplicates
✅ Removed 3685 duplicate records
   Unique records: 9088


In [11]:
## part 5: Transliteration Normalization
def normalize_transliteration(text):
    """
    Normalize Egyptian transliteration:
    1. Remove brackets
    2. Lowercase
    3. Map special characters
    4. Remove suffixes
    5. Clean spaces
    """
    if not isinstance(text, str) or text.strip() == '':
        return ""

    # Step 1: Remove brackets (but keep content)
    text = re.sub(r'[()]', '', text)

    # Step 2: Normalize Unicode (NFC form)
    text = unicodedata.normalize('NFC', text)

    # REMOVE combining marks (important for di̯, etc.)
    text = ''.join(c for c in text if not unicodedata.combining(c))

    # Step 3: Lowercase
    text = text.lower()

    # Step 4: Map Egyptian characters
    for egy_char, normalized in EGYPTIAN_CHAR_MAP.items():
        text = text.replace(egy_char.lower(), normalized)

    # Step 5: Remove suffixes (pronouns/particles)
    for suffix in SUFFIXES_TO_REMOVE:
        # Match suffix at word boundaries or before spaces/dots
        pattern = re.escape(suffix) + r'(?=[\s\.]|$)'
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Step 6: Clean up extra spaces and dots
    text = re.sub(r'\.+', '.', text)  # Multiple dots to single
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
    text = text.strip('. ')  # Remove leading/trailing dots and spaces

    return text

print("\n" + "="*70)
print("🔤 STEP 4: Normalizing transliterations")
print("="*70)

# Test normalization on sample
sample_text = df_clean.iloc[0]['transliteration']
normalized_sample = normalize_transliteration(sample_text)

print(f"\n📝 Sample normalization:")
print(f"   Original:   {sample_text}")
print(f"   Normalized: {normalized_sample}")

# Apply normalization to entire dataset
print(f"\n🔄 Normalizing {len(df_clean)} transliterations...")

df_clean['transliteration_normalized'] = df_clean['transliteration'].apply(
    normalize_transliteration
)


# Remove empty normalizations
df_clean = df_clean[df_clean['transliteration_normalized'].str.len() > 0]
df_clean = df_clean.reset_index(drop=True)

print(f"✅ Normalization complete!")
print(f"   Valid records: {len(df_clean)}")

# Show more examples
print(f"\n📋 Sample normalizations:")
for i in range(min(5, len(df_clean))):
    orig = df_clean.iloc[i]['transliteration']
    norm = df_clean.iloc[i]['transliteration_normalized']
    print(f"   {i+1}. {orig[:40]:40} → {norm[:40]}")



🔤 STEP 4: Normalizing transliterations

📝 Sample normalization:
   Original:   nḏ (w)di̯ r =s
   Normalized: ndj wdi r

🔄 Normalizing 9088 transliterations...
✅ Normalization complete!
   Valid records: 9088

📋 Sample normalizations:
   1. nḏ (w)di̯ r =s                           → ndj wdi r
   2. n ṯw ꞽm =sn                              → n tjw im
   3. ḫꜣ m tʾ ḥnq.t kꜣ(.PL) ꜣpd(.PL) n ꞽmꜣḫ ꞽm → kha m ta hnq.t ka.pl apd.pl n imakh im.i
   4. ꜥḥꜥ                                      → ꜥhꜥ
   5. (w)sꞽr wnꞽs m n =k ꞽr.t-ḥr.w ꞽꜥb n =k s( → wsir wnis m n ir.t-hr.w iꜥb n si ir ra


In [12]:
## part 6: Train/ test split

print("\n" + "="*70)
print(f"📊 STEP 5: Creating train/test split ({TRAIN_SPLIT*100}%/{(1-TRAIN_SPLIT)*100}%)")
print("="*70)

# Shuffle dataset
df_clean = df_clean.sample(frac=1, random_state=42).reset_index(drop=True)

# Split
split_idx = int(len(df_clean) * TRAIN_SPLIT)
df_train = df_clean.iloc[:split_idx].copy()
df_test = df_clean.iloc[split_idx:].copy()

print(f"✅ Split complete!")
print(f"   Training set: {len(df_train)} records ({len(df_train)/len(df_clean)*100:.1f}%)")
print(f"   Test set:     {len(df_test)} records ({len(df_test)/len(df_clean)*100:.1f}%)")

# Save test set for later evaluation
df_test.to_csv('tla_test_set.csv', index=False)
print(f"\n💾 Test set saved to: tla_test_set.csv")


📊 STEP 5: Creating train/test split (95.0%/5.000000000000004%)
✅ Split complete!
   Training set: 8633 records (95.0%)
   Test set:     455 records (5.0%)

💾 Test set saved to: tla_test_set.csv


In [13]:
## part 7: Generate Embedding

from sentence_transformers import SentenceTransformer

print("\n📥 Loading embedding model...")
# Load model (do this ONCE before the loop)
embedding_model = SentenceTransformer('BAAI/bge-m3')
print(f"✅ Model loaded: BAAI/bge-m3")

def get_embedding_fast(text):
    """Generate embedding using sentence-transformers"""
    try:
        # Generate embedding
        embedding = embedding_model.encode(text, normalize_embeddings=True)
        return embedding.tolist()
    except Exception as e:
        print(f"❌ Error: {e}")
        return np.random.randn(VECTOR_DIM).tolist()

print("\n" + "="*70)
print(f"🔢 STEP 6: Generating embeddings for {len(df_train)} records")
print("="*70)

print(f"\n⚙️ Using model: BAAI/bge-m3")
print(f"   Vector dimension: {VECTOR_DIM}")

# Generate embeddings in batches (MUCH faster!)
batch_size = 32
all_embeddings = []

for i in tqdm(range(0, len(df_train), batch_size), desc="Generating embeddings"):
    batch_end = min(i + batch_size, len(df_train))
    batch_texts = df_train.iloc[i:batch_end]['transliteration_normalized'].tolist()

    try:
        # Process entire batch at once (FAST!)
        batch_embeddings = embedding_model.encode(
            batch_texts,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        all_embeddings.extend(batch_embeddings.tolist())
    except Exception as e:
        print(f"❌ Batch error at {i}: {e}")
        # Fallback: process individually
        for text in batch_texts:
            all_embeddings.append(get_embedding_fast(text))

df_train['embedding'] = all_embeddings

print(f"\n✅ Embedding generation complete!")
print(f"   Total: {len(all_embeddings)} embeddings")
print(f"   Dimension: {len(all_embeddings[0])}")

# Verify
sample_embedding = all_embeddings[0]
print(f"\n📊 Sample embedding (first 10 values):")
print(f"   {sample_embedding[:10]}")


📥 Loading embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ Model loaded: BAAI/bge-m3

🔢 STEP 6: Generating embeddings for 8633 records

⚙️ Using model: BAAI/bge-m3
   Vector dimension: 1024


Generating embeddings:   0%|          | 0/270 [00:00<?, ?it/s]


✅ Embedding generation complete!
   Total: 8633 embeddings
   Dimension: 1024

📊 Sample embedding (first 10 values):
   [0.009475680999457836, 0.012296928092837334, -0.03066054731607437, 0.0029091022443026304, -0.038571588695049286, -0.0011071120388805866, -0.002732239430770278, -0.013377217575907707, 0.02956884168088436, -0.00481629790738225]


In [14]:
## part 9: Setup Qdrant Vector Database
print("\n" + "="*70)
print("Setting up Qdrant database")
print("="*70)

# Initialize Qdrant (in-memory for development)
# For production, use: QdrantClient(host="localhost", port=6333)
qdrant = QdrantClient(":memory:")

print(f"✅ Qdrant client initialized (in-memory)")

# Create collection
collection_name = "egyptian_transliterations"

qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=VECTOR_DIM,
        distance=Distance.COSINE
    )
)

print(f"✅ Collection created: {collection_name}")
print(f"   Vector size: {VECTOR_DIM}")
print(f"   Distance metric: COSINE")


Setting up Qdrant database
✅ Qdrant client initialized (in-memory)
✅ Collection created: egyptian_transliterations
   Vector size: 1024
   Distance metric: COSINE


In [17]:
## part 10: upload data to qdrant


print("\n" + "="*70)
print(f" Uploading {len(df_train)} records to Qdrant")
print("="*70)

# Prepare points
points = []

for idx, row in tqdm(df_train.iterrows(), total=len(df_train), desc="Preparing points"):
    point = PointStruct(
        id=idx,
        vector=row['embedding'],
        payload={
            "transliteration_original": row['transliteration'],
            "transliteration_normalized": row['transliteration_normalized'],
            "UPOS": row.get('UPOS', ''),
            "glossing": row.get('glossing', ''),
            "translation_de": row['translation']
        }
    )
    points.append(point)

# Upload in batches
batch_size = 100
print(f"\n📦 Uploading in batches of {batch_size}...")

for i in tqdm(range(0, len(points), batch_size), desc="Uploading batches"):
    batch = points[i:i+batch_size]
    qdrant.upsert(
        collection_name=collection_name,
        points=batch
    )

print(f"\n✅ Upload complete!")
print(f"   Total records in database: {len(points)}")




 Uploading 8633 records to Qdrant


Preparing points:   0%|          | 0/8633 [00:00<?, ?it/s]


📦 Uploading in batches of 100...


Uploading batches:   0%|          | 0/87 [00:00<?, ?it/s]


✅ Upload complete!
   Total records in database: 8633


In [18]:
## part 11 : verify Database

print("\n" + "="*70)
print("✅ STEP 10: Verifying database")
print("="*70)

count_info = qdrant.count(
    collection_name=collection_name,
    exact=True
)

print(f"📊 Collection statistics:")
print(f"   Name: {collection_name}")
print(f"   Points count: {count_info.count}")

# Test search
print(f"\n🔍 Testing search functionality...")

test_query = df_train.iloc[0]['transliteration_normalized']
test_embedding = df_train.iloc[0]['embedding']

search_results = qdrant.query_points(
    collection_name=collection_name,
    query=test_embedding,
    limit=3
).points

print(f"\n📝 Test query: {test_query}")
print(f"\n🎯 Top 3 search results:")

for i, result in enumerate(search_results, 1):
    print(f"\n   {i}. Score: {result.score:.4f}")
    print(f"      Transliteration: {result.payload['transliteration_normalized']}")
    print(f"      Translation: {result.payload['translation_de'][:60]}...")



✅ STEP 10: Verifying database
📊 Collection statistics:
   Name: egyptian_transliterations
   Points count: 8633

🔍 Testing search functionality...

📝 Test query: hm-ntjr-khwi=f-wi hr.i-sshta

🎯 Top 3 search results:

   1. Score: 1.0000
      Transliteration: hm-ntjr-khwi=f-wi hr.i-sshta
      Translation: Priester des Cheops und Hüter des Geheimnisses....

   2. Score: 0.9444
      Transliteration: hm-ntjr-khwi=f-wi hr.i-sshta ka=i-n.i-nswt
      Translation: Priester des Cheops und Hüter des Geheimnisses Kai-ni-nisut....

   3. Score: 0.8680
      Transliteration: wꜥb-nswt hr.i-sshta im.i-s.t-ka=i
      Translation: Der Wab-Priester des Königs und Hüter des Geheimnisses Imi-s...


# ═══════════════════════════════════════════════════════════════════
# 🔮 PART 2: RAG Translation Pipeline
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 12: Install Additional Libraries

In [21]:
import os
from google.colab import userdata

# Load Ollama API Key securely from Colab Secrets
OLLAMA_API_KEY = userdata.get('OLLAMA_API_KEY')

if OLLAMA_API_KEY is None:
    raise ValueError("❌ OLLAMA_API_KEY not found in Colab Secrets")

# Set env var for libraries that expect it
os.environ['OLLAMA_API_KEY'] = OLLAMA_API_KEY

# Configuration
LLM_MODEL = "qwen3-vl:235b-cloud"
TOP_K_RESULTS = 10

print("🔧 RAG Pipeline Configuration:")
print(f"   LLM Model: {LLM_MODEL}")
print(f"   Top-K Results: {TOP_K_RESULTS}")
print(f"   API Key: ✅ Loaded securely from Colab Secrets")


🔧 RAG Pipeline Configuration:
   LLM Model: qwen3-vl:235b-cloud
   Top-K Results: 10
   API Key: ✅ Loaded securely from Colab Secrets


In [23]:
!pip install rank-bm25



In [25]:
## part 14: prepare BM25 index for Sparce Search

from rank_bm25 import BM25Okapi

print("\n" + "="*70)
print(" Building BM25 index for sparse search")
print("="*70)

# Tokenize corpus for BM25
corpus_texts = df_train['transliteration_normalized'].tolist()
tokenized_corpus = [text.split() for text in corpus_texts]

# Build BM25 index
bm25 = BM25Okapi(tokenized_corpus)

print(f"✅ BM25 index built!")
print(f"   Documents indexed: {len(tokenized_corpus)}")


 Building BM25 index for sparse search
✅ BM25 index built!
   Documents indexed: 8633


In [26]:
## part 15: Hybrid Search Function
def hybrid_search(query_text, query_embedding, top_k=10, alpha=0.5):
    """
    Perform hybrid search: Dense (Vector) + Sparse (BM25)

    Args:
        query_text: Normalized transliteration query
        query_embedding: Embedding vector of query
        top_k: Number of results to return
        alpha: Weight for dense search (1-alpha for sparse)

    Returns:
        List of search results with scores
    """

    # 1. Dense Search (Vector Similarity)
    dense_results = qdrant.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=top_k * 2
    ).points


    # 2. Sparse Search (BM25)
    query_tokens = query_text.split()
    bm25_scores = bm25.get_scores(query_tokens)

    # Get top BM25 indices
    top_bm25_indices = np.argsort(bm25_scores)[-top_k*2:][::-1]

    # 3. Reciprocal Rank Fusion (RRF)
    combined_scores = {}

    # Add dense scores
    for rank, result in enumerate(dense_results):
        doc_id = result.id
        rrf_score = 1 / (rank + 60)  # RRF formula
        combined_scores[doc_id] = {
            'rrf_score': rrf_score,
            'dense_score': result.score,
            'sparse_score': 0,
            'payload': result.payload
        }

    # Add sparse scores
    for rank, idx in enumerate(top_bm25_indices):
        if idx in combined_scores:
            combined_scores[idx]['rrf_score'] += 1 / (rank + 60)
            combined_scores[idx]['sparse_score'] = bm25_scores[idx]
        else:
            # Retrieve payload from Qdrant
            point = qdrant.retrieve(
                collection_name=collection_name,
                ids=[int(idx)]
            )
            if point:
                combined_scores[idx] = {
                    'rrf_score': 1 / (rank + 60),
                    'dense_score': 0,
                    'sparse_score': bm25_scores[idx],
                    'payload': point[0].payload
                }

    # 4. Sort by combined RRF score
    sorted_results = sorted(
        combined_scores.items(),
        key=lambda x: x[1]['rrf_score'],
        reverse=True
    )[:top_k]

    # 5. Format results
    final_results = []
    for doc_id, scores in sorted_results:
        final_results.append({
            'id': doc_id,
            'rrf_score': scores['rrf_score'],
            'dense_score': scores['dense_score'],
            'sparse_score': scores['sparse_score'],
            'payload': scores['payload']
        })

    return final_results

print("✅ Hybrid search function ready!")


✅ Hybrid search function ready!


In [36]:
## part 16: LLM Translation Function
import requests
OLLAMA_API_URL = "https://ollama.com/api/chat"

def translate_with_llm(query_original, query_normalized, retrieved_examples):
    """
    Use LLM to translate Egyptian to German based on retrieved examples
    """

    # Build examples context (same as before)
    examples_text = ""
    for i, example in enumerate(retrieved_examples, 1):
        payload = example['payload']
        examples_text += f"""
Example {i}:
- Original: {payload['transliteration_original']}
- Normalized: {payload['transliteration_normalized']}
- POS Tags: {payload['UPOS']}
- Glossing: {payload['glossing']}
- German: {payload['translation_de']}
---
"""

    # Build prompt (same as before)
    prompt = f"""You are an expert in Earlier Egyptian linguistics.

**QUERY TO TRANSLATE:**
- Original Transliteration: {query_original}
- Normalized Transliteration: {query_normalized}

**RETRIEVED EXAMPLES FROM DATABASE:**
{examples_text}

**YOUR TASK:**
1. Analyze the query's grammatical structure and word functions
2. Identify and align suffixes, verb forms, and morphological patterns
3. Use the retrieved examples to understand linguistic patterns (NOT to copy translations)
4. Compose the most likely German translation based on:
   - Grammatical patterns from examples
   - Lemma meanings from examples
   - Morphological alignment (suffixes, verb conjugations)
5. If there is uncertainty, briefly explain why

**IMPORTANT RULES:**
- Do NOT copy any retrieved German translation directly
- Use examples only to understand patterns and meanings
- Generate a NEW German translation for the query
- Keep your response concise

**OUTPUT FORMAT:**
German Translation: [your translation here]
Confidence: [High/Medium/Low]
Notes: [brief explanation if needed]
"""

    # Call Ollama Cloud API with CORRECT endpoint
    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OLLAMA_API_KEY}"
        }

        # Use Ollama's native format (not OpenAI format)
        payload = {
            "model": LLM_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert Ancient Egyptian linguist specializing in translating Earlier Egyptian to German."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": False
        }

        response = requests.post(
            OLLAMA_API_URL,
            headers=headers,
            json=payload,
            timeout=180
        )

        if response.status_code == 200:
            result = response.json()
            # Ollama format uses 'message' -> 'content'
            llm_output = result['message']['content']

            # Extract German translation
            import re
            match = re.search(r'German Translation:\s*(.+?)(?:\n|$)', llm_output, re.IGNORECASE)
            if match:
                german_translation = match.group(1).strip()
                return german_translation, llm_output
            else:
                return llm_output.split('\n')[0].strip(), llm_output
        else:
            print(f"❌ API Error: {response.status_code}")
            print(f"   Response: {response.text}")
            return None, None

    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return None, None


In [37]:
## part 17: German to English Tranlslation
from transformers import MarianMTModel, MarianTokenizer

print("\n" + "="*70)
print("Loading German→English translation model")
print("="*70)

# Load MarianMT model
print("📥 Loading MarianMT model...")
de_en_model_name = "Helsinki-NLP/opus-mt-de-en"
de_en_tokenizer = MarianTokenizer.from_pretrained(de_en_model_name)
de_en_model = MarianMTModel.from_pretrained(de_en_model_name)

print(f"✅ Model loaded: {de_en_model_name}")

def translate_german_to_english(german_text):
    """Translate German to English using MarianMT"""
    try:
        # Tokenize
        inputs = de_en_tokenizer(
            german_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        # Generate translation
        outputs = de_en_model.generate(**inputs)

        # Decode
        english_text = de_en_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        return english_text

    except Exception as e:
        print(f"❌ Translation error: {e}")
        return None

print("✅ German→English translation ready!")




Loading German→English translation model
📥 Loading MarianMT model...




✅ Model loaded: Helsinki-NLP/opus-mt-de-en
✅ German→English translation ready!


In [38]:
## part 18: Complete Translation pipeline

def translate_egyptian_to_english(query_original, show_details=True):
    """
    Complete pipeline: Egyptian → German → English

    Args:
        query_original: Original Egyptian transliteration
        show_details: Print intermediate steps

    Returns:
        dict with results
    """

    if show_details:
        print("\n" + "="*70)
        print(f"📝 TRANSLATING: {query_original}")
        print("="*70)

    # Step 1: Normalize query
    query_normalized = normalize_transliteration(query_original)

    if show_details:
        print(f"\n1️⃣ Normalization:")
        print(f"   Original:   {query_original}")
        print(f"   Normalized: {query_normalized}")

    # Step 2: Generate embedding
    query_embedding = embedding_model.encode(
        query_normalized,
        normalize_embeddings=True
    ).tolist()

    if show_details:
        print(f"\n2️⃣ Embedding generated (dim={len(query_embedding)})")

    # Step 3: Hybrid search
    if show_details:
        print(f"\n3️⃣ Hybrid search (Dense + BM25)...")

    search_results = hybrid_search(
        query_text=query_normalized,
        query_embedding=query_embedding,
        top_k=TOP_K_RESULTS
    )

    if show_details:
        print(f"   ✅ Found {len(search_results)} results")
        print(f"\n   📊 Top 3 matches:")
        for i, result in enumerate(search_results[:3], 1):
            print(f"\n   {i}. RRF Score: {result['rrf_score']:.4f}")
            print(f"      Transliteration: {result['payload']['transliteration_normalized']}")
            print(f"      German: {result['payload']['translation_de'][:50]}...")

    # Step 4: LLM Translation (German)
    if show_details:
        print(f"\n4️⃣ LLM Translation (Egyptian → German)...")

    german_translation, llm_full_output = translate_with_llm(
        query_original=query_original,
        query_normalized=query_normalized,
        retrieved_examples=search_results
    )

    if not german_translation:
        return {
            'success': False,
            'error': 'LLM translation failed'
        }

    if show_details:
        print(f"   🇩🇪 German: {german_translation}")

    # Step 5: German → English
    if show_details:
        print(f"\n5️⃣ Translation (German → English)...")

    english_translation = translate_german_to_english(german_translation)

    if not english_translation:
        return {
            'success': False,
            'error': 'German→English translation failed'
        }

    # Final result
    if show_details:
        print("\n" + "="*70)
        print("✅ TRANSLATION COMPLETE")
        print("="*70)
        print(f"🏛️ Egyptian:  {query_original}")
        print(f"🔤 Normalized: {query_normalized}")
        print(f"🇩🇪 German:    {german_translation}")
        print(f"🇬🇧 English:   {english_translation}")
        print("="*70 + "\n")

    return {
        'success': True,
        'query_original': query_original,
        'query_normalized': query_normalized,
        'german': german_translation,
        'english': english_translation,
        'llm_output': llm_full_output,
        'top_matches': search_results[:3]
    }

print("✅ Complete translation pipeline ready!")


✅ Complete translation pipeline ready!


In [39]:
## part 19: Batch Processing Test set
print("\n" + "="*70)
print("📊 STEP 13: Processing test set")
print("="*70)

# Load test set
print("📥 Loading test set...")
df_test = pd.read_csv('tla_test_set.csv')
print(f"✅ Loaded {len(df_test)} test records")

# Process subset (first 10 for demo)
print(f"\n🔄 Translating first 10 test queries...")
print("(Processing all {len(df_test)} would take ~{len(df_test)*2/60:.1f} minutes)\n")

results = []

for idx in tqdm(range(min(10, len(df_test))), desc="Translating"):
    query = df_test.iloc[idx]['transliteration']

    result = translate_egyptian_to_english(
        query_original=query,
        show_details=False
    )

    if result['success']:
        results.append({
            'query_original': result['query_original'],
            'query_normalized': result['query_normalized'],
            'reference_german': df_test.iloc[idx]['translation'],
            'predicted_german': result['german'],
            'predicted_english': result['english']
        })

# Create results dataframe
df_results = pd.DataFrame(results)

print(f"\n✅ Processed {len(results)} queries successfully!")



📊 STEP 13: Processing test set
📥 Loading test set...
✅ Loaded 455 test records

🔄 Translating first 10 test queries...
(Processing all {len(df_test)} would take ~{len(df_test)*2/60:.1f} minutes)



Translating:   0%|          | 0/10 [00:00<?, ?it/s]

❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)
❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)

✅ Processed 8 queries successfully!


## part 20: Display result

In [41]:
print("\n" + "="*70)
print("📈 SAMPLE RESULTS")
print("="*70)

for i in range(min(5, len(df_results))):
    row = df_results.iloc[i]
    print(f"\n{'─'*70}")
    print(f"Query {i+1}:")
    print(f"{'─'*70}")
    print(f"🏛️ Egyptian:    {row['query_original']}")
    print(f"🔤 Normalized:  {row['query_normalized']}")
    print(f"📖 Reference:   {row['reference_german']}")
    print(f"🤖 Predicted:   {row['predicted_german']}")
    print(f"🇬🇧 English:    {row['predicted_english']}")


📈 SAMPLE RESULTS

──────────────────────────────────────────────────────────────────────
Query 1:
──────────────────────────────────────────────────────────────────────
🏛️ Egyptian:    n nfr rn =f ḫr nṯr
🔤 Normalized:  n nfr rn khr ntjr
📖 Reference:   Nicht soll sein Name gut sein durch den Gott.
🤖 Predicted:   Sein Name ist unter dem Gott nicht gut.
🇬🇧 English:    His name is not good under God.

──────────────────────────────────────────────────────────────────────
Query 2:
──────────────────────────────────────────────────────────────────────
🏛️ Egyptian:    tp.ꞽ-ḥꜣ.t sfṯ
🔤 Normalized:  tp.i-ha.t sftj
📖 Reference:   Allerbestes Sefetj-Öl.
🤖 Predicted:   Bestes Sefetj-Öl
🇬🇧 English:    Best Sefetj oil

──────────────────────────────────────────────────────────────────────
Query 3:
──────────────────────────────────────────────────────────────────────
🏛️ Egyptian:    sḏꜣ.t ḥnꜥ nṯr-ꜥꜣ r nmt.t.PL =f nb.t(.PL) ꞽn ꞽmꜣḫ.w (ꞽ)m(.ꞽ)-rʾ-mšꜥ m tꜣ r-ḏr =f ꞽni̯-ꞽt=f mꜣꜥ-ḫrw
🔤 Normalized:  sdja.

# ═══════════════════════════════════════════════════════════════════
# 📊 EVALUATION METRICS FOR EGYPTIAN TRANSLITERATION RAG SYSTEM
# ═══════════════════════════════════════════════════════════════════

## 📦 Part 24: Install Evaluation Libraries

In [42]:
print(len(df_test))
print(df_test.columns.tolist())
df_test.head()

455
['transliteration', 'lemmatization', 'UPOS', 'glossing', 'translation', 'transliteration_normalized']


Unnamed: 0,transliteration,lemmatization,UPOS,glossing,translation,transliteration_normalized
0,ḥm.t =f mri̯.t =f ꞽmꜣḫ.wt-ḫr-nṯr-ꜥꜣ ḫntꞽ.t-kꜣ....,104730|ḥm.t 10050|=f 72470|mri̯ 10050|=f 85038...,NOUN PRON VERB PRON NOUN PROPN NOUN PRON PROPN,N.f:stpr -3sg.m V\rel.f.sg:stpr -3sg.m TITL PE...,"Seine Frau, die er liebt, die Versorgte durch ...",hm.t mri.t imakh.wt-khr-ntjr-ꜥa khnti.t-ka.w r...
1,n nfr rn =f ḫr nṯr,78890|n 83470|nfr 94700|rn 10050|=f 80011|ḫr 9...,PART VERB NOUN PRON ADP NOUN,PTCL V\tam.act N.m:stpr -3sg.m PREP N.m,Nicht soll sein Name gut sein durch den Gott.,n nfr rn khr ntjr
2,tp.ꞽ-ḥꜣ.t sfṯ,171640|tp.ꞽ-ḥꜣ.t 133990|sfṯ,NOUN NOUN,N.m N.m,Allerbestes Sefetj-Öl.,tp.i-ha.t sftj
3,sḏꜣ.t ḥnꜥ nṯr-ꜥꜣ r nmt.t.PL =f nb.t(.PL) ꞽn ꞽm...,150210|sḏꜣ 106200|ḥnꜥ 90360|nṯr-ꜥꜣ 91903|r 845...,VERB ADP NOUN ADP NOUN PRON ADJ ADP NOUN NOUN ...,V\inf PREP DIVN PREP N.f:pl:stpr -3sg.m ADJ:f....,Mit dem großen Gott auf allen seinen Gängen wa...,sdja.t hnꜥ ntjr-ꜥa r nmt.t.pl nb.t.pl in imakh...
4,kꜣ zbn,162950|kꜣ 131760|zbn,NOUN VERB,N.m V\imp.sg,"$kꜣ$-Schlange, kriech davon!",ka zbn


In [44]:
## 📦 Part 19: Install Evaluation Libraries
import sys
print("\n📦 Installing evaluation libraries...")
eval_packages = [
    'nltk',
    'rouge-score',
]

for package in eval_packages:
    subprocess.run([sys.executable, '-m', 'pip', 'install', package, '--break-system-packages', '-q'])

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("📥 Downloading NLTK data...")
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    print("✅ NLTK data ready!")
except:
    print("⚠️ NLTK download warning (may still work)")


📦 Installing evaluation libraries...
📥 Downloading NLTK data...
✅ NLTK data ready!


In [45]:
## 📊 Part 20: Define Evaluation Metrics

def calculate_bleu(reference, hypothesis):
    """Calculate BLEU score (0-100)"""
    try:
        reference_tokens = reference.lower().split()
        hypothesis_tokens = hypothesis.lower().split()
        smoothing = SmoothingFunction()
        bleu_score = sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            smoothing_function=smoothing.method1
        )
        return bleu_score * 100
    except:
        return 0.0

def calculate_rouge(reference, hypothesis):
    """Calculate ROUGE scores"""
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, hypothesis)
        return {
            'rouge1': scores['rouge1'].fmeasure * 100,
            'rouge2': scores['rouge2'].fmeasure * 100,
            'rougeL': scores['rougeL'].fmeasure * 100
        }
    except:
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

def calculate_meteor(reference, hypothesis):
    """Calculate METEOR score (0-100)"""
    try:
        reference_tokens = reference.lower().split()
        hypothesis_tokens = hypothesis.lower().split()
        meteor = meteor_score([reference_tokens], hypothesis_tokens)
        return meteor * 100
    except:
        return 0.0

def calculate_exact_match(reference, hypothesis):
    """Calculate exact match"""
    return 100.0 if reference.strip().lower() == hypothesis.strip().lower() else 0.0

def calculate_word_overlap(reference, hypothesis):
    """Calculate word-level overlap percentage"""
    try:
        ref_words = set(reference.lower().split())
        hyp_words = set(hypothesis.lower().split())
        if len(ref_words) == 0:
            return 0.0
        overlap = len(ref_words.intersection(hyp_words))
        return (overlap / len(ref_words)) * 100
    except:
        return 0.0

print("✅ Evaluation metrics defined!")

✅ Evaluation metrics defined!


In [46]:
## 🔄 Part 21: Process Entire Test Set

print("\n" + "="*70)
print("🚀 PROCESSING COMPLETE TEST SET")
print("="*70)

# Load test set
print("\n📥 Loading test set...")
df_test = pd.read_csv('tla_test_set.csv')[:25]  # Limit to first 25 for demo
print(f"✅ Loaded {len(df_test)} test records")

# Initialize results storage
test_results = []
failed_translations = []

print(f"\n🔄 Processing {len(df_test)} test samples...")
print("⏱️ Estimated time: ~{:.1f} minutes\n".format(len(df_test) * 3 / 60))

# Process each test sample
for idx in tqdm(range(len(df_test)), desc="Translating test set"):
    try:
        # Get query
        query_original = df_test.iloc[idx]['transliteration']
        reference_german = df_test.iloc[idx]['translation']

        # Translate Egyptian → German → English using RAG
        result = translate_egyptian_to_english(
            query_original=query_original,
            show_details=False
        )

        if result['success']:
            # Translate reference German → English
            reference_english = translate_german_to_english(reference_german)

            if reference_english:
                # Store results
                test_results.append({
                    'sample_id': idx,
                    'transliteration': query_original,
                    'transliteration_normalized': result['query_normalized'],
                    'reference_german': reference_german,
                    'reference_english': reference_english,
                    'predicted_german': result['german'],
                    'predicted_english': result['english']
                })
            else:
                failed_translations.append({
                    'sample_id': idx,
                    'reason': 'Reference translation to English failed'
                })
        else:
            failed_translations.append({
                'sample_id': idx,
                'reason': result.get('error', 'RAG translation failed')
            })

    except Exception as e:
        failed_translations.append({
            'sample_id': idx,
            'reason': f'Exception: {str(e)}'
        })
        continue

# Create results DataFrame
df_test_results = pd.DataFrame(test_results)

print(f"\n✅ Processing complete!")
print(f"   Successful: {len(test_results)}")
print(f"   Failed: {len(failed_translations)}")


🚀 PROCESSING COMPLETE TEST SET

📥 Loading test set...
✅ Loaded 25 test records

🔄 Processing 25 test samples...
⏱️ Estimated time: ~1.2 minutes



Translating test set:   0%|          | 0/25 [00:00<?, ?it/s]

❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)
❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)
❌ API Error: 503
   Response: {"error":"Service Temporarily Unavailable"}

❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)
❌ LLM Error: HTTPSConnectionPool(host='ollama.com', port=443): Read timed out. (read timeout=180)

✅ Processing complete!
   Successful: 20
   Failed: 5


In [47]:
## 📊 Part 22: Calculate Metrics for All Results

print("\n" + "="*70)
print("📊 CALCULATING EVALUATION METRICS")
print("="*70)

metrics_list = []

print(f"\n🔄 Computing metrics for {len(df_test_results)} translations...\n")

for idx, row in tqdm(df_test_results.iterrows(), total=len(df_test_results), desc="Computing metrics"):
    reference = row['reference_english']
    hypothesis = row['predicted_english']

    # Calculate all metrics
    rouge_scores = calculate_rouge(reference, hypothesis)

    metrics = {
        'sample_id': row['sample_id'],
        'bleu': calculate_bleu(reference, hypothesis),
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
        'meteor': calculate_meteor(reference, hypothesis),
        'exact_match': calculate_exact_match(reference, hypothesis),
        'word_overlap': calculate_word_overlap(reference, hypothesis)
    }

    metrics_list.append(metrics)

# Create metrics DataFrame
df_metrics = pd.DataFrame(metrics_list)

print("✅ Metrics calculation complete!")


📊 CALCULATING EVALUATION METRICS

🔄 Computing metrics for 20 translations...



Computing metrics:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Metrics calculation complete!


In [48]:

## 📈 Part 23: Display Summary Statistics

print("\n" + "="*70)
print("📈 EVALUATION SUMMARY - COMPLETE TEST SET")
print("="*70)

# Calculate averages
avg_bleu = df_metrics['bleu'].mean()
avg_rouge1 = df_metrics['rouge1'].mean()
avg_rouge2 = df_metrics['rouge2'].mean()
avg_rougeL = df_metrics['rougeL'].mean()
avg_meteor = df_metrics['meteor'].mean()
avg_exact = df_metrics['exact_match'].mean()
avg_overlap = df_metrics['word_overlap'].mean()

# Determine quality levels
def get_quality_emoji(metric_name, score):
    """Get quality emoji based on metric and score"""
    if metric_name == 'exact_match':
        return '🟢' if score > 20 else '🟡' if score > 5 else '🔴'
    else:
        return '🟢' if score > 50 else '🟡' if score > 30 else '🔴'

print("\n📊 Average Scores:")
print("─" * 70)
print(f"   {get_quality_emoji('bleu', avg_bleu)} BLEU:         {avg_bleu:6.2f}%")
print(f"   {get_quality_emoji('rouge1', avg_rouge1)} ROUGE-1:      {avg_rouge1:6.2f}%")
print(f"   {get_quality_emoji('rouge2', avg_rouge2)} ROUGE-2:      {avg_rouge2:6.2f}%")
print(f"   {get_quality_emoji('rougeL', avg_rougeL)} ROUGE-L:      {avg_rougeL:6.2f}%")
print(f"   {get_quality_emoji('meteor', avg_meteor)} METEOR:       {avg_meteor:6.2f}%")
print(f"   {get_quality_emoji('exact_match', avg_exact)} Exact Match:  {avg_exact:6.2f}% ({int(df_metrics['exact_match'].sum() / 100)} matches)")
print(f"   {get_quality_emoji('overlap', avg_overlap)} Word Overlap: {avg_overlap:6.2f}%")

print("\n📈 BLEU Score Distribution:")
print("─" * 70)
print(f"   Min:    {df_metrics['bleu'].min():6.2f}%")
print(f"   25%:    {df_metrics['bleu'].quantile(0.25):6.2f}%")
print(f"   Median: {df_metrics['bleu'].median():6.2f}%")
print(f"   75%:    {df_metrics['bleu'].quantile(0.75):6.2f}%")
print(f"   Max:    {df_metrics['bleu'].max():6.2f}%")
print(f"   Std:    {df_metrics['bleu'].std():6.2f}%")

print("\n📈 METEOR Score Distribution:")
print("─" * 70)
print(f"   Min:    {df_metrics['meteor'].min():6.2f}%")
print(f"   25%:    {df_metrics['meteor'].quantile(0.25):6.2f}%")
print(f"   Median: {df_metrics['meteor'].median():6.2f}%")
print(f"   75%:    {df_metrics['meteor'].quantile(0.75):6.2f}%")
print(f"   Max:    {df_metrics['meteor'].max():6.2f}%")
print(f"   Std:    {df_metrics['meteor'].std():6.2f}%")

print("\n" + "="*70)



📈 EVALUATION SUMMARY - COMPLETE TEST SET

📊 Average Scores:
──────────────────────────────────────────────────────────────────────
   🔴 BLEU:          25.04%
   🟢 ROUGE-1:       65.24%
   🟡 ROUGE-2:       45.36%
   🟢 ROUGE-L:       64.63%
   🟢 METEOR:        54.27%
   🔴 Exact Match:    5.00% (1 matches)
   🟢 Word Overlap:  56.44%

📈 BLEU Score Distribution:
──────────────────────────────────────────────────────────────────────
   Min:      0.00%
   25%:      7.37%
   Median:  17.41%
   75%:     30.33%
   Max:    100.00%
   Std:     27.01%

📈 METEOR Score Distribution:
──────────────────────────────────────────────────────────────────────
   Min:      0.00%
   25%:     45.83%
   Median:  60.87%
   75%:     70.49%
   Max:     99.98%
   Std:     26.18%



In [49]:
## 📊 Part 24: Visual Comparison Chart

print("\n" + "="*70)
print("📊 METRIC COMPARISON (Visual)")
print("="*70)

metrics_to_plot = {
    'BLEU': avg_bleu,
    'ROUGE-1': avg_rouge1,
    'ROUGE-2': avg_rouge2,
    'ROUGE-L': avg_rougeL,
    'METEOR': avg_meteor,
    'Word Overlap': avg_overlap
}

print()
for metric, score in metrics_to_plot.items():
    bar_length = int((score / 100) * 50)
    bar = '█' * bar_length
    print(f"{metric:15s} {bar} {score:6.2f}%")

print("\n" + "="*70)

## 📊 Part 25: Score Distribution Histogram

print("\n" + "="*70)
print("📊 BLEU SCORE DISTRIBUTION")
print("="*70)

bins = [0, 20, 40, 60, 80, 100]
hist, _ = np.histogram(df_metrics['bleu'].values, bins=bins)

max_count = max(hist) if max(hist) > 0 else 1

print()
for i, (start, end) in enumerate(zip(bins[:-1], bins[1:])):
    count = hist[i]
    bar_length = int((count / max_count) * 40)
    bar = '█' * bar_length
    percentage = (count / len(df_metrics)) * 100
    print(f"  {start:3.0f}-{end:3.0f}%: {bar} {count} samples ({percentage:.1f}%)")

print("\n" + "="*70)



📊 METRIC COMPARISON (Visual)

BLEU            ████████████  25.04%
ROUGE-1         ████████████████████████████████  65.24%
ROUGE-2         ██████████████████████  45.36%
ROUGE-L         ████████████████████████████████  64.63%
METEOR          ███████████████████████████  54.27%
Word Overlap    ████████████████████████████  56.44%


📊 BLEU SCORE DISTRIBUTION

    0- 20%: ████████████████████████████████████████ 13 samples (65.0%)
   20- 40%: █████████ 3 samples (15.0%)
   40- 60%: ███ 1 samples (5.0%)
   60- 80%: ██████ 2 samples (10.0%)
   80-100%: ███ 1 samples (5.0%)



In [50]:
## 📊 Part 25: Score Distribution Histogram

print("\n" + "="*70)
print("📊 BLEU SCORE DISTRIBUTION")
print("="*70)

bins = [0, 20, 40, 60, 80, 100]
hist, _ = np.histogram(df_metrics['bleu'].values, bins=bins)

max_count = max(hist) if max(hist) > 0 else 1

print()
for i, (start, end) in enumerate(zip(bins[:-1], bins[1:])):
    count = hist[i]
    bar_length = int((count / max_count) * 40)
    bar = '█' * bar_length
    percentage = (count / len(df_metrics)) * 100
    print(f"  {start:3.0f}-{end:3.0f}%: {bar} {count} samples ({percentage:.1f}%)")

print("\n" + "="*70)


📊 BLEU SCORE DISTRIBUTION

    0- 20%: ████████████████████████████████████████ 13 samples (65.0%)
   20- 40%: █████████ 3 samples (15.0%)
   40- 60%: ███ 1 samples (5.0%)
   60- 80%: ██████ 2 samples (10.0%)
   80-100%: ███ 1 samples (5.0%)



In [51]:
## 🏆 Part 26: Best and Worst Examples

print("\n" + "="*70)
print("🏆 TOP 5 BEST TRANSLATIONS (by BLEU)")
print("="*70)

# Merge metrics with results
df_test_results['bleu'] = df_metrics['bleu'].values
df_test_results['meteor'] = df_metrics['meteor'].values
df_test_results['rouge1'] = df_metrics['rouge1'].values

top_5 = df_test_results.nlargest(5, 'bleu')

for i, (idx, row) in enumerate(top_5.iterrows(), 1):
    print(f"\n{'─'*70}")
    print(f"#{i} - BLEU: {row['bleu']:.2f}%")
    print(f"{'─'*70}")
    print(f"🏛️  Egyptian:        {row['transliteration'][:60]}...")
    print(f"📖  Reference (DE):   {row['reference_german'][:60]}...")
    print(f"📖  Reference (EN):   {row['reference_english'][:60]}...")
    print(f"🤖  Predicted (DE):   {row['predicted_german'][:60]}...")
    print(f"🤖  Predicted (EN):   {row['predicted_english'][:60]}...")
    print(f"\n📊  Scores:")
    print(f"     BLEU={row['bleu']:.1f}% | METEOR={row['meteor']:.1f}% | ROUGE-1={row['rouge1']:.1f}%")

print("\n" + "="*70)
print("⚠️ BOTTOM 5 WORST TRANSLATIONS (by BLEU)")
print("="*70)

bottom_5 = df_test_results.nsmallest(5, 'bleu')

for i, (idx, row) in enumerate(bottom_5.iterrows(), 1):
    print(f"\n{'─'*70}")
    print(f"#{i} - BLEU: {row['bleu']:.2f}%")
    print(f"{'─'*70}")
    print(f"🏛️  Egyptian:        {row['transliteration'][:60]}...")
    print(f"📖  Reference (DE):   {row['reference_german'][:60]}...")
    print(f"📖  Reference (EN):   {row['reference_english'][:60]}...")
    print(f"🤖  Predicted (DE):   {row['predicted_german'][:60]}...")
    print(f"🤖  Predicted (EN):   {row['predicted_english'][:60]}...")
    print(f"\n📊  Scores:")
    print(f"     BLEU={row['bleu']:.1f}% | METEOR={row['meteor']:.1f}% | ROUGE-1={row['rouge1']:.1f}%")

print("\n" + "="*70)


🏆 TOP 5 BEST TRANSLATIONS (by BLEU)

──────────────────────────────────────────────────────────────────────
#1 - BLEU: 100.00%
──────────────────────────────────────────────────────────────────────
🏛️  Egyptian:        ꞽ kꜣ n(.ꞽ) ppy nfr-kꜣ-rꜥw ꞽni̯ wnm ppy nfr-kꜣ-rꜥw ḥnꜥ =k...
📖  Reference (DE):   O Ka des Pepi Neferkare, hol herbei, damit Pepi Neferkare zu...
📖  Reference (EN):   O Ka of Pepi Neferkare, bring in so Pepi Neferkare can eat w...
🤖  Predicted (DE):   O Ka des Pepi Neferkare, hol herbei, damit Pepi Neferkare zu...
🤖  Predicted (EN):   O Ka of Pepi Neferkare, bring in so Pepi Neferkare can eat w...

📊  Scores:
     BLEU=100.0% | METEOR=100.0% | ROUGE-1=100.0%

──────────────────────────────────────────────────────────────────────
#2 - BLEU: 74.97%
──────────────────────────────────────────────────────────────────────
🏛️  Egyptian:        ꞽ:bꜣ =k sbꜣ ꜥnḫ ꞽs ḫnt(.ꞽ) sn.w.PL =f...
📖  Reference (DE):   Mögest du ba-mächtig sein wie der lebendige Stern, der an de...
📖  Referen

In [52]:
## 💾 Part 27: Save All Results

print("\n" + "="*70)
print("💾 SAVING EVALUATION RESULTS")
print("="*70)

# Create comprehensive results DataFrame
df_comprehensive = df_test_results.copy()
df_comprehensive['rouge2'] = df_metrics['rouge2'].values
df_comprehensive['rougeL'] = df_metrics['rougeL'].values
df_comprehensive['exact_match'] = df_metrics['exact_match'].values
df_comprehensive['word_overlap'] = df_metrics['word_overlap'].values

# Save comprehensive results
df_comprehensive.to_csv('test_set_results_comprehensive.csv', index=False)
print(f"✅ Comprehensive results saved to: test_set_results_comprehensive.csv")

# Save metrics only
df_metrics.to_csv('test_set_metrics.csv', index=False)
print(f"✅ Metrics saved to: test_set_metrics.csv")

# Save summary statistics
summary_data = {
    'Metric': ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR', 'Exact Match', 'Word Overlap'],
    'Average': [avg_bleu, avg_rouge1, avg_rouge2, avg_rougeL, avg_meteor, avg_exact, avg_overlap],
    'Min': [
        df_metrics['bleu'].min(),
        df_metrics['rouge1'].min(),
        df_metrics['rouge2'].min(),
        df_metrics['rougeL'].min(),
        df_metrics['meteor'].min(),
        df_metrics['exact_match'].min(),
        df_metrics['word_overlap'].min()
    ],
    'Max': [
        df_metrics['bleu'].max(),
        df_metrics['rouge1'].max(),
        df_metrics['rouge2'].max(),
        df_metrics['rougeL'].max(),
        df_metrics['meteor'].max(),
        df_metrics['exact_match'].max(),
        df_metrics['word_overlap'].max()
    ],
    'Median': [
        df_metrics['bleu'].median(),
        df_metrics['rouge1'].median(),
        df_metrics['rouge2'].median(),
        df_metrics['rougeL'].median(),
        df_metrics['meteor'].median(),
        df_metrics['exact_match'].median(),
        df_metrics['word_overlap'].median()
    ]
}
df_summary = pd.DataFrame(summary_data)
df_summary.to_csv('test_set_summary.csv', index=False)
print(f"✅ Summary statistics saved to: test_set_summary.csv")

# Save failed translations log
if failed_translations:
    df_failed = pd.DataFrame(failed_translations)
    df_failed.to_csv('failed_translations.csv', index=False)
    print(f"⚠️ Failed translations log saved to: failed_translations.csv")

print("\n" + "="*70)

## ✅ Part 28: Final Report

print("\n" + "="*70)
print("✅ COMPLETE TEST SET EVALUATION FINISHED!")
print("="*70)

# Overall quality assessment
if avg_bleu > 50:
    quality = "EXCELLENT 🌟"
    recommendation = "Your RAG system achieves excellent translation quality!"
elif avg_bleu > 30:
    quality = "GOOD ✅"
    recommendation = "Your RAG system shows good translation quality with room for improvement."
else:
    quality = "NEEDS IMPROVEMENT ⚠️"
    recommendation = "Consider fine-tuning your model or improving the retrieval mechanism."

print(f"""
📊 Final Statistics:
   • Total test samples: {len(df_test)}
   • Successful translations: {len(test_results)} ({len(test_results)/len(df_test)*100:.1f}%)
   • Failed translations: {len(failed_translations)} ({len(failed_translations)/len(df_test)*100:.1f}%)

   • Average BLEU score: {avg_bleu:.2f}%
   • Average METEOR score: {avg_meteor:.2f}%
   • Average ROUGE-1 score: {avg_rouge1:.2f}%
   • Exact matches: {int(df_metrics['exact_match'].sum() / 100)} ({avg_exact:.1f}%)

🎯 Overall Quality: {quality}

💡 Recommendation: {recommendation}

📁 Output Files:
   1. test_set_results_comprehensive.csv - Full results with translations and metrics
   2. test_set_metrics.csv - Metrics only for each sample
   3. test_set_summary.csv - Summary statistics table
   {f"4. failed_translations.csv - Log of failed translations ({len(failed_translations)} samples)" if failed_translations else ""}

📈 Metric Interpretation:
   • BLEU > 50%: Excellent translation quality ✅
   • BLEU 30-50%: Good translation quality 🟡
   • BLEU < 30%: Needs improvement 🔴

   Your system: {avg_bleu:.1f}% BLEU - {quality}

🎓 Key Insights:
   • Best performing translations: BLEU = {df_metrics['bleu'].max():.1f}%
   • Worst performing translations: BLEU = {df_metrics['bleu'].min():.1f}%
   • Median performance: BLEU = {df_metrics['bleu'].median():.1f}%
   • Standard deviation: {df_metrics['bleu'].std():.1f}%
""")

print("="*70)
print("\n🎉 Evaluation complete! Check the CSV files for detailed results.")
print("="*70 + "\n")

# Display sample of comprehensive results
print("\n📋 Sample of Comprehensive Results (first 3 rows):")
print("="*70)
print(df_comprehensive[['transliteration', 'predicted_english', 'reference_english', 'bleu', 'meteor']].head(3).to_string())
print("="*70)



💾 SAVING EVALUATION RESULTS
✅ Comprehensive results saved to: test_set_results_comprehensive.csv
✅ Metrics saved to: test_set_metrics.csv
✅ Summary statistics saved to: test_set_summary.csv
⚠️ Failed translations log saved to: failed_translations.csv


✅ COMPLETE TEST SET EVALUATION FINISHED!

📊 Final Statistics:
   • Total test samples: 25
   • Successful translations: 20 (80.0%)
   • Failed translations: 5 (20.0%)
   
   • Average BLEU score: 25.04%
   • Average METEOR score: 54.27%
   • Average ROUGE-1 score: 65.24%
   • Exact matches: 1 (5.0%)
   
🎯 Overall Quality: NEEDS IMPROVEMENT ⚠️

💡 Recommendation: Consider fine-tuning your model or improving the retrieval mechanism.

📁 Output Files:
   1. test_set_results_comprehensive.csv - Full results with translations and metrics
   2. test_set_metrics.csv - Metrics only for each sample
   3. test_set_summary.csv - Summary statistics table
   4. failed_translations.csv - Log of failed translations (5 samples)

📈 Metric Interpretation:
