## Experiment: Creation of Amharic Triplet Dataset 

Dataset used: AmQA (Amharic Question Answering)

Credit for the initial dataset: 
https://github.com/semantic-systems/amharic-qa

In [1]:
import json
import os

# Directory holding the dataset splits, relative to the notebook's working directory.
path = './amqa_data/'
files = ['dev_data.json', 'test_data.json', 'train_data.json']

def load_data(files=files, path=path):
    """Load each split file into a dict keyed by split name.

    Args:
        files: File names to read. The text before the first underscore
            becomes the key (``'dev_data.json'`` -> ``'dev'``).
        path: Directory that contains the files.

    Returns:
        Dict mapping split name to the parsed JSON content of that file.
    """
    data = {}
    for file in files:
        # Decode explicitly as UTF-8: without it open() uses the platform
        # default encoding, which can fail or garble Amharic text.
        with open(os.path.join(path, file), encoding='utf-8') as f:
            data[file.split('_')[0]] = json.load(f)
    return data

# Read every split once using load_data's defaults; the result is indexed by split name.
data = load_data()

In [2]:
import json

def get_json_skeleton(data):
    """Return the structural skeleton of a JSON-like value.

    Dicts keep their keys with each value replaced by its own skeleton.
    A non-empty list collapses to a one-element list holding the skeleton
    of its first item (only the first item is inspected). An empty list
    stays ``[]``. Any other value becomes the lower-cased name of its type,
    e.g. ``'str'``, ``'int'``, ``'nonetype'``.
    """
    if isinstance(data, dict):
        return {key: get_json_skeleton(value) for key, value in data.items()}
    if isinstance(data, list):
        # The conditional must wrap the whole list: the previous form
        # returned [[]] for an empty list, making [] and [[]] look identical.
        return [get_json_skeleton(data[0])] if data else []
    # Scalars are represented by a generic type-name placeholder.
    return type(data).__name__.lower()

In [3]:
# All splits must share one nested schema; use the dev split as the reference.
reference_skeleton = get_json_skeleton(data['dev'])
for split_name in ('train', 'test'):
    assert get_json_skeleton(data[split_name]) == reference_skeleton
reference_skeleton

{'data': [{'paragraphs': [{'qas': [{'question': 'str',
       'id': 'int',
       'answers': [{'answer_id': 'int',
         'document_id': 'int',
         'question_id': 'int',
         'text': 'str',
         'answer_start': 'int',
         'answer_end': 'int',
         'answer_category': 'nonetype'}],
       'is_impossible': 'bool'}],
     'context': 'str',
     'document_id': 'int'}]}],
 'version': 'str'}

In [28]:
# Step 1 - Load and Validate Data

import json
from typing import Dict, List, Any
import random

In [29]:
def load_json(file_path: str) -> Dict:
    """Load JSON data from a file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read correctly even
    # where the platform default encoding is something else.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [43]:
def validate_and_extract_qa(raw_data: Dict) -> List[Dict]:
    """
    Extract answerable QA pairs from the nested dataset structure.

    Walks ``raw_data["data"] -> paragraphs -> qas`` and emits one flat record
    per question that is not flagged ``is_impossible`` and whose first answer
    has non-empty text. A QA without an ``is_impossible`` key is treated as
    impossible and skipped.

    Malformed entries are skipped individually: one bad QA no longer discards
    the remaining QAs of its paragraph. If anything was skipped for being
    malformed, a single ``UserWarning`` reports the count.

    Args:
        raw_data: Parsed dataset JSON with a top-level ``"data"`` list.

    Returns:
        List of dicts with keys ``question``, ``context``, ``answer``,
        ``question_id`` and ``document_id`` (``-1`` when the paragraph has
        no ``document_id``).

    Raises:
        ValueError: If ``raw_data`` has no ``"data"`` key.
    """
    import warnings  # function-scope import: the cell stays self-contained

    # Validate top-level structure
    if "data" not in raw_data:
        raise ValueError("Invalid schema: Missing 'data' key.")

    # Errors that indicate a wrongly shaped entry rather than a programming bug.
    malformed_errors = (AttributeError, IndexError, KeyError, TypeError)

    qa_list = []
    skipped_malformed = 0

    for document in raw_data["data"]:
        if "paragraphs" not in document:
            continue  # Skip documents without paragraphs

        for paragraph in document["paragraphs"]:
            try:
                context = paragraph.get("context", "")
                document_id = paragraph.get("document_id", -1)

                if not context or "qas" not in paragraph:
                    continue  # Skip paragraphs with no context or no questions

                # Materialise now so a non-iterable "qas" is caught here.
                qas = list(paragraph["qas"])
            except malformed_errors:
                skipped_malformed += 1
                continue

            for qa in qas:
                # Per-QA guard: a malformed question only drops itself.
                try:
                    # Skip unanswerable QAs and QAs without answers
                    if qa.get("is_impossible", True) or not qa.get("answers"):
                        continue

                    # Only the first listed answer is used.
                    answer_text = qa["answers"][0].get("text", "")
                    if not answer_text:
                        continue  # Skip answers without text

                    record = {
                        "question": qa["question"],
                        "context": context,
                        "answer": answer_text,
                        "question_id": qa["id"],
                        "document_id": document_id
                    }
                except malformed_errors:
                    skipped_malformed += 1
                    continue

                qa_list.append(record)

    if skipped_malformed:
        warnings.warn(
            f"validate_and_extract_qa skipped {skipped_malformed} malformed paragraph/QA entries.",
            stacklevel=2,
        )

    return qa_list

In [44]:
# Map answer texts to their associated context for effective negative sampling

def build_answer_context_map(qa_list: List[Dict]) -> Dict[str, List[str]]:
    """Index contexts by the answer text they were paired with.

    Each record contributes its ``"context"`` to the list stored under its
    ``"answer"``; repeated contexts are kept, in input order.
    """
    contexts_by_answer: Dict[str, List[str]] = {}
    for record in qa_list:
        answer_text, passage = record["answer"], record["context"]
        contexts_by_answer.setdefault(answer_text, []).append(passage)
    return contexts_by_answer

In [45]:
# Build (anchor, positive, negatives) triplets from the extracted QA pairs

def generate_contrastive_triplets(
    qa_list: List[Dict],
    answer_context_map: Dict[str, List[str]],
    num_negatives: int = 3
) -> List[Dict[str, Any]]:
    """Generate (anchor, positive, negatives) triplets.

    For each QA record the question is the anchor, its context is the
    positive, and the negatives are sampled from contexts that are neither
    the positive nor associated with the same answer text.

    Args:
        qa_list: Records with ``"question"``, ``"context"`` and ``"answer"``.
        answer_context_map: Answer text -> list of contexts for that answer.
        num_negatives: Maximum number of negatives per triplet; fewer are
            returned when the candidate pool is smaller.

    Returns:
        One dict per record with keys ``anchor``, ``positive`` and
        ``negatives`` (a list of distinct contexts).

    Sampling uses the module-level ``random`` generator. Candidates are kept
    in sorted order, so seeding ``random`` makes the output reproducible.
    """
    # Build the distinct-context list once instead of once per question.
    # Sorting removes the dependence on set iteration order.
    all_contexts = sorted({
        ctx
        for ctx_list in answer_context_map.values()
        for ctx in ctx_list
    })

    triplets = []

    for qa in qa_list:
        anchor = qa["question"]
        positive = qa["context"]
        answer = qa["answer"]

        # A context can be listed under several answers. Exclude everything
        # tied to this answer, and the positive itself, so a negative can
        # never equal the positive.
        excluded = set(answer_context_map.get(answer, ()))
        excluded.add(positive)
        candidates = [ctx for ctx in all_contexts if ctx not in excluded]

        sampled_negatives = random.sample(
            candidates,
            min(num_negatives, len(candidates))
        )

        triplets.append({
            "anchor": anchor,
            "positive": positive,
            "negatives": sampled_negatives
        })

    return triplets

In [46]:
# End to End Pipeline

def generate_triplets_from_file(
    file_path: str,
    num_negatives: int = 3
) -> List[Dict[str, Any]]:
    """Run the full pipeline on one JSON file and return its triplets.

    Loads the file, extracts the QA records, groups contexts by answer text,
    and generates triplets with at most ``num_negatives`` negatives each.
    """
    qa_records = validate_and_extract_qa(load_json(file_path))
    return generate_contrastive_triplets(
        qa_records,
        build_answer_context_map(qa_records),
        num_negatives,
    )

In [47]:
# Seed the global RNG that random.sample draws from, so re-running this cell
# yields the same negatives instead of a fresh random draw each time.
SEED = 42
NUM_NEGATIVES = 3  # negatives sampled per (anchor, positive) pair
random.seed(SEED)

train_triplets = generate_triplets_from_file("amqa_data/train_data.json", num_negatives=NUM_NEGATIVES)
dev_triplets = generate_triplets_from_file("amqa_data/dev_data.json", num_negatives=NUM_NEGATIVES)
test_triplets = generate_triplets_from_file("amqa_data/test_data.json", num_negatives=NUM_NEGATIVES)

In [52]:
# Summary counts, then one pretty-printed example triplet per split.
print("Result Representation")
print(f"Number of Triplets: Training {len(train_triplets)}, Dev {len(dev_triplets)}, Test {len(test_triplets)}")

# (heading, triplet list, index of the example to show)
sample_views = (
    ("Sample Triplet from Train Triplets:", train_triplets, 10),
    ("Sample Triplet from Dev Triplets:", dev_triplets, -10),
    ("Sample Triplet from Test Triplets:", test_triplets, 10),
)
for heading, split_triplets, sample_index in sample_views:
    print(heading)
    # ensure_ascii=False keeps the Amharic text readable instead of \u escapes.
    print(json.dumps(split_triplets[sample_index], indent=2, ensure_ascii=False))

Result Representation
Number of Triplets: Training 1723, Dev 595, Test 299
Sample Triplet from Train Triplets:
{
  "anchor": "ብርጋዴር ጄነራል መአሾ በኢትዮጵያ አየር ኃይል ውስጥ ለስንት ጊዜ አገልግለዋል?",
  "positive": "በኢትዮጵያ አየር ኃይል ውስጥ ለ25 ዓመታት ያገለገሉት ብርጋዴር ጄነራል መአሾ ሀጎስ ስዩም የተባበሩት መንግስታት ድርጅት የደቡብ ሱዳን ሰላም ማስከበር ተልእኮ አዛዥ ሆነው ተሰየሙ። ብርጋዴየር ጄነራል መአሾ  በተመድ የላይቤሪያ እንዲሁም ዳርፉር ሰላም አስከባሪ ኃይል ውስጥ ማገልገላቸውም ታውቋል። አዲሱን ሹመት ከጥር 04 ቀን 2013 ዓ.ም ጀምሮ እንደተሰጣቸውም በተመድ መረጃ ላይ ተገልጿል።",
  "negatives": [
    "ዶናልድ ትራምፕ ከኒው ዮርክ ከአምስቱ ቀጠናዎች አንዱ በሆነው በክዊንስ በእ.ኤ.አ. ጁን 14 1946 ተወለደ። ለእናቱ ሜሪ አን እና ለአባቱ ፍሬድ ትራምፕ ከአምስት ልጆች መሃል አራተኛው ልጃቸው ነበር። እናቱ የተወለደችው በስኮትላንድ ሉዊስ ኤንድ ሃሪስ ደሴት ላይ ቶንግ በተባለው ስፍራ ነው። በእ.ኤ.አ. 1930 በ18 ዓመቷ ዩናይትድ ስቴትስን ጎበኘች እናም ከፍሬድ ትራምፕ ጋር ተገናኘች። በእ.ኤ.አ. 1936 ትዳር ይዘው በጃማይካ ኢስቴትስ ክዊንስ መኖር ጀመሩ። በዚህም ስፍራ ፍሬድ ትራምፕ ታላቅ የሪል ኢስቴት ገንቢ ሆኖ ነበር። ዶናልድ ትራምፕ፥ ሮበርት የተባለ አንድ ወንድም፣ ሜሪአን እና ኤሊዛቤት የተባሉ ሁለት እህቶች አሉት። ፍሬድ ጁኒየር የተባለ ወንድሙ ደግሞ ከአልኮል ሱስ ጋር በተያያዘ ምክንያት ሕይወቱ አልፏል ፤ ይህም ከአልኮሆል መጠጥ እና ከትምባሆ እንዲታቀብ እንዳደረገውም ዶናልድ ትራምፕ ይናገራል። የዶናልድ ትራምፕ አባት ከ