## Experiment: Creation of Amharic Triplet Dataset 

Dataset used: AmQA (Amharic Question Answering)

Initial AmQA Dataset: 
https://github.com/semantic-systems/amharic-qa

Author: Abdulmunim J. Jemal

Addis Ababa Institute of Technology

## Approach

We transformed the original QA dataset into two types of triplet datasets:

### 1. **Context-Based Triplets**
- **Anchor**: Question
- **Positive**: Context containing the correct answer
- **Negatives**: Contexts from other answers (sampled randomly)
- **Use Case**: Useful for tasks like passage retrieval or contrastive learning.

### 2. **Answer-Based Triplets**
- **Anchor**: Question
- **Positive**: Correct answer text
- **Negatives**: Answers from other questions (sampled randomly)
- **Use Case**: Ideal for question-answer matching or distractor generation.

#### Key Steps:
1. **Data Validation**: Ensured the dataset schema was correct and filtered invalid entries.
2. **Grouping**: Grouped contexts or answers for efficient sampling.
3. **Triplet Generation**: Created triplets by pairing questions with their positives and sampling negatives from unrelated contexts/answers.
4. **Flexibility**: Added a `mode` parameter to switch between the two approaches.
5. **Save**: Finally, saved the generated triplets to CSV files (one per split); only the context-based triplets are written out.

**Note:** For our use case, we will focus on context-based triplets.

This modular pipeline ensures clean, reusable, and schema-compliant triplet generation.



In [76]:
import json
import os

path = './amqa_data/'
files = ['dev_data.json', 'test_data.json', 'train_data.json']

def load_data(files=files, path=path):
    """Load several JSON split files and return their parsed content by split name.

    Args:
        files: File names to read. The text before the first underscore of each
            name becomes the key of the result ('dev_data.json' -> 'dev').
        path: Directory that contains the files.

    Returns:
        A dict mapping split name to the parsed JSON content of that file.
    """
    data = {}
    for file in files:
        # The dataset is Amharic text; decode as UTF-8 explicitly instead of
        # relying on the platform's default encoding.
        with open(os.path.join(path, file), encoding="utf-8") as f:
            data[file.split('_')[0]] = json.load(f)
    return data

# Load every split using load_data's default file list and directory; the result
# is keyed by the prefix of each file name ('dev', 'test', 'train').
data = load_data()

In [2]:
import json

def get_json_skeleton(data):
    """Return the structural outline of a JSON-like value.

    Dicts keep their keys, with every value replaced by its own skeleton.
    Lists are summarised by the skeleton of their first element only, so
    heterogeneous lists are not fully described. Any other value is replaced
    by the lower-cased name of its type (e.g. 'str', 'int', 'nonetype').

    An empty list yields an empty list.
    """
    if isinstance(data, dict):
        return {key: get_json_skeleton(value) for key, value in data.items()}
    if isinstance(data, list):
        # The conditional applies to the whole list, so [] maps to [] rather
        # than to a list that contains an empty list.
        return [get_json_skeleton(data[0])] if data else []
    # Scalars are replaced with a generic type-name placeholder.
    return type(data).__name__.lower()

In [3]:
# Every split must expose the same structure; compare train and test against dev.
_dev_skeleton = get_json_skeleton(data['dev'])
assert get_json_skeleton(data['train']) == _dev_skeleton
assert _dev_skeleton == get_json_skeleton(data['test'])
# Display the shared structure as the cell output.
_dev_skeleton

{'data': [{'paragraphs': [{'qas': [{'question': 'str',
       'id': 'int',
       'answers': [{'answer_id': 'int',
         'document_id': 'int',
         'question_id': 'int',
         'text': 'str',
         'answer_start': 'int',
         'answer_end': 'int',
         'answer_category': 'nonetype'}],
       'is_impossible': 'bool'}],
     'context': 'str',
     'document_id': 'int'}]}],
 'version': 'str'}

## Data Extraction Logic

In [58]:
# Step 1 - Load and Validate Data

from typing import Dict, List, Any, Literal
import random

In [59]:
def load_json(file_path: str) -> Dict:
    """Load a JSON file and check that it is an object with a "data" key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed top-level JSON object.

    Raises:
        ValueError: If the top-level JSON value is not an object or has no
            "data" key.
    """
    # Decode explicitly as UTF-8 so Amharic text does not depend on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The isinstance check keeps scalar top-level values (e.g. a bare number)
    # from raising TypeError on the membership test.
    if not isinstance(data, dict) or "data" not in data:
        raise ValueError(f"Invalid file format in {file_path}")
    return data

In [66]:
def _as_list(value: Any) -> List:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def validate_and_extract_qa(raw_data: Dict) -> List[Dict]:
    """Extract answerable QA pairs from the nested dataset structure.

    Malformed entries are skipped at the smallest possible granularity: a bad
    answer, question or paragraph is dropped on its own and never discards the
    remaining entries of the same document.

    A question is kept when it:
      * has the keys "question", "id" and "answers",
      * is not flagged ``is_impossible`` (a missing flag counts as answerable),
      * has at least one answer with "text", "answer_start" and "answer_end".

    Only the first valid answer of each question is used.

    Args:
        raw_data: Parsed dataset; ``raw_data["data"]`` is iterated as the list
            of documents.

    Returns:
        One dict per kept question with the keys "question", "answer",
        "question_id", "context" and "document_id".

    Raises:
        KeyError: If ``raw_data`` has no "data" key.
    """
    required_qa_keys = ("question", "id", "answers")
    required_answer_keys = ("text", "answer_start", "answer_end")
    qa_list = []

    for document in raw_data["data"]:
        if not isinstance(document, dict):
            continue

        for paragraph in _as_list(document.get("paragraphs")):
            # document_id is copied into every record, so a paragraph without
            # it cannot produce a complete record.
            if not isinstance(paragraph, dict) or "document_id" not in paragraph:
                continue
            context = paragraph.get("context", "")

            for qa in _as_list(paragraph.get("qas")):
                if not isinstance(qa, dict):
                    continue
                if not all(key in qa for key in required_qa_keys):
                    continue

                # Skip unanswerable questions. The flag is optional, so its
                # absence must not discard an otherwise valid question.
                if qa.get("is_impossible", False):
                    continue

                first_valid_answer = next(
                    (
                        ans
                        for ans in _as_list(qa["answers"])
                        if isinstance(ans, dict)
                        and all(key in ans for key in required_answer_keys)
                    ),
                    None,
                )
                if first_valid_answer is None:
                    continue

                qa_list.append({
                    "question": qa["question"],
                    "answer": first_valid_answer["text"],
                    "question_id": qa["id"],
                    "context": context,
                    "document_id": paragraph["document_id"],
                })
    return qa_list

In [67]:
# --------------------------
# Triplet Generation Core
# --------------------------
def build_answer_answer_map(qa_list: List[Dict]) -> Dict[int, str]:
    """Return a lookup from each question_id to that question's answer.

    When a question_id occurs more than once, the last entry's answer is kept.
    """
    answers_by_question: Dict[int, str] = {}
    for entry in qa_list:
        question_id = entry["question_id"]
        answers_by_question[question_id] = entry["answer"]
    return answers_by_question

def generate_answer_triplets(
    qa_list: List[Dict],
    num_negatives: int = 3
) -> List[Dict[str, Any]]:
    """Generate (question, correct_answer, other_answers) triplets.

    Negatives are sampled without replacement from the distinct answer texts
    of the dataset, excluding the current question's own answer text. This
    keeps an identical answer given to another question from being used as a
    negative for this one.

    Args:
        qa_list: Entries with at least the keys "question" and "answer".
        num_negatives: Maximum number of negatives per triplet; fewer are
            returned when not enough distinct other answers exist.

    Returns:
        One dict per entry with the keys "anchor", "positive" and "negatives".
    """
    # Deduplicate once, preserving first-seen order. Unlike list(set(...)),
    # the order does not depend on string hashing, so results are
    # reproducible under a fixed random seed.
    unique_answers = list(dict.fromkeys(qa["answer"] for qa in qa_list))

    triplets = []
    for qa in qa_list:
        current_answer = qa["answer"]

        candidates = [ans for ans in unique_answers if ans != current_answer]
        sampled_negatives = random.sample(
            candidates,
            min(num_negatives, len(candidates))
        )

        triplets.append({
            "anchor": qa["question"],
            "positive": current_answer,
            "negatives": sampled_negatives
        })

    return triplets

In [68]:
# Context-based triplets: (question, correct context, contexts of other answers)

def generate_contrastive_triplets(
    qa_list: List[Dict],
    answer_context_map: Dict[str, List[str]],
    num_negatives: int = 3
) -> List[Dict[str, Any]]:
    """Generate (anchor, positive, negatives) triplets from questions and contexts.

    The anchor is the question and the positive is the question's own context.
    Negatives are sampled without replacement from the distinct contexts in
    ``answer_context_map``, excluding the positive context itself and every
    context listed under the question's own answer.

    Excluding the positive matters because one paragraph is usually shared by
    several questions with different answers, so it also appears under other
    answers in the map.

    Args:
        qa_list: Entries with at least the keys "question", "context" and
            "answer".
        answer_context_map: Maps an answer text to the contexts associated
            with it.
        num_negatives: Maximum number of negatives per triplet; fewer are
            returned when not enough eligible contexts exist.

    Returns:
        One dict per entry with the keys "anchor", "positive" and "negatives".
    """
    # Deduplicate once, preserving first-seen order. Unlike list(set(...)),
    # the order does not depend on string hashing, so results are
    # reproducible under a fixed random seed.
    all_contexts = list(dict.fromkeys(
        ctx
        for ctx_list in answer_context_map.values()
        for ctx in ctx_list
    ))

    triplets = []
    for qa in qa_list:
        positive = qa["context"]

        # Contexts that must never be offered as negatives for this question.
        excluded = set(answer_context_map.get(qa["answer"], ()))
        excluded.add(positive)

        candidates = [ctx for ctx in all_contexts if ctx not in excluded]
        sampled_negatives = random.sample(
            candidates,
            min(num_negatives, len(candidates))
        )

        triplets.append({
            "anchor": qa["question"],
            "positive": positive,
            "negatives": sampled_negatives
        })

    return triplets

In [69]:
# --------------------------
# Unified Interface
# --------------------------
def generate_triplets_from_file(
    file_path: str,
    num_negatives: int = 3,
    mode: Literal["context", "answer"] = "context"
) -> List[Dict[str, Any]]:
    """
    Unified triplet generator with mode switching
    - 'context' mode: (question, context, other_contexts)
    - 'answer' mode: (question, correct_answer, other_answers)

    Args:
        file_path: Path of the dataset JSON file, passed to load_json.
        num_negatives: Number of negatives requested per triplet.
        mode: Which kind of triplet to build.

    Returns:
        The triplets produced by the generator selected by ``mode``.

    Raises:
        ValueError: If ``mode`` is neither 'context' nor 'answer'. This is
            checked before the file is read.
    """
    # Fail fast: reject a bad mode before spending time on file I/O and parsing.
    if mode not in ("context", "answer"):
        raise ValueError(f"Invalid mode: {mode}. Choose 'context' or 'answer'")

    raw_data = load_json(file_path)
    qa_list = validate_and_extract_qa(raw_data)

    if mode == "context":
        # NOTE(review): build_answer_context_map is not defined in the cells
        # visible here; confirm it is defined before this function is called.
        answer_context_map = build_answer_context_map(qa_list)
        return generate_contrastive_triplets(qa_list, answer_context_map, num_negatives)

    return generate_answer_triplets(qa_list, num_negatives)


In [70]:
# Build both triplet flavours for every split:
#   context mode: question - right context - wrong contexts
#   answer mode:  question - right answer  - wrong answers
train_context_triplets = generate_triplets_from_file(
    file_path="amqa_data/train_data.json", num_negatives=5, mode="context"
)
train_answer_triplets = generate_triplets_from_file(
    file_path="amqa_data/train_data.json", num_negatives=5, mode="answer"
)

dev_context_triplets = generate_triplets_from_file(
    file_path="amqa_data/dev_data.json", num_negatives=5, mode="context"
)
dev_answer_triplets = generate_triplets_from_file(
    file_path="amqa_data/dev_data.json", num_negatives=5, mode="answer"
)

test_context_triplets = generate_triplets_from_file(
    file_path="amqa_data/test_data.json", num_negatives=5, mode="context"
)
test_answer_triplets = generate_triplets_from_file(
    file_path="amqa_data/test_data.json", num_negatives=5, mode="answer"
)

In [71]:
# Report the number of triplets per split and mode in a single write.
print("\n".join([
    "Results:",
    f"Train Context Triplets: {len(train_context_triplets)}",
    f"Train Answer Triplets: {len(train_answer_triplets)}",
    f"Dev Context Triplets: {len(dev_context_triplets)}",
    f"Dev Answer Triplets: {len(dev_answer_triplets)}",
    f"Test Context Triplets: {len(test_context_triplets)}",
    f"Test Answer Triplets: {len(test_answer_triplets)}",
]))

Results:
Train Context Triplets: 1343
Train Answer Triplets: 1343
Dev Context Triplets: 504
Dev Answer Triplets: 504
Test Context Triplets: 288
Test Answer Triplets: 288


In [73]:
# Show the first training triplet of each type as pretty-printed JSON
# (ensure_ascii=False keeps the Amharic text readable).
for _heading, _triplet_set in (
    ("Train - Context Triplets", train_context_triplets),
    ("Train - Answer Triplets", train_answer_triplets),
):
    print(_heading)
    print(json.dumps(_triplet_set[0], indent=2, ensure_ascii=False))

Train - Context Triplets
{
  "anchor": "የታክስ ገቢ ከ2010-2012 በመቶኛ የምን ያህል መጠን እድገት አሳየ?",
  "positive": "ጠቅላይ ሚኒስትር ዐቢይ አሕመድ ከ2010 ጀምሮ በፋይናንሱ ዘርፍ ስኬታማ ለውጦች መመዝገባቸውን ገለጹ፡፡ ጠቅላይ ሚኒስትር ዐቢይ የፋይናንስ ዘርፍ ዐበይት ስኬቶች በሚል በማህበራዊ ትስስር ገፃቸው ላይ እንዳስታወቁት የታክስ ገቢ በ2010 ከነበረበት 229 ቢሊየን ብር በ2012 የ36 በመቶ ጭማሪ በማሳየት 311 ቢሊየን ማድረስ ተችሏል።",
  "negatives": [
    "ፋሲለደስ ዓፄ ፋሲለደስ ወይም ዓፄ ፋሲል (የዙፋን ስማቸው ዓለም ሰገድ)  ከአባታቸው አፄ ሱሰኒዮስ  እና እናታቸው ልዕልት ስልጣነ ምገሴ  በመገዛዝ፣ ሸዋ ህዳር 10፣ 1603  (እ.ኤ.አ) ተወለዱ። የነገሱበትም ዘመን ከ1632  እስከ ጥቅምት 18, 1667 (እ.ኤ.አ) ነበር። በስረፀ ክርስቶስ በተመራው አመፅ ምክንያት በ1630 ፋሲለደስ ለንግስና ቢበቃም፣ ዘውዱን ግን እስከ 1632 አልጫነም ነበር። ሲመተ ንግስናው በ1632 እንደተገባደደ የመጀመሪያው ስራው የተዋህዶ ቤ/ክርስቲያንን የቀድሞው ቁመና መመለስና የካቶሊኮችን መሬት በመቀማት ከደንካዝ በማባረር በፍሪሞና እንዲወሰኑ ማድረግ ነበር። ወዲያውም በማከታተል ከግብፅ አገር አዲስ ጳጳስ እንዲላክለት በማድረግ በአባቱ ዘመን እንዲደበዝዝ ተደርጎ የነበረውን የግብፅና ኢትዮጵያ አብያተ ክርስቲያናት ግንኙነት እንዲጸና አደረገ።  በኬኒያ የሚገኘው የሞምባሳ ወደብ በፖርቱጋሎች መደብደቡን ሲሰማ፣ የሮማው ፓፓ ከበስትጀርባው ያለበት ሴራ ነው በማለት በምድሩ የነበሩትን የካቶሊክ ጀስዊቶች በመሰብሰብ አባረራቸው። አፄ ፋሲል አዘዞ ተብላ በምትታወቀው ከጎንደር ከተማ 5 ማይል ርቃ በምትገኘው ከተማ ጥ

In [74]:
# Step 2 - Save Triplets as CSV

import csv
def save_triplets_to_csv(triplets, file_path):
    """Write triplets to a UTF-8 CSV file, one row per triplet.

    The column names are the keys of the first triplet. Non-string values
    (such as the list of negatives) are written by the csv module as their
    ``str()`` representation.

    Saving is best-effort: I/O and CSV formatting errors are reported on
    stdout instead of being raised. An empty ``triplets`` sequence is reported
    and no file is created.

    Args:
        triplets: Sequence of dicts that share the same keys.
        file_path: Destination path of the CSV file.
    """
    if not triplets:
        # Without a first row there are no column names to write.
        print(f"No triplets to save; skipped writing {file_path}")
        return

    try:
        # Encode explicitly as UTF-8 so Amharic text does not depend on the
        # platform's default encoding; newline="" is required by the csv module.
        with open(file_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(triplets[0].keys()))
            writer.writeheader()
            writer.writerows(triplets)
        print(f"Saved {len(triplets)} triplets to {file_path}")
    except (OSError, csv.Error, ValueError) as e:
        # ValueError covers rows whose keys are not in the header.
        print(f"Failed to save triplets: {e}")
        

In [None]:
folder = "data"
os.makedirs(folder, exist_ok=True)

# For our current use case only the context triplets are written, one CSV per split.
for _split, _split_triplets in (
    ("train", train_context_triplets),
    ("dev", dev_context_triplets),
    ("test", test_context_triplets),
):
    save_triplets_to_csv(_split_triplets, f"{folder}/{_split}.csv")


Saved 1343 triplets to data/train.csv
Saved 504 triplets to data/dev.csv
Saved 288 triplets to data/test.csv
