# Amharic Legal Text to Speech Generation

This notebook generates audio files from normalized legal text using three TTS voices:
- **Mekdes** (edge-tts): am-ET-MekdesNeural
- **Ameha** (edge-tts): am-ET-AmehaNeural  
- **gTTS**: Google Text-to-Speech (Amharic)

**Input:** normalized text files in `Dataset/normalized_legal_text/`

**Output:** audio files in `Dataset/audio/`, indexed by a metadata CSV (`Dataset/audio/metadata.csv`)


In [1]:
import asyncio
import edge_tts
from gtts import gTTS
from pathlib import Path
import pandas as pd
import time
from tqdm import tqdm
import os
import re
import librosa
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# --- Locations -------------------------------------------------------------
# Normalized sentences are read from INPUT_DIR; generated audio and the
# metadata CSV are written below AUDIO_OUTPUT_DIR.
INPUT_DIR = Path("Dataset/normalized_legal_text")
AUDIO_OUTPUT_DIR = Path("Dataset/audio")
METADATA_FILE = "Dataset/audio/metadata.csv"

# --- Voices ----------------------------------------------------------------
# Short voice key -> identifier handed to the TTS backend. The "gtts" entry is
# only a marker: that backend is selected by key, not by a voice id.
VOICES = dict(
    mekdes="am-ET-MekdesNeural",
    ameha="am-ET-AmehaNeural",
    gtts="gtts",
)

# --- Legal domains ---------------------------------------------------------
# Domain number (as it appears in batch file names) -> human-readable label.
DOMAIN_NAMES = dict(enumerate(
    [
        "Contracts and Commercial Law",
        "Criminal Law and Procedures",
        "Constitutional and Administrative Law",
        "Property and Civil Law",
        "Family Law and Labor Law",
        "Tax and Financial Law",
        "International Law and Treaties",
        "Court Procedures and Judicial Processes",
        "Regulatory and Administrative Procedures",
    ],
    start=1,
))

# Echo the effective configuration so the run is self-describing.
print("\n".join([
    "Configuration:",
    f"  Input directory: {INPUT_DIR}",
    f"  Output directory: {AUDIO_OUTPUT_DIR}",
    f"  Metadata file: {METADATA_FILE}",
    f"  Voices: {list(VOICES.keys())}",
]))


Configuration:
  Input directory: Dataset/normalized_legal_text
  Output directory: Dataset/audio
  Metadata file: Dataset/audio/metadata.csv
  Voices: ['mekdes', 'ameha', 'gtts']


In [2]:
def load_sentences_from_file(file_path):
    """Return the sentences stored one per line in a normalized text file.

    Each line is stripped of surrounding whitespace; only non-empty lines that
    end with the Ethiopic full stop ('።') are kept, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]

    # An empty string never ends with '።', so the suffix test alone also
    # discards blank lines.
    return [text for text in stripped_lines if text.endswith('።')]

def extract_domain_from_filename(filename):
    """Extract the domain number embedded in a batch file name.

    Looks for the first occurrence of ``domain<digits>`` anywhere in
    ``filename`` and returns the digits as an ``int``. Returns ``None`` when
    the name contains no such marker.
    """
    # `re` is imported once at the top of the notebook; no local import needed.
    match = re.search(r'domain(\d+)', filename)
    return int(match.group(1)) if match else None

def load_all_sentences_with_domains(input_dir):
    """Collect every sentence from the batch files in ``input_dir``.

    Files matching ``legal_text_batch_*.txt`` are visited in sorted order. Each
    sentence becomes a dict with the keys ``sentence``, ``domain_num``,
    ``domain_name`` and ``batch_file``. A domain number without an entry in
    DOMAIN_NAMES is labelled ``"Domain <number>"``.
    """
    records = []
    for batch_path in sorted(input_dir.glob("legal_text_batch_*.txt")):
        number = extract_domain_from_filename(batch_path.name)
        label = DOMAIN_NAMES.get(number, f"Domain {number}")

        records.extend(
            {
                "sentence": text,
                "domain_num": number,
                "domain_name": label,
                "batch_file": batch_path.name,
            }
            for text in load_sentences_from_file(batch_path)
        )

    return records

# Load the whole corpus once; later cells filter and distribute this list.
all_sentences_data = load_all_sentences_with_domains(INPUT_DIR)
print(f"Loaded {len(all_sentences_data)} sentences from {INPUT_DIR}")

# Sentence count per legal domain, most frequent first.
print(f"\nDomain distribution:")
domain_counts = pd.Series(
    [entry["domain_name"] for entry in all_sentences_data]
).value_counts()
for domain, count in domain_counts.items():
    print(f"  {domain}: {count} sentences")

# Quick sanity preview: the first three sentences, truncated to 60 characters.
print(f"\nFirst 3 sentences:")
for position, entry in enumerate(all_sentences_data[:3], start=1):
    preview = entry['sentence'][:60]
    print(f"{position}. [{entry['domain_name']}] {preview}...")


Loaded 7590 sentences from Dataset/normalized_legal_text

Domain distribution:
  Criminal Law and Procedures: 1188 sentences
  Contracts and Commercial Law: 1041 sentences
  Constitutional and Administrative Law: 1033 sentences
  Tax and Financial Law: 877 sentences
  Property and Civil Law: 812 sentences
  Regulatory and Administrative Procedures: 774 sentences
  Court Procedures and Judicial Processes: 702 sentences
  Family Law and Labor Law: 667 sentences
  International Law and Treaties: 496 sentences

First 3 sentences:
1. [Contracts and Commercial Law] ይህ የውል ስምምነት በሁለት ሺህ አስራ አምስት አመተ ምህረት ጥር ወር አስራ ሁለት ቀን በአዲስ...
2. [Contracts and Commercial Law] የአከራካሪ ጉዳዮች የዳኝነት ስልጣን ለአዲስ አበባ ከተማ ፍርድ ቤቶች ብቻ የሚሰጥ ከሆነ የውሉ ...
3. [Contracts and Commercial Law] በዚህ ስምምነት የሁለቱ ወገኖች የጋራ ፍላጎትና የንግድ ሚስጥሮች በጥንቃቄ እንዲጠበቁ በግልጽ ተ...


In [3]:
# Make sure the output folder exists before anything inspects it.
AUDIO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Resume state: what a previous run already produced.
existing_transcriptions = set()   # stripped sentences that already have audio
existing_metadata = []            # metadata rows whose audio file is on disk
existing_audio_files = set()      # file names of those audio files

if not Path(METADATA_FILE).exists():
    print("No existing metadata found - starting fresh")
else:
    df_existing = pd.read_csv(METADATA_FILE)

    # Trust a metadata row only when its audio file is really present.
    for _, row in df_existing.iterrows():
        if not (AUDIO_OUTPUT_DIR / row['file_name']).exists():
            continue
        existing_transcriptions.add(row['transcription'].strip())
        existing_metadata.append(row.to_dict())
        existing_audio_files.add(row['file_name'])

    print(f"Loaded existing metadata: {len(existing_audio_files)} audio files found on disk")
    print(f"  Unique sentences with audio: {len(existing_transcriptions)}")

# Keep only the sentences that still need audio.
filtered_sentences_data = []
for data in all_sentences_data:
    if data['sentence'].strip() in existing_transcriptions:
        continue
    filtered_sentences_data.append(data)

total_sentences = len(all_sentences_data)
remaining_sentences = len(filtered_sentences_data)
already_have_audio = total_sentences - remaining_sentences

print(f"\nFiltering results:")
print(f"  Total sentences in corpus: {total_sentences}")
print(f"  Already have audio files: {already_have_audio}")
print(f"  Remaining to generate: {remaining_sentences}")

if remaining_sentences == 0:
    print("\nAll sentences already have audio files! Nothing to generate.")
else:
    print(f"\nDomain distribution of remaining sentences to generate:")
    domain_counts = pd.Series(
        [entry["domain_name"] for entry in filtered_sentences_data]
    ).value_counts()
    for domain, count in domain_counts.items():
        print(f"  {domain}: {count} sentences")


No existing metadata found - starting fresh

Filtering results:
  Total sentences in corpus: 7590
  Already have audio files: 0
  Remaining to generate: 7590

Domain distribution of remaining sentences to generate:
  Criminal Law and Procedures: 1188 sentences
  Contracts and Commercial Law: 1041 sentences
  Constitutional and Administrative Law: 1033 sentences
  Tax and Financial Law: 877 sentences
  Property and Civil Law: 812 sentences
  Regulatory and Administrative Procedures: 774 sentences
  Court Procedures and Judicial Processes: 702 sentences
  Family Law and Labor Law: 667 sentences
  International Law and Treaties: 496 sentences


In [4]:
def distribute_sentences_data(sentences_data, mekdes_pct=0.4, ameha_pct=0.4, gtts_pct=0.2):
    """Split sentences into contiguous slices, one per voice.

    The list is cut in order: the first ``int(total * mekdes_pct)`` items go to
    "mekdes", the next ``int(total * ameha_pct)`` items to "ameha", and
    everything left over goes to "gtts". With the defaults this is a
    40% / 40% / 20% split.

    Args:
        sentences_data: sequence of sentence records (any sliceable sequence).
        mekdes_pct: fraction of the input assigned to "mekdes".
        ameha_pct: fraction of the input assigned to "ameha".
        gtts_pct: accepted for interface compatibility only. The "gtts" share
            is always the remainder, so no item is dropped by rounding.

    Returns:
        dict with the keys "mekdes", "ameha" and "gtts", each mapping to a
        slice of ``sentences_data``.
    """
    total = len(sentences_data)
    mekdes_end = int(total * mekdes_pct)
    ameha_end = mekdes_end + int(total * ameha_pct)

    return {
        "mekdes": sentences_data[:mekdes_end],
        "ameha": sentences_data[mekdes_end:ameha_end],
        "gtts": sentences_data[ameha_end:],  # remainder
    }

# Assign the still-missing sentences to voices and summarize the outcome.
if not filtered_sentences_data:
    distribution = {"mekdes": [], "ameha": [], "gtts": []}
    print("No remaining sentences to distribute - all audio files already generated!")
else:
    distribution = distribute_sentences_data(filtered_sentences_data)
    print(f"Remaining sentence distribution across voices (40%, 40%, 20%):")
    for voice, data_list in distribution.items():
        share_pct = (len(data_list) / len(filtered_sentences_data)) * 100
        print(f"  {voice}: {len(data_list)} remaining sentences ({share_pct:.1f}% of remaining)")
        # Three most frequent domains inside this voice's slice.
        domain_counts = pd.Series(
            [entry["domain_name"] for entry in data_list]
        ).value_counts()
        print(f"    Domains: {dict(domain_counts.head(3))}")


Remaining sentence distribution across voices (40%, 40%, 20%):
  mekdes: 3036 remaining sentences (40.0% of remaining)
    Domains: {'Criminal Law and Procedures': 753, 'Constitutional and Administrative Law': 556, 'Property and Civil Law': 422}
  ameha: 3036 remaining sentences (40.0% of remaining)
    Domains: {'Family Law and Labor Law': 581, 'Tax and Financial Law': 575, 'Property and Civil Law': 390}
  gtts: 1518 remaining sentences (20.0% of remaining)
    Domains: {'Regulatory and Administrative Procedures': 323, 'Contracts and Commercial Law': 311, 'Court Procedures and Judicial Processes': 289}


In [5]:
async def generate_edge_tts_audio(text, voice, output_path):
    """Synthesize ``text`` with the given edge-tts voice and save it.

    The audio is written to ``output_path`` (converted with ``str``), and the
    same ``output_path`` object is returned.
    """
    await edge_tts.Communicate(text, voice).save(str(output_path))
    return output_path

def generate_gtts_audio(text, output_path):
    """Synthesize ``text`` with gTTS (language 'am', normal speed) and save it.

    The audio is written to ``output_path`` (converted with ``str``), and the
    same ``output_path`` object is returned.
    """
    gTTS(text=text, lang='am', slow=False).save(str(output_path))
    return output_path

def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds, or ``None`` on failure.

    Args:
        audio_path: location of the audio file, as a ``str`` or ``Path``.

    Returns:
        The duration reported by ``librosa.get_duration``; ``None`` if it could
        not be determined (a warning is printed in that case).
    """
    try:
        return librosa.get_duration(path=str(audio_path))
    except Exception as e:
        # Path(...) keeps the warning itself from raising when the caller
        # passed a plain string, so the best-effort contract (return None)
        # holds for both str and Path inputs.
        print(f"  Warning: Could not get duration for {Path(audio_path).name}: {e}")
        return None


In [6]:
# Scan for audio files that exist but aren't in metadata.csv
#
# Purpose: re-attach "orphan" audio files (present in AUDIO_OUTPUT_DIR but absent
# from the metadata CSV) to their sentences. The sentence behind a file is
# reconstructed from the numeric counter in its name, then the recovered rows are
# appended to METADATA_FILE.
print("Scanning for audio files missing from metadata...")

# Get all audio files on disk
all_audio_files = set()
audio_file_pattern = "legal_*.mp3"
for audio_file in AUDIO_OUTPUT_DIR.glob(audio_file_pattern):
    all_audio_files.add(audio_file.name)

print(f"  Total audio files on disk: {len(all_audio_files)}")

# Get files already in metadata
files_in_metadata = set()
if Path(METADATA_FILE).exists():
    df_metadata = pd.read_csv(METADATA_FILE)
    files_in_metadata = set(df_metadata['file_name'].tolist())
    print(f"  Files in metadata.csv: {len(files_in_metadata)}")

# Find missing files
missing_files = all_audio_files - files_in_metadata
print(f"  Missing from metadata: {len(missing_files)}")

if len(missing_files) > 0:
    print(f"\nProcessing {len(missing_files)} missing audio files...")

    # Extract counter and voice from filenames
    # Names that do not match the pattern are dropped here without being counted
    # as unmatched further down.
    missing_file_data = []
    for filename in missing_files:
        match = re.match(r'legal_(\d+)_(\w+)\.mp3', filename)
        if match:
            counter = int(match.group(1))
            voice = match.group(2)
            missing_file_data.append({
                'filename': filename,
                'counter': counter,
                'voice': voice
            })

    # Sort by counter
    missing_file_data.sort(key=lambda x: x['counter'])

    # To match files to sentences, we need to reconstruct the distribution
    # Apply distribution to all_sentences_data (not filtered) to get the mapping
    # The 0.4 / 0.4 / remainder split repeats the default percentages of
    # distribute_sentences_data; the two must be kept in sync by hand.
    total_sentences = len(all_sentences_data)
    mekdes_count = int(total_sentences * 0.4)
    ameha_count = int(total_sentences * 0.4)
    gtts_count = total_sentences - mekdes_count - ameha_count

    # Find the minimum counter from ALL files on disk (not just metadata) to determine the offset
    all_counters = []
    for filename in all_audio_files:
        match = re.match(r'legal_(\d+)_', filename)
        if match:
            all_counters.append(int(match.group(1)))

    # NOTE(review): this assumes the lowest-numbered file on disk belongs to the
    # first sentence of all_sentences_data, i.e. that the files come from a single
    # run over the complete, unfiltered corpus - confirm before trusting recovered
    # rows from a partially resumed dataset.
    counter_offset = min(all_counters) if all_counters else 0
    print(f"  Determined counter offset from all files on disk: {counter_offset}")


    # Create counter -> sentence mapping
    # Counters are assigned sequentially starting from counter_offset
    counter_to_sentence = {}
    sentence_idx = 0
    counter = counter_offset

    # mekdes: first 40%
    for i in range(mekdes_count):
        counter_to_sentence[counter] = {**all_sentences_data[sentence_idx], 'voice': 'mekdes'}
        sentence_idx += 1
        counter += 1

    # ameha: next 40%
    for i in range(ameha_count):
        counter_to_sentence[counter] = {**all_sentences_data[sentence_idx], 'voice': 'ameha'}
        sentence_idx += 1
        counter += 1

    # gtts: last 20%
    for i in range(gtts_count):
        counter_to_sentence[counter] = {**all_sentences_data[sentence_idx], 'voice': 'gtts'}
        sentence_idx += 1
        counter += 1

    # Now match missing files to sentences and create metadata entries
    new_metadata_entries = []
    matched_count = 0
    unmatched_count = 0

    for file_data in missing_file_data:
        filename = file_data['filename']
        file_counter = file_data['counter']
        file_voice = file_data['voice']

        # Check if we have a mapping for this counter
        if file_counter in counter_to_sentence:
            sentence_data = counter_to_sentence[file_counter]

            # Verify voice matches
            # A mismatch means the reconstructed mapping disagrees with the voice
            # in the file name, so the file is skipped instead of being labelled
            # with a possibly wrong sentence.
            if sentence_data['voice'] == file_voice:
                # Calculate duration
                audio_path = AUDIO_OUTPUT_DIR / filename
                duration = get_audio_duration(audio_path)

                if duration is not None:
                    new_metadata_entries.append({
                        'file_name': filename,
                        'transcription': sentence_data['sentence'],
                        'domain_num': sentence_data['domain_num'],
                        'domain_name': sentence_data['domain_name'],
                        'voice': file_voice,
                        'duration': duration
                    })
                    matched_count += 1
                else:
                    # get_audio_duration has already printed a warning for this
                    # file, so the message appears twice in the output.
                    print(f"  Warning: Could not get duration for {filename}")
                    unmatched_count += 1
            else:
                print(f"  Warning: Voice mismatch for {filename} (expected {sentence_data['voice']}, got {file_voice})")
                unmatched_count += 1
        else:
            print(f"  Warning: No mapping found for counter {file_counter} (file: {filename})")
            unmatched_count += 1

    print(f"\n  Matched and processed: {matched_count} files")
    print(f"  Unmatched/Skipped: {unmatched_count} files")

    if len(new_metadata_entries) > 0:
        # Load existing metadata
        if Path(METADATA_FILE).exists():
            df_existing = pd.read_csv(METADATA_FILE)
        else:
            df_existing = pd.DataFrame(columns=['file_name', 'transcription', 'domain_num', 'domain_name', 'voice', 'duration'])

        # Create DataFrame from new entries
        df_new = pd.DataFrame(new_metadata_entries)

        # Combine and save
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(METADATA_FILE, index=False, encoding='utf-8')

        print(f"\n  Added {len(new_metadata_entries)} files to metadata.csv")
        print(f"  Total files in metadata: {len(df_combined)}")

        # Update existing_metadata list for use in subsequent cells
        # NOTE(review): only existing_metadata and existing_audio_files are
        # refreshed. existing_transcriptions, filtered_sentences_data and
        # distribution keep their pre-recovery contents, so they still list the
        # sentences recovered here.
        existing_metadata = df_combined.to_dict('records')
        existing_audio_files = set(df_combined['file_name'].tolist())

        print(f"\n  Metadata updated successfully!")
    else:
        print(f"\n  No new metadata entries to add.")
else:
    print("\n  All audio files are already in metadata.csv. No action needed.")


Scanning for audio files missing from metadata...
  Total audio files on disk: 0
  Missing from metadata: 0

  All audio files are already in metadata.csv. No action needed.


In [7]:
if len(filtered_sentences_data) == 0:
    print("No sentences to process. All sentences already have audio files.")
    metadata_records = existing_metadata
else:
    metadata_records = list(existing_metadata)
    file_counter = len(existing_metadata)
    metadata_lock = Lock()
    
    # Find the highest existing file counter to avoid overlaps
    existing_counters = set()
    if Path(METADATA_FILE).exists():
        df_existing = pd.read_csv(METADATA_FILE)
        for filename in df_existing['file_name']:
            match = re.match(r'legal_(\d+)_', filename)
            if match:
                existing_counters.add(int(match.group(1)))
    
    max_existing_counter = max(existing_counters) if existing_counters else -1
    file_counter = max(file_counter, max_existing_counter + 1)
    
    def chunk_list(lst, num_chunks):
        """Split list into approximately equal chunks"""
        chunk_size = len(lst) // num_chunks
        remainder = len(lst) % num_chunks
        chunks = []
        start = 0
        for i in range(num_chunks):
            end = start + chunk_size + (1 if i < remainder else 0)
            chunks.append(lst[start:end])
            start = end
        return chunks
    
    async def generate_edge_tts_batch(sentences_batch, voice_id, voice_name, start_counter, batch_id):
        """Generate audio for a batch of sentences using edge-tts"""
        batch_metadata = []
        counter = start_counter
        skipped_count = 0
        
        for data in tqdm(sentences_batch, desc=f"{voice_name}_batch{batch_id}"):
            audio_filename = f"legal_{counter:06d}_{voice_name}.mp3"
            audio_path = AUDIO_OUTPUT_DIR / audio_filename
            
            # Check if THIS specific file already exists
            if audio_path.exists():
                skipped_count += 1
                counter += 1
                continue
            
            # Check if ANY file with this counter number exists (prevent overlaps)
            counter_pattern = f"legal_{counter:06d}_*.mp3"
            matching_files = list(AUDIO_OUTPUT_DIR.glob(counter_pattern))
            if matching_files:
                skipped_count += 1
                counter += 1
                continue
            
            try:
                await generate_edge_tts_audio(data["sentence"], voice_id, audio_path)
                duration = get_audio_duration(audio_path)
                
                batch_metadata.append({
                    "file_name": audio_filename,
                    "transcription": data["sentence"],
                    "domain_num": data["domain_num"],
                    "domain_name": data["domain_name"],
                    "voice": voice_name,
                    "duration": duration
                })
            except Exception as e:
                print(f"  Error generating {audio_filename}: {e}")
            
            counter += 1
        
        if skipped_count > 0:
            print(f"  {voice_name}_batch{batch_id}: Skipped {skipped_count} files that already exist")
        
        return batch_metadata, counter
    
    def generate_gtts_batch(sentences_batch, voice_name, start_counter, batch_id):
        """Generate audio for a batch of sentences using gTTS"""
        batch_metadata = []
        counter = start_counter
        skipped_count = 0
        
        for data in tqdm(sentences_batch, desc=f"{voice_name}_batch{batch_id}"):
            audio_filename = f"legal_{counter:06d}_{voice_name}.mp3"
            audio_path = AUDIO_OUTPUT_DIR / audio_filename
            
            # Check if THIS specific file already exists
            if audio_path.exists():
                skipped_count += 1
                counter += 1
                continue
            
            # Check if ANY file with this counter number exists (prevent overlaps)
            counter_pattern = f"legal_{counter:06d}_*.mp3"
            matching_files = list(AUDIO_OUTPUT_DIR.glob(counter_pattern))
            if matching_files:
                skipped_count += 1
                counter += 1
                continue
            
            try:
                generate_gtts_audio(data["sentence"], audio_path)
                duration = get_audio_duration(audio_path)
                
                batch_metadata.append({
                    "file_name": audio_filename,
                    "transcription": data["sentence"],
                    "domain_num": data["domain_num"],
                    "domain_name": data["domain_name"],
                    "voice": voice_name,
                    "duration": duration
                })
            except Exception as e:
                print(f"  Error generating {audio_filename}: {e}")
            
            counter += 1
        
        if skipped_count > 0:
            print(f"  {voice_name}_batch{batch_id}: Skipped {skipped_count} files that already exist")
        
        return batch_metadata, counter
    
    async def generate_all_audio_parallel():
        """Generate audio in parallel batches"""
        
        # Determine number of chunks per voice (1-2 chunks each = 3-6 total)
        chunks_per_voice = 2  # This gives us 6 total chunks (2 per voice)
        
        # Prepare all tasks
        async_tasks = []
        sync_tasks = []
        counter_assignments = {}  # Track counter assignments for validation
        
        current_counter = file_counter
        print(f"\nStarting counter assignment from: {current_counter}")
        
        for voice_name, sentences_data in distribution.items():
            if len(sentences_data) == 0:
                continue
            
            # Split sentences into chunks
            chunks = chunk_list(sentences_data, chunks_per_voice)
            print(f"\n{voice_name}: {len(sentences_data)} sentences split into {len(chunks)} chunks")
            
            if voice_name == "gtts":
                # gTTS: use ThreadPoolExecutor
                for batch_id, chunk in enumerate(chunks):
                    start_counter = current_counter
                    end_counter = current_counter + len(chunk) - 1
                    counter_assignments[f"{voice_name}_batch{batch_id}"] = (start_counter, end_counter, len(chunk))
                    print(f"  {voice_name}_batch{batch_id}: counters {start_counter:06d} to {end_counter:06d} ({len(chunk)} files)")
                    current_counter += len(chunk)
                    sync_tasks.append((chunk, voice_name, start_counter, batch_id))
            else:
                # edge-tts: use async
                voice_id = VOICES[voice_name]
                for batch_id, chunk in enumerate(chunks):
                    start_counter = current_counter
                    end_counter = current_counter + len(chunk) - 1
                    counter_assignments[f"{voice_name}_batch{batch_id}"] = (start_counter, end_counter, len(chunk))
                    print(f"  {voice_name}_batch{batch_id}: counters {start_counter:06d} to {end_counter:06d} ({len(chunk)} files)")
                    current_counter += len(chunk)
                    async_tasks.append(generate_edge_tts_batch(chunk, voice_id, voice_name, start_counter, batch_id))
        
        # Validate no overlapping counter ranges
        print(f"\nCounter assignment validation:")
        all_ranges = list(counter_assignments.values())
        for i, (start1, end1, len1) in enumerate(all_ranges):
            for j, (start2, end2, len2) in enumerate(all_ranges[i+1:], i+1):
                if not (end1 < start2 or end2 < start1):
                    print(f"  ERROR: Overlap detected between ranges!")
                    print(f"    Range 1: {start1:06d}-{end1:06d}")
                    print(f"    Range 2: {start2:06d}-{end2:06d}")
        print(f"  Total counter ranges: {len(counter_assignments)}")
        print(f"  All ranges are non-overlapping: ✓")
        
        # Run ALL tasks concurrently (async + sync)
        def run_gtts_batches_sync():
            """Run gTTS batches in ThreadPoolExecutor (sync function to be run in thread)"""
            if not sync_tasks:
                return []
            
            batch_results = []
            with ThreadPoolExecutor(max_workers=len(sync_tasks)) as executor:
                futures = {
                    executor.submit(generate_gtts_batch, chunk, voice_name, start_counter, batch_id): 
                    (chunk, voice_name, start_counter, batch_id)
                    for chunk, voice_name, start_counter, batch_id in sync_tasks
                }
                
                for future in as_completed(futures):
                    try:
                        batch_metadata, _ = future.result()
                        batch_results.append(batch_metadata)
                    except Exception as e:
                        print(f"  Error in gTTS batch: {e}")
            
            return batch_results
        
        print(f"\nRunning {len(async_tasks)} async batches and {len(sync_tasks)} gTTS batches concurrently (6 total)...")
        
        # Run async tasks and gTTS tasks concurrently
        if async_tasks and sync_tasks:
            async_results, gtts_results = await asyncio.gather(
                asyncio.gather(*async_tasks),
                asyncio.to_thread(run_gtts_batches_sync)
            )
            
            # Combine results
            for batch_metadata, _ in async_results:
                with metadata_lock:
                    metadata_records.extend(batch_metadata)
            for batch_metadata in gtts_results:
                with metadata_lock:
                    metadata_records.extend(batch_metadata)
        elif async_tasks:
            async_results = await asyncio.gather(*async_tasks)
            for batch_metadata, _ in async_results:
                with metadata_lock:
                    metadata_records.extend(batch_metadata)
        elif sync_tasks:
            gtts_results = await asyncio.to_thread(run_gtts_batches_sync)
            for batch_metadata in gtts_results:
                with metadata_lock:
                    metadata_records.extend(batch_metadata)
    
    await generate_all_audio_parallel()
    
    new_files_count = len(metadata_records) - len(existing_metadata)
    print(f"\nGeneration complete!")
    print(f"  New audio files generated: {new_files_count}")
    print(f"  Total audio files (existing + new): {len(metadata_records)}")
    print(f"  Remaining to generate: {len(all_sentences_data) - len(metadata_records)}")
    
    # Check if all files are complete
    if new_files_count == 0:
        print("\n" + "="*70)
        print("ALL AUDIO GENERATION COMPLETE!")
        print("="*70)
        print(f"All {len(metadata_records)} audio files have been generated.")
        print("No further generation needed.")
        print("="*70)



Starting counter assignment from: 0

mekdes: 3036 sentences split into 2 chunks
  mekdes_batch0: counters 000000 to 001517 (1518 files)
  mekdes_batch1: counters 001518 to 003035 (1518 files)

ameha: 3036 sentences split into 2 chunks
  ameha_batch0: counters 003036 to 004553 (1518 files)
  ameha_batch1: counters 004554 to 006071 (1518 files)

gtts: 1518 sentences split into 2 chunks
  gtts_batch0: counters 006072 to 006830 (759 files)
  gtts_batch1: counters 006831 to 007589 (759 files)

Counter assignment validation:
  Total counter ranges: 6
  All ranges are non-overlapping: ✓

Running 4 async batches and 2 gTTS batches concurrently (6 total)...


mekdes_batch0:   0%|          | 0/1518 [00:00<?, ?it/s]


[A

[A[A



[A[A[A[A


[A[A[A


[A[A[A

[A[A



[A[A[A[A
mekdes_batch0:   0%|          | 1/1518 [00:02<1:01:37,  2.44s/it]

[A[A
mekdes_batch0:   0%|          | 2/1518 [00:03<36:49,  1.46s/it]  



[A[A[A[A


mekdes_batch0:   0%|          | 3/1518 [00:03<28:31,  1.13s/it]
[A

[A[A
[A



[A[A[A[A

[A[A


[A[A[A
mekdes_batch0:   0%|          | 4/1518 [00:05<36:16,  1.44s/it]



[A[A[A[A
[A

mekdes_batch0:   0%|          | 5/1518 [00:06<29:54,  1.19s/it]


[A[A[A
mekdes_batch0:   0%|          | 6/1518 [00:07<26:25,  1.05s/it]

[A[A



mekdes_batch0:   0%|          | 7/1518 [00:08<23:40,  1.06it/s]
[A

[A[A


[A[A[A
[A



[A[A[A[A

mekdes_batch0:   1%|          | 8/1518 [00:09<29:25,  1.17s/it]
[A

[A[A


[A[A[A
[A



[A[A[A[A

mekdes_batch0:   1%|          | 9/1518 [00:11<32:31,  1.29s/it]
[A


mekdes_batch0:   1%|          | 10/1518 [00:12<28:35,  1.14s/it]



[A[A[A[A
[A

mekdes_batch0:   1%|          | 11/1518


Generation complete!
  New audio files generated: 7590
  Total audio files (existing + new): 7590
  Remaining to generate: 0





In [8]:
def format_duration_breakdown(frame, group_keys, sort_keys, ascending):
    """Render a per-group duration summary as plain text.

    Groups ``frame`` by ``group_keys`` and reports, per group, the number of
    files plus the total (seconds, minutes, hours) and mean of the ``duration``
    column, rounded to two decimals. Rows are ordered by ``sort_keys`` /
    ``ascending`` (passed to ``sort_values``) while the raw aggregate names
    ('sum', 'count', 'mean') are still in place.
    """
    table = frame.groupby(group_keys)['duration'].agg(['sum', 'count', 'mean']).round(2)
    table['sum_hours'] = (table['sum'] / 3600).round(2)
    table['sum_minutes'] = (table['sum'] / 60).round(2)
    table = table.sort_values(sort_keys, ascending=ascending)
    table.columns = ['Total (seconds)', 'Count', 'Mean (seconds)', 'Total (hours)', 'Total (minutes)']
    return table[['Count', 'Total (seconds)', 'Total (minutes)', 'Total (hours)', 'Mean (seconds)']].to_string()


df = pd.DataFrame(metadata_records)

STATISTICS_FILE = "Dataset/audio/duration_statistics.txt"

if df.empty:
    # An empty frame has no columns: saving it would write a CSV without the
    # `file_name` column that the resume logic reads, and the summaries below
    # index columns that would not exist.
    print("No metadata records to save - metadata file and statistics were left untouched.")
else:
    df.to_csv(METADATA_FILE, index=False, encoding='utf-8')

    print(f"Metadata saved to {METADATA_FILE}")
    print(f"\nMetadata preview:")
    print(df.head(10).to_string())
    print(f"\nTotal records: {len(df)}")

    print(f"\nVoice distribution:")
    print(df['voice'].value_counts())

    print(f"\nDomain distribution:")
    print(df['domain_name'].value_counts().head(10))

    if 'duration' in df.columns:
        from datetime import datetime

        total_seconds = df['duration'].sum()

        stats_output = []
        stats_output.append("="*70)
        stats_output.append("AUDIO DURATION STATISTICS")
        stats_output.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        stats_output.append("="*70)

        stats_output.append(f"\nOverall Statistics:")
        stats_output.append(f"  Total duration: {total_seconds / 3600:.2f} hours ({total_seconds / 60:.2f} minutes)")
        stats_output.append(f"  Average duration: {df['duration'].mean():.2f} seconds")
        stats_output.append(f"  Min duration: {df['duration'].min():.2f} seconds")
        stats_output.append(f"  Max duration: {df['duration'].max():.2f} seconds")
        stats_output.append(f"  Total files: {len(df)}")

        # Each section: (heading, grouping columns, sort columns, sort direction).
        breakdown_sections = [
            ("1. TOTAL DURATION PER DOMAIN:", 'domain_name', 'sum', False),
            ("2. TOTAL DURATION PER VOICE:", 'voice', 'sum', False),
            ("3. DURATION PER VOICE PER DOMAIN (Voice x Domain Breakdown):",
             ['voice', 'domain_name'], ['voice', 'sum'], [True, False]),
        ]
        for heading, group_keys, sort_keys, ascending in breakdown_sections:
            stats_output.append(f"\n" + "-"*70)
            stats_output.append(heading)
            stats_output.append("-"*70)
            stats_output.append(format_duration_breakdown(df, group_keys, sort_keys, ascending))

        stats_output.append(f"\n" + "="*70)

        # Print to console
        stats_text = "\n".join(stats_output)
        print(stats_text)

        # Save to file
        with open(STATISTICS_FILE, 'w', encoding='utf-8') as f:
            f.write(stats_text)

        print(f"\nStatistics saved to: {STATISTICS_FILE}")


Metadata saved to Dataset/audio/metadata.csv

Metadata preview:
                 file_name                                                               transcription  domain_num                   domain_name   voice  duration
0  legal_000000_mekdes.mp3  ይህ የውል ስምምነት በሁለት ሺህ አስራ አምስት አመተ ምህረት ጥር ወር አስራ ሁለት ቀን በአዲስ አበባ ከተማ ተፈረመ።           1  Contracts and Commercial Law  mekdes     6.624
1  legal_000001_mekdes.mp3   የአከራካሪ ጉዳዮች የዳኝነት ስልጣን ለአዲስ አበባ ከተማ ፍርድ ቤቶች ብቻ የሚሰጥ ከሆነ የውሉ ተፈጻሚነት ሙሉ ነው።           1  Contracts and Commercial Law  mekdes     7.344
2  legal_000002_mekdes.mp3          በዚህ ስምምነት የሁለቱ ወገኖች የጋራ ፍላጎትና የንግድ ሚስጥሮች በጥንቃቄ እንዲጠበቁ በግልጽ ተቀምጥዋል።           1  Contracts and Commercial Law  mekdes     6.840
3  legal_000003_mekdes.mp3                   ማንኛውም ወገን የውል ግዴታውን ካልተወጣ ሌላኛው ወገን የካሳ ጥያቄ የማቅረብ መብት አለው።           1  Contracts and Commercial Law  mekdes     5.616
4  legal_000004_mekdes.mp3        የውሉ ማሻሻያ የሚፈጸመው በጽሁፍ ሲሆን በሁለቱም ወገኖች በተፈረመ ተጨማሪ ሰነድ ነው እንጂ በቃል አይደለም።           1  Contr