Split the corpus into training and evaluation (test) sets.

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the corpus paths
# used further down (which live under that mount point) can be opened.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
Split a text corpus into training and evaluation sets.
"""
import os
import re
import argparse
import random
from pathlib import Path
from typing import List, Tuple

def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentence-like units.

    A sentence boundary is a run of whitespace that directly follows one of
    ``.``, ``!`` or ``?`` and is directly followed by an uppercase letter.
    "Uppercase" is decided with ``str.isupper()``, so capitals carrying
    diacritics or under-dots (outside the ASCII ``A-Z`` range) also start a
    new sentence. The boundary whitespace itself is discarded.

    Each resulting piece is then split on newlines so that paragraphs and
    list items become separate units; every unit is stripped and empty
    units are dropped.

    Args:
        text: The raw corpus text.

    Returns:
        The non-empty, stripped units in their original order.
    """
    # Whitespace run immediately after sentence-final punctuation.
    boundary = re.compile(r'(?<=[.!?])\s+')

    pieces = []
    start = 0
    for match in boundary.finditer(text):
        # Slicing (instead of indexing) yields '' at end of text, and
        # ''.isupper() is False, so trailing whitespace is never a boundary.
        next_char = text[match.end():match.end() + 1]
        if next_char.isupper():
            pieces.append(text[start:match.start()])
            start = match.end()
    pieces.append(text[start:])

    # Break on newlines for paragraphs and lists, dropping blank lines.
    return [
        line.strip()
        for piece in pieces
        for line in piece.split('\n')
        if line.strip()
    ]

def split_corpus(input_file: str, eval_size: float = 0.05, random_seed: int = 42, sentence_level: bool = True) -> Tuple[List[str], List[str]]:
    """Read a corpus file and split it randomly into train and eval units.

    The file is read as UTF-8 and broken into units: sentences (via
    ``split_into_sentences``) when ``sentence_level`` is true, otherwise
    non-empty stripped lines. The units are shuffled with a generator seeded
    by ``random_seed``; the first ``max(1, int(len(units) * eval_size))``
    shuffled units form the evaluation set and the rest the training set.

    Args:
        input_file: Path to the UTF-8 text corpus.
        eval_size: Requested fraction of units for the evaluation set.
        random_seed: Seed that makes the shuffle reproducible.
        sentence_level: Split into sentences instead of lines.

    Returns:
        ``(train_units, eval_units)``.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # split into units (sentences or lines)
    if sentence_level:
        units = split_into_sentences(content)
        print(f"Split corpus into {len(units)} sentences")
    else:
        units = [line.strip() for line in content.split('\n') if line.strip()]
        print(f"Split corpus into {len(units)} lines")

    # A private generator produces the same permutation as seeding the
    # module-level one, but leaves the caller's global random state untouched.
    random.Random(random_seed).shuffle(units)

    # At least one unit is always requested for evaluation.
    eval_count = max(1, int(len(units) * eval_size))
    eval_units = units[:eval_count]
    train_units = units[eval_count:]

    # Report the shares actually obtained; they differ from eval_size when
    # the corpus is small or the count was truncated/raised to 1.
    total = len(units)
    train_pct = len(train_units) / total * 100 if total else 0.0
    eval_pct = len(eval_units) / total * 100 if total else 0.0
    print(f"Training set: {len(train_units)} units ({train_pct:.1f}%)")
    print(f"Evaluation set: {len(eval_units)} units ({eval_pct:.1f}%)")
    return train_units, eval_units

def write_output_files(train_units: List[str], eval_units: List[str], output_dir: str, base_filename: str) -> Tuple[str, str]:
    """Write the train and eval units to ``<stem>_train.txt`` / ``<stem>_eval.txt``.

    ``output_dir`` is created if missing. ``<stem>`` is ``base_filename``
    without its directory and final extension. Each file holds its units
    joined by newlines (no trailing newline), encoded as UTF-8. The size of
    each written file is printed in MB.

    Returns:
        ``(train_path, eval_path)`` as strings.
    """
    os.makedirs(output_dir, exist_ok=True)

    stem = Path(base_filename).stem
    train_path = os.path.join(output_dir, f"{stem}_train.txt")
    eval_path = os.path.join(output_dir, f"{stem}_eval.txt")

    # Write both files first, then measure them.
    for destination, units in ((train_path, train_units), (eval_path, eval_units)):
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write('\n'.join(units))

    bytes_per_mb = 1024 * 1024
    train_mb = os.path.getsize(train_path) / bytes_per_mb
    eval_mb = os.path.getsize(eval_path) / bytes_per_mb
    print(f"Training file size: {train_mb:.2f} MB")
    print(f"Evaluation file size: {eval_mb:.2f} MB")

    return train_path, eval_path

def validate_data_distribution(train_units: List[str], eval_units: List[str]) -> dict:
    """Compare basic character and word statistics of the two splits.

    For each split this counts units, total characters, distinct characters,
    distinct whitespace-separated words and the mean unit length. It also
    records which characters and words occur only in the evaluation split.
    A short report is printed and the figures are returned.

    Returns:
        A dict of the statistics; ``eval_only_chars_list`` holds at most the
        first 100 eval-only characters in sorted order.
    """
    def summarize(units):
        # (total characters, distinct characters, distinct words, mean length)
        char_total = sum(len(unit) for unit in units)
        distinct_chars = {ch for unit in units for ch in unit}
        distinct_words = {word for unit in units for word in unit.split()}
        mean_length = char_total / len(units) if units else 0
        return char_total, distinct_chars, distinct_words, mean_length

    train_char_total, train_unique_chars, train_unique_words, train_avg_len = summarize(train_units)
    eval_char_total, eval_unique_chars, eval_unique_words, eval_avg_len = summarize(eval_units)

    eval_only_chars = eval_unique_chars - train_unique_chars
    eval_only_words = eval_unique_words - train_unique_words

    if eval_unique_words:
        eval_only_words_pct = len(eval_only_words) / len(eval_unique_words) * 100
    else:
        eval_only_words_pct = 0

    validation = {
        "train_units": len(train_units),
        "eval_units": len(eval_units),
        "train_chars": train_char_total,
        "eval_chars": eval_char_total,
        "train_unique_chars": len(train_unique_chars),
        "eval_unique_chars": len(eval_unique_chars),
        "eval_only_chars": len(eval_only_chars),
        # Joining an empty set already gives "", so no special case is needed.
        "eval_only_chars_list": ''.join(sorted(eval_only_chars))[:100],
        "train_avg_unit_length": train_avg_len,
        "eval_avg_unit_length": eval_avg_len,
        "train_unique_words": len(train_unique_words),
        "eval_unique_words": len(eval_unique_words),
        "eval_only_words": len(eval_only_words),
        "eval_only_words_pct": eval_only_words_pct,
    }

    print("\nDistribution Validation:")
    print(f"Average unit length - Train: {train_avg_len:.1f}, Eval: {eval_avg_len:.1f} chars")
    print(f"Unique characters - Train: {len(train_unique_chars)}, Eval: {len(eval_unique_chars)}")

    if eval_only_chars:
        print(f"Warning: {len(eval_only_chars)} characters appear in evaluation but not in training")
        print(f"First few eval-only chars: {validation['eval_only_chars_list'][:20]}...")

    print(f"Unique words - Train: {len(train_unique_words)}, Eval: {len(eval_unique_words)}")
    print(f"Words only in evaluation set: {len(eval_only_words)} ({eval_only_words_pct:.1f}%)")

    return validation

def main(argv=None):
    """Split the corpus, write both files and print a distribution report.

    Args:
        argv: Optional list of command-line style arguments (for example
            ``["--input_file", "corpus.txt", "--sentence_level"]``). When it
            is ``None`` (the default, used by the notebook) the hard-coded
            Colab configuration below is used and nothing is parsed. When a
            list is given it is parsed with the argument parser; in that
            case splitting is line-level unless ``--sentence_level`` is
            passed.
    """
    parser = argparse.ArgumentParser(description="Split a corpus into training and evaluation sets")
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the input corpus file")
    parser.add_argument("--output_dir", type=str, default="./",
                        help="Directory to write output files")
    parser.add_argument("--eval_size", type=float, default=0.05,
                        help="Proportion of corpus to use for evaluation")
    parser.add_argument("--random_seed", type=int, default=42,
                        help="Random seed for reproducible splits")
    parser.add_argument("--sentence_level", action="store_true",
                        help="Split at sentence boundaries instead of line boundaries")

    if argv is None:
        # Notebook configuration: fixed paths on the mounted Drive.
        args = argparse.Namespace(
            input_file="/content/drive/My Drive/Colab Notebooks/LRLs/yoruba/dataset/yo.txt",
            output_dir="/content/drive/My Drive/Colab Notebooks/LRLs/yoruba/dataset",
            eval_size=0.05,
            random_seed=42,
            sentence_level=True
        )
    else:
        args = parser.parse_args(argv)

    print(f"Splitting corpus file: {args.input_file}")
    print(f"Evaluation set size: {args.eval_size * 100:.1f}%")
    print(f"Using {'sentence' if args.sentence_level else 'line'} level splitting")

    train_units, eval_units = split_corpus(
        args.input_file,
        args.eval_size,
        args.random_seed,
        args.sentence_level
    )

    # Output files are named after the input file's stem.
    base_filename = os.path.basename(args.input_file)
    train_path, eval_path = write_output_files(
        train_units,
        eval_units,
        args.output_dir,
        base_filename
    )

    # Called for its printed report; the returned dict is not needed here.
    validate_data_distribution(train_units, eval_units)

    print(f"  Training corpus: {train_path}")
    print(f"  Fertility evaluation corpus: {eval_path}")

# Run the split only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()