In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module); the
# dataset paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Filter corpus.

In [None]:
"""
Filter a Tamil text corpus.
"""
import argparse
import random
import os
import hashlib
import re
import string
import time
from collections import defaultdict
from tqdm import tqdm  # For progress bar (optional, can be removed if not available)

def get_file_size_mb(file_path):
    """Return the on-disk size of ``file_path`` in MB (bytes / 1024**2)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / bytes_per_mb

def sample_avg_line_size(input_file, sample_size=10000):
    """Estimate the average line size, in UTF-8 bytes, from the head of a file.

    Args:
        input_file: Path to a text file; undecodable bytes are ignored.
        sample_size: Maximum number of leading lines to measure.

    Returns:
        Mean encoded size (newline included) of the sampled lines, or 0.0
        when no line could be sampled (empty file or ``sample_size <= 0``).
    """
    total_size = 0
    count = 0
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            if count >= sample_size:
                break
            total_size += len(line.encode('utf-8'))
            count += 1
    # Nothing sampled: report 0.0 instead of dividing by zero.
    if count == 0:
        return 0.0
    return total_size / count

def get_default_filter_patterns():
    """Return the default regex patterns that mark a line as non-linguistic.

    Each entry is a regex source string intended for ``re.search`` against a
    stripped line; a line matching any pattern is treated as metadata or
    boilerplate and dropped.

    Returns:
        A new list of pattern strings (safe for the caller to modify).
    """
    return [
        # common metadata prefixes
        r'^Labels:',
        r'^Tags:',
        r'^Categories:',
        r'^Posted by:',
        r'^Author:',
        r'^Date:',
        r'^குறிச்சொற்கள்:',

        # date patterns
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
        r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b',
        r'\d{4}\s+(ஜனவரி|பிப்ரவரி|மார்ச்|ஏப்ரல்|மே|ஜூன்|ஜூலை|ஆகஸ்ட்|செப்டம்பர்|அக்டோபர்|நவம்பர்|டிசம்பர்)',
        r'(திங்கள்|செவ்வாய்|புதன்|வியாழன்|வெள்ளி|சனி|ஞாயிறு)க்கிழமை',

        r'Admin\s+\w+day',

        # common blog/website formatting
        r'^\d+\s+comments$',
        r'^Share this:',
        r'^பகிர்:',
        r'Comments\s+-\s+\d+',
        r'Views\s+-\s+\{\{[\w\.]+\}\}',

        # social media markers
        r'^வணக்கம்\s+\w+\s*!',
        r'^நன்றி\s+\w+',
        r'அருமை\.{2,}',
        r'தேனம்மை வருகைக்கு நன்றி',

        # emotional expressions
        r'^ஹ{2,}',
        r'^\){1,}$',

        # copyright
        r'^©',
        r'Copyright\s+\d{4}',
        r'All rights reserved',

        # URL patterns
        r'^https?://',
        r'^www\.',
        r'இருந்து மீள்விக்கப்பட்டது',

        # email patterns
        r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',

        # question-answer indicators
        r'^ANSWER\s*:',
        r'^\d+\)\s+',
        r'^\d+\.\s+',

        # blog/forum patterns
        r'\w+\s+wrote:',
        r'^//.*//\s*$',
        r'^என்னங்க',
        r'சார்!',
        r'மாதிரி$',

        # time indicators
        # The alternatives are grouped so that all three require the leading
        # "நேரம்"; ungrouped, the bare "பகல்" / "மாலை" branches would match
        # (and drop) any line merely containing those words.
        r'நேரம்\s+(?:காலை|பகல்|மாலை)',

        r'ரூபாய்\s+மட்டுமே$',
        r'சர்வீஸ்\s+சார்ஜ்',

        # news headline indicators
        r'!! -\s+\w+\s+News$',
        r'^மோடிக்கு|^இந்நிலையில்',

        r'Edit$',
        r'படங்கள்:',
        r'இதுல என்ன அரசியல்',
        r'சிரிச்சு முடியலை',
    ]

# Compiled once at import time: is_filtered_line runs once per corpus line,
# so these constant character classes should not be rebuilt on every call.
_TAMIL_CHAR_RE = re.compile(r'[\u0B80-\u0BFF]')
_LATIN_CHAR_RE = re.compile(r'[a-zA-Z0-9\s.,;:!?()\[\]{}\'""`~@#$%^&*+=_\\|<>/\-]')


def is_filtered_line(line, min_line_length=10, filter_patterns=None, tamil_ratio_threshold=0.3):
    """Decide whether a corpus line should be dropped.

    Args:
        line: Raw line; surrounding whitespace is stripped before checking.
        min_line_length: Lines shorter than this (in characters) are dropped.
        filter_patterns: Optional iterable of regexes (source strings or
            compiled patterns); a ``re.search`` hit on any of them drops the line.
        tamil_ratio_threshold: Minimum fraction of characters that must fall
            in the Tamil Unicode block (U+0B80-U+0BFF). ``0`` or less
            disables the script check.

    Returns:
        True if the line should be filtered out, False if it should be kept.
    """
    line = line.strip()

    if len(line) < min_line_length:
        return True

    if filter_patterns and any(re.search(pattern, line) for pattern in filter_patterns):
        return True

    if tamil_ratio_threshold <= 0:
        return False

    length = max(1, len(line))  # avoid dividing by zero on an empty line
    tamil_chars = len(_TAMIL_CHAR_RE.findall(line))
    if tamil_chars / length >= tamil_ratio_threshold:
        return False

    # Too little Tamil: the only lines still kept are long (> 50 chars) ones
    # made up almost entirely (> 80%) of Latin letters, digits and punctuation.
    latin_chars = len(_LATIN_CHAR_RE.findall(line))
    return not (latin_chars / length > 0.8 and len(line) > 50)

def _read_stat_count(stats_path, label, default=0):
    """Return the integer on the ``<label>: <n> lines`` row of a .stats file.

    Falls back to ``default`` when the file or the row is missing or cannot
    be parsed.
    """
    try:
        with open(stats_path, 'r', encoding='utf-8') as stats_file:
            for row in stats_file:
                name, _, value = row.partition(':')
                if name.strip() == label:
                    return int(value.split()[0].replace(',', ''))
    except (OSError, ValueError, IndexError):
        return default
    return default


def process_in_chunks(input_file, output_file, target_size_mb, chunk_size=500000, min_line_length=20,
                     tamil_ratio=0.6, use_filtering=True, resume_from_chunk=0):
    """Filter, deduplicate and down-sample a corpus, reading it chunk by chunk.

    Every line is stripped, passed through ``is_filtered_line``, deduplicated
    by MD5 of its text, and then offered to a reservoir sampler sized from
    ``target_size_mb`` and the sampled average line size. After each chunk the
    current reservoir is written to ``<output_file>.part<N>`` and running
    counters to ``<output_file>.stats<N>``; the final reservoir is written to
    ``output_file``.

    Args:
        input_file: Path to input file
        output_file: Path to output file
        target_size_mb: Target size in MB
        chunk_size: Number of lines to process in each chunk
        min_line_length: Minimum line length
        tamil_ratio: Minimum Tamil character ratio
        use_filtering: Whether to use pattern filtering
        resume_from_chunk: Resume from a specific chunk; the reservoir is
            reloaded from ``<output_file>.part<resume_from_chunk>`` (if it
            exists) and that many chunks of input lines are skipped.

    Raises:
        ValueError: If no line can be read from ``input_file``.
    """
    start_time = time.time()

    original_size_mb = get_file_size_mb(input_file)

    filter_patterns = get_default_filter_patterns() if use_filtering else None

    # get target line count based on sampled average line size
    try:
        avg_line_size_bytes = sample_avg_line_size(input_file)
    except ZeroDivisionError:
        # the sampler found no lines at all
        avg_line_size_bytes = 0
    if avg_line_size_bytes <= 0:
        raise ValueError(f"Input file contains no lines: {input_file}")
    target_lines = int((target_size_mb * 1024 * 1024) / avg_line_size_bytes)

    print(f"Original file size: {original_size_mb:.2f} MB")
    print(f"Target file size: {target_size_mb:.2f} MB")
    print(f"Average line size: {avg_line_size_bytes:.2f} bytes")
    print(f"Target number of lines: ~{target_lines:,}")

    # Only used for progress/ETA reporting; clamped so it can serve as a divisor.
    estimated_total_lines = max(1, int((original_size_mb * 1024 * 1024) / avg_line_size_bytes))
    print(f"Estimated total lines: ~{estimated_total_lines:,}")

    reservoir = []
    seen_hashes = set()  # raw MD5 digests of accepted lines, for deduplication
    total_processed = 0
    total_filtered = 0
    unique_count = 0

    pattern_match_counts = {i: 0 for i in range(len(filter_patterns))} if filter_patterns else {}

    # load existing reservoir if resuming
    if resume_from_chunk > 0:
        resume_file = f"{output_file}.part{resume_from_chunk}"
        if os.path.exists(resume_file):
            print(f"Resuming from chunk {resume_from_chunk}, loading existing reservoir...")
            with open(resume_file, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    line = line.strip()
                    reservoir.append(line)
                    # rebuild hash set for deduplication (only lines still in
                    # the reservoir can be recovered)
                    seen_hashes.add(hashlib.md5(line.encode('utf-8')).digest())
            print(f"Loaded {len(reservoir):,} lines from previous chunks")

            # Restore the running counters written with that chunk so the
            # sampler keeps the right replacement probability after a resume.
            stats_file = f"{output_file}.stats{resume_from_chunk}"
            unique_count = max(len(reservoir), _read_stat_count(stats_file, "Unique", 0))
            total_filtered = _read_stat_count(stats_file, "Filtered", 0)

    # process each chunk
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as infile:
        if resume_from_chunk > 0:
            lines_to_skip = resume_from_chunk * chunk_size
            print(f"Skipping {lines_to_skip:,} lines to resume...")
            for _ in tqdm(range(lines_to_skip), desc="Skipping lines"):
                infile.readline()
            total_processed = lines_to_skip

        chunk_num = resume_from_chunk
        while True:
            chunk_num += 1
            chunk_start_time = time.time()

            chunk_lines = []
            for _ in range(chunk_size):
                line = infile.readline()
                if not line:
                    break
                chunk_lines.append(line)

            # End of file is the only stop condition. The line-count estimate
            # comes from a sample of the file head, so stopping on it could
            # silently drop the tail of the corpus.
            if not chunk_lines:
                break

            print(f"\nProcessing chunk {chunk_num} ({len(chunk_lines):,} lines)...")

            chunk_filtered = 0
            chunk_unique = 0

            for line in tqdm(chunk_lines, desc="Filtering and deduplicating"):
                total_processed += 1
                line = line.strip()

                if is_filtered_line(line, min_line_length, filter_patterns, tamil_ratio):
                    total_filtered += 1
                    chunk_filtered += 1

                    # debug: count which patterns are matching
                    if filter_patterns:
                        for j, pattern in enumerate(filter_patterns):
                            if re.search(pattern, line):
                                pattern_match_counts[j] += 1
                    continue

                # apply deduplication
                line_hash = hashlib.md5(line.encode('utf-8')).digest()
                if line_hash in seen_hashes:
                    continue

                seen_hashes.add(line_hash)
                unique_count += 1
                chunk_unique += 1

                # Reservoir sampling over the stream of accepted lines: once
                # the reservoir is full, the i-th accepted line replaces a
                # random slot with probability len(reservoir) / i. The draw is
                # based on unique_count, not on lines that were filtered or
                # deduplicated away, so every accepted line is equally likely
                # to end up in the output.
                if len(reservoir) < target_lines:
                    reservoir.append(line)
                else:
                    j = random.randrange(unique_count)
                    if j < len(reservoir):
                        reservoir[j] = line

            # Clamp so a coarse clock cannot produce a zero divisor.
            chunk_time = max(time.time() - chunk_start_time, 1e-9)
            elapsed_time = max(time.time() - start_time, 1e-9)
            print(f"Chunk {chunk_num} statistics:")
            print(f"  Processed: {len(chunk_lines):,} lines")
            print(f"  Filtered: {chunk_filtered:,} lines ({chunk_filtered/len(chunk_lines)*100:.2f}%)")
            print(f"  Unique: {chunk_unique:,} lines")
            print(f"  Time: {chunk_time:.2f} seconds ({len(chunk_lines)/chunk_time:.2f} lines/sec)")
            print(f"  Processed: {total_processed:,} lines (~{total_processed/estimated_total_lines*100:.2f}% of file)")
            print(f"  Filtered: {total_filtered:,} lines ({total_filtered/total_processed*100:.2f}%)")
            print(f"  Unique: {unique_count:,} lines")
            print(f"  Current reservoir size: {len(reservoir):,} lines")
            print(f"  Elapsed time: {elapsed_time:.2f} seconds")

            lines_per_second = total_processed / elapsed_time
            # The estimate can undershoot; never report a negative ETA.
            remaining_lines = max(0, estimated_total_lines - total_processed)
            estimated_remaining_time = remaining_lines / lines_per_second
            print(f"  Estimated remaining time: {estimated_remaining_time/60:.2f} minutes")

            with open(f"{output_file}.part{chunk_num}", "w", encoding="utf-8") as outfile:
                for line in reservoir:
                    outfile.write(line + "\n")
            print(f"Saved intermediate results to {output_file}.part{chunk_num}")

            with open(f"{output_file}.stats{chunk_num}", "w", encoding="utf-8") as outfile:
                outfile.write(f"Processed: {total_processed:,} lines\n")
                outfile.write(f"Filtered: {total_filtered:,} lines\n")
                outfile.write(f"Unique: {unique_count:,} lines\n")
                outfile.write(f"Reservoir size: {len(reservoir):,} lines\n")

    if filter_patterns and total_filtered > 0:
        print("\nFilter pattern statistics:")
        for j, pattern in enumerate(filter_patterns):
            if pattern_match_counts[j] > 0:
                print(f"Pattern '{pattern}' matched {pattern_match_counts[j]:,} lines")
        print(f"Total filtered: {total_filtered:,} lines ({total_filtered/total_processed*100:.2f}%)")

    with open(output_file, "w", encoding="utf-8") as outfile:
        for line in reservoir:
            outfile.write(line + "\n")

    final_size_mb = get_file_size_mb(output_file)
    total_time = time.time() - start_time

    print(f"\nFinal results:")
    print(f"Final file size: {final_size_mb:.2f} MB")
    print(f"Reduction ratio: {final_size_mb/original_size_mb*100:.4f}%")
    print(f"Lines saved: {len(reservoir):,}")
    print(f"Total processing time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

def main():
    """Run the corpus reduction with the notebook's hard-coded settings."""
    input_path = "/content/drive/My Drive/Colab Notebooks/LRLs/tamil/dataset/ta.txt"
    output_path = "/content/drive/My Drive/Colab Notebooks/LRLs/tamil/dataset/ta_reduced.txt"
    target_size_mb = 6  # target file size in MB

    options = {
        "chunk_size": 500000,     # lines per chunk
        "min_line_length": 20,    # minimum line length
        "tamil_ratio": 0.6,       # minimum ratio of Tamil characters required
        "use_filtering": True,    # enable pattern-based filtering
        "resume_from_chunk": 0,   # start from beginning
    }

    print(f"Reducing {input_path} to approximately {target_size_mb} MB using chunk processing...")

    process_in_chunks(input_path, output_path, target_size_mb, **options)

    print(f"Reduced file saved to {output_path}")

# Entry point: run the corpus reduction when this code is executed directly.
if __name__ == "__main__":
    main()

Split the corpus into train and eval sets.

In [None]:
"""
Split a text corpus into training and evaluation sets.
"""
import os
import re
import argparse
import random
from pathlib import Path
from typing import List, Tuple

def split_into_sentences(text: str) -> List[str]:
    """Break ``text`` into stripped, non-empty sentence/line units.

    A boundary is whitespace that follows ``.``, ``!`` or ``?`` and precedes
    an ASCII capital letter; the punctuation stays with the preceding unit
    and the whitespace is discarded. Each piece is then further split on
    newlines. Because the look-ahead requires ``[A-Z]``, text in caseless
    scripts is effectively split on newlines only.
    """
    boundary = r'(?<=[.!?])\s+(?=[A-Z])'
    units = []
    for chunk in re.split(boundary, text):
        stripped_parts = (part.strip() for part in chunk.split('\n'))
        units.extend(part for part in stripped_parts if part)
    return units

def split_corpus(input_file: str, eval_size: float = 0.05, random_seed: int = 42, sentence_level: bool = True) -> Tuple[List[str], List[str]]:
    """Shuffle a corpus and cut it into training and evaluation units.

    Args:
        input_file: UTF-8 text file, read fully into memory.
        eval_size: Fraction of units assigned to the evaluation set; at
            least one unit is always taken.
        random_seed: Seed applied to the global ``random`` module before
            shuffling.
        sentence_level: Use ``split_into_sentences`` when true, otherwise
            treat every non-blank line as one unit.

    Returns:
        ``(train_units, eval_units)``.
    """
    random.seed(random_seed)

    with open(input_file, 'r', encoding='utf-8') as corpus:
        raw_text = corpus.read()

    # break the text into units (sentences or lines)
    if not sentence_level:
        units = [stripped for stripped in map(str.strip, raw_text.split('\n')) if stripped]
        print(f"Split corpus into {len(units)} lines")
    else:
        units = split_into_sentences(raw_text)
        print(f"Split corpus into {len(units)} sentences")

    random.shuffle(units)

    # the first eval_count shuffled units form the evaluation set
    eval_count = max(1, int(len(units) * eval_size))
    eval_units, train_units = units[:eval_count], units[eval_count:]

    print(f"Training set: {len(train_units)} units ({100 - eval_size*100:.1f}%)")
    print(f"Evaluation set: {len(eval_units)} units ({eval_size*100:.1f}%)")
    return train_units, eval_units

def write_output_files(train_units: List[str], eval_units: List[str], output_dir: str, base_filename: str) -> Tuple[str, str]:
    """Write both splits to ``<stem>_train.txt`` / ``<stem>_eval.txt``.

    ``output_dir`` is created if needed and ``<stem>`` is ``base_filename``
    without its extension. Units are joined with newlines (no trailing
    newline). The resulting file sizes are printed in MB.

    Returns:
        ``(train_path, eval_path)``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stem = Path(base_filename).stem
    train_path = os.path.join(output_dir, f"{stem}_train.txt")
    eval_path = os.path.join(output_dir, f"{stem}_eval.txt")

    for destination, units in ((train_path, train_units), (eval_path, eval_units)):
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write('\n'.join(units))

    bytes_per_mb = 1024 * 1024
    train_mb = os.path.getsize(train_path) / bytes_per_mb
    eval_mb = os.path.getsize(eval_path) / bytes_per_mb
    print(f"Training file size: {train_mb:.2f} MB")
    print(f"Evaluation file size: {eval_mb:.2f} MB")

    return train_path, eval_path

def validate_data_distribution(train_units: List[str], eval_units: List[str]) -> dict:
    """Compare character and word statistics of the two splits.

    Computes unit counts, total and unique character counts, average unit
    length, unique whitespace-separated word counts, and which characters and
    words occur only in the evaluation split. Prints a short report and
    returns all figures in a dict.
    """
    def mean_length(units):
        # average characters per unit; 0 for an empty split
        return sum(len(unit) for unit in units) / len(units) if units else 0

    # character-level view
    train_chars = ''.join(train_units)
    eval_chars = ''.join(eval_units)
    train_charset = set(train_chars)
    eval_charset = set(eval_chars)
    unseen_chars = eval_charset - train_charset

    train_avg_len = mean_length(train_units)
    eval_avg_len = mean_length(eval_units)

    # word-level view (whitespace tokenisation)
    train_vocab = set(' '.join(train_units).split())
    eval_vocab = set(' '.join(eval_units).split())
    unseen_words = eval_vocab - train_vocab
    if eval_vocab:
        unseen_words_pct = len(unseen_words) / len(eval_vocab) * 100
    else:
        unseen_words_pct = 0

    validation = {
        "train_units": len(train_units),
        "eval_units": len(eval_units),
        "train_chars": len(train_chars),
        "eval_chars": len(eval_chars),
        "train_unique_chars": len(train_charset),
        "eval_unique_chars": len(eval_charset),
        "eval_only_chars": len(unseen_chars),
        # at most the first 100 eval-only characters, in sorted order
        "eval_only_chars_list": ''.join(sorted(unseen_chars))[:100],
        "train_avg_unit_length": train_avg_len,
        "eval_avg_unit_length": eval_avg_len,
        "train_unique_words": len(train_vocab),
        "eval_unique_words": len(eval_vocab),
        "eval_only_words": len(unseen_words),
        "eval_only_words_pct": unseen_words_pct,
    }

    print("\nDistribution Validation:")
    print(f"Average unit length - Train: {train_avg_len:.1f}, Eval: {eval_avg_len:.1f} chars")
    print(f"Unique characters - Train: {len(train_charset)}, Eval: {len(eval_charset)}")

    if unseen_chars:
        print(f"Warning: {len(unseen_chars)} characters appear in evaluation but not in training")
        print(f"First few eval-only chars: {validation['eval_only_chars_list'][:20]}...")

    print(f"Unique words - Train: {len(train_vocab)}, Eval: {len(eval_vocab)}")
    print(f"Words only in evaluation set: {len(unseen_words)} ({unseen_words_pct:.1f}%)")

    return validation

def main():
    """Split the reduced corpus into train/eval files using fixed settings."""
    # The CLI options are declared here, but parse_args() is never called in
    # this function; the hard-coded Namespace below supplies the values.
    parser = argparse.ArgumentParser(description="Split a corpus into training and evaluation sets")
    cli_options = (
        ("--input_file", dict(type=str, required=True,
                              help="Path to the input corpus file")),
        ("--output_dir", dict(type=str, default="./",
                              help="Directory to write output files")),
        ("--eval_size", dict(type=float, default=0.05,
                             help="Proportion of corpus to use for evaluation (0.0 to 1.0)")),
        ("--random_seed", dict(type=int, default=42,
                               help="Random seed for reproducible splits")),
        ("--sentence_level", dict(action="store_true",
                                  help="Split at sentence boundaries instead of line boundaries")),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    args = argparse.Namespace(
        input_file="/content/drive/My Drive/Colab Notebooks/LRLs/tamil/dataset/ta_reduced.txt",
        output_dir="/content/drive/My Drive/Colab Notebooks/LRLs/tamil/dataset",
        eval_size=0.05,
        random_seed=42,
        sentence_level=True,
    )

    split_level = 'sentence' if args.sentence_level else 'line'
    print(f"Splitting corpus file: {args.input_file}")
    print(f"Evaluation set size: {args.eval_size * 100:.1f}%")
    print(f"Using {split_level} level splitting")

    train_units, eval_units = split_corpus(
        args.input_file, args.eval_size, args.random_seed, args.sentence_level
    )

    train_path, eval_path = write_output_files(
        train_units, eval_units, args.output_dir, os.path.basename(args.input_file)
    )

    # Called for its printed report; the returned dict is not used here.
    validate_data_distribution(train_units, eval_units)

    print(f"  Training corpus: {train_path}")
    print(f"  Fertility evaluation corpus: {eval_path}")

# Entry point: run the train/eval split when this code is executed directly.
if __name__ == "__main__":
    main()