This notebook helps you clean a parallel (English–German) dataset using Bicleaner AI.

# 1. Preparation

## 1.1 Before we start, check the operating environment.

In [None]:
import sys
# Show which interpreter runs this kernel and where it looks for modules.
print(sys.executable, sys.path, sep="\n")

In [None]:
import os
# Prepend the virtual environment's site-packages and bin directories so that
# the shell commands started with `!` (which inherit os.environ) find them first.
# NOTE(review): `u____` looks like a placeholder for a user id - presumably it
# must be replaced with your own home directory; confirm before running.
os.environ["PYTHONPATH"] = "/pfs/data5/home/kit/stud/u____/myEnv/lib/python3.9/site-packages:" + os.environ.get("PYTHONPATH", "")
os.environ["PATH"] = "/pfs/data5/home/kit/stud/u____/myEnv/bin:" + os.environ["PATH"]
# Print which python/pip the shell now resolves and the resulting PYTHONPATH.
!which python
!which pip
!echo $PYTHONPATH

## 1.2 Install Bicleaner AI and download the en-de model for our task.

In [None]:
# Install bicleaner-ai together with cython_hunspell taken from GitHub at ref 2.0.3.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which is not
# necessarily the kernel's environment (`%pip` would target the kernel) -
# confirm this is the intended environment.
!pip install bicleaner-ai git+https://github.com/MSeal/cython_hunspell@2.0.3
# Install KenLM from the master archive, passing --max_order=7 as a build option.
!pip install --config-settings="--build-option=--max_order=7" https://github.com/kpu/kenlm/archive/master.zip

In [None]:
# Select the model for your task; here it is the en-de pair.
# Presumably the arguments are: source language, target language, model type
# and download directory - verify against the Bicleaner AI documentation.
!bicleaner-ai-download en de full /pfs/data5/home/kit/stud/u____

# 2. Process

In [None]:
# Print the working directory before and after moving into the sample data folder.
# NOTE: the `%cd` target is a relative path, so re-running this cell without
# restarting the kernel tries to change directory again from the new location.
!echo $PWD
%cd dataset/sample_data
!echo $PWD

In [None]:
# Dataset name; later cells build the input file names `{name}.de-en.de` /
# `{name}.de-en.en` and the `bicleaner.{name}...` output names from it.
name = "News-Commentary"

## 2.1 Merge

In [None]:
def merge_parallel_corpora(german_file, english_file, output_file):
    """Merge two line-aligned corpus files into one two-column TSV file.

    Output line i holds the English sentence, a TAB, then the German sentence,
    both taken from line i of the inputs with surrounding whitespace removed.
    TAB characters inside a sentence are replaced by a space so that every
    output line has exactly two columns.

    Args:
        german_file: Path to the German side (UTF-8, one sentence per line).
        english_file: Path to the English side (UTF-8, one sentence per line).
        output_file: Path of the merged file to write (overwritten).

    Raises:
        ValueError: If the inputs have different numbers of lines. The pairs
            read before the mismatch have already been written, so the output
            file is incomplete and should not be used.
    """
    from itertools import zip_longest

    with open(german_file, 'r', encoding='utf-8') as g_file, \
         open(english_file, 'r', encoding='utf-8') as e_file, \
         open(output_file, 'w', encoding='utf-8') as out_file:

        # zip_longest (instead of zip) makes a length mismatch visible;
        # iterating a file never yields None, so None marks the exhausted side.
        paired_lines = zip_longest(g_file, e_file, fillvalue=None)
        for line_no, (german_line, english_line) in enumerate(paired_lines, start=1):
            if german_line is None or english_line is None:
                shorter = german_file if german_line is None else english_file
                raise ValueError(
                    f"Parallel files are not line-aligned: {shorter} ended before line {line_no}"
                )

            # An embedded TAB would add a column and shift the score column
            # that is parsed after classification.
            english_text = english_line.replace('\t', ' ').strip()
            german_text = german_line.replace('\t', ' ').strip()
            out_file.write(f"{english_text}\t{german_text}\n")

# Input names follow the `<name>.de-en.<lang>` pattern; the merged TSV is a
# temporary file consumed by the classification step.
output_file_path = 'merged.txt'
english_file_path = f'{name}.de-en.en'
german_file_path = f'{name}.de-en.de'

merge_parallel_corpora(
    german_file=german_file_path,
    english_file=english_file_path,
    output_file=output_file_path,
)

## 2.2 Apply Bicleaner

In [None]:
!echo $PWD
# Classify merged.txt into result.txt. --scol 1 / --tcol 2 point at the source
# (English) and target (German) columns written by the merge step.
# NOTE(review): `bicleaner-models` has no leading slash, so it is presumably
# resolved relative to the current directory, while the download step used a
# different directory - confirm the model is actually available here.
!bicleaner-ai-classify --scol 1 --tcol 2 merged.txt result.txt bicleaner-models
# 2000 Years later... (i.e. expect this step to take a long time)

## 2.3 Split back into two files, as in the original

In [None]:
# Minimum score a sentence pair needs in order to be kept;
# pairs scoring below this value are dropped by the filter below.
score_threshold = 0.9

def filter_corpora_and_output_separate_files(input_file, german_output_file, english_output_file, threshold):
    """Write the sentence pairs whose score reaches ``threshold`` to two files.

    Each input line is stripped and split on TAB; the first three fields are
    read as English sentence, German sentence and score. Lines with fewer
    than three fields are ignored. Kept pairs are written one sentence per
    line, in input order, so the two outputs stay line-aligned.

    Args:
        input_file: Path to the scored TSV file (UTF-8).
        german_output_file: Path for the kept German sentences (overwritten).
        english_output_file: Path for the kept English sentences (overwritten).
        threshold: Minimum score (inclusive) for a pair to be kept.

    Raises:
        ValueError: If the third field of a line cannot be parsed as a float.
    """
    with open(input_file, 'r', encoding='utf-8') as scored, \
         open(german_output_file, 'w', encoding='utf-8') as german_sink, \
         open(english_output_file, 'w', encoding='utf-8') as english_sink:

        for record in scored:
            columns = record.strip().split('\t')
            if len(columns) >= 3:
                english_sentence, german_sentence, raw_score = columns[:3]
                # Keep the `>=` form: a NaN score compares False and is dropped.
                if float(raw_score) >= threshold:
                    german_sink.write(german_sentence + "\n")
                    english_sink.write(english_sentence + "\n")

# Read the classifier output and write the kept pairs to `bicleaner.<name>.de-en.<lang>`.
input_file_path = 'result.txt'
english_output_file_path = f'bicleaner.{name}.de-en.en'
german_output_file_path = f'bicleaner.{name}.de-en.de'

filter_corpora_and_output_separate_files(
    input_file=input_file_path,
    german_output_file=german_output_file_path,
    english_output_file=english_output_file_path,
    threshold=score_threshold,
)

## 2.4 Delete Intermediate Files

In [None]:
# Remove the merged TSV created in step 2.1; it is only an intermediate file.
!rm merged.txt

In [None]:
# Remove the scored TSV created in step 2.2; it is only an intermediate file.
!rm result.txt

## 2.5 Split into train/dev/tst

In [None]:
def split_parallel_datasets(german_file_path, english_file_path, german_output_paths, english_output_paths):
    """Split a line-aligned parallel corpus into train/test/dev files (96:2:2).

    Pairs are kept in their original order (no shuffling): the first 96% go
    to "train", the next 2% to "test" and the remainder to "dev". Missing
    parent directories of the output files are created; an output path
    without a directory component is written to the current directory.

    Args:
        german_file_path: Path to the German side (UTF-8, one sentence per line).
        english_file_path: Path to the English side (UTF-8, one sentence per line).
        german_output_paths: Dict with keys "train", "test" and "dev" mapping
            to the German output file paths.
        english_output_paths: Dict with the same keys mapping to the English
            output file paths.

    Raises:
        KeyError: If one of the three split keys is missing from either dict.
    """
    # Imported locally so this cell does not depend on the environment-setup
    # cell (the only other place `os` is imported) having been run.
    import os

    filtered_lines = []
    with open(german_file_path, encoding="utf-8") as german_file, \
         open(english_file_path, encoding="utf-8") as english_file:
        for german_line, english_line in zip(german_file, english_file):
            filtered_lines.append((german_line, english_line))
            # # If want to delete sentences longer than 25
            # if german_line.strip() and english_line.strip() and len(german_line.split()) <= 25:
            #     filtered_lines.append((german_line, english_line))

    print(f"Filtered Dataset size: {len(filtered_lines)}")

    # 96:2:2
    total_samples = len(filtered_lines)
    train_end = int(total_samples * 0.96)
    test_end = int(total_samples * 0.98)
    splits = {
        "train": filtered_lines[:train_end],
        "test": filtered_lines[train_end:test_end],
        "dev": filtered_lines[test_end:],
    }

    for split_name, split_data in splits.items():
        german_path = german_output_paths[split_name]
        english_path = english_output_paths[split_name]

        for output_path in (german_path, english_path):
            output_dir = os.path.dirname(output_path)
            # dirname() is "" for a bare file name, and os.makedirs("") raises
            # FileNotFoundError, so only create a directory when one is named.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        with open(german_path, "w", encoding="utf-8") as german_output, \
             open(english_path, "w", encoding="utf-8") as english_output:
            for german_line, english_line in split_data:
                german_output.write(german_line)
                english_output.write(english_line)


# Inputs are the filtered files written in step 2.3.
german_file_path = f"bicleaner.{name}.de-en.de"
english_file_path = f"bicleaner.{name}.de-en.en"

# Each split key is paired with the stem used in its file name
# (the "test" split is stored under the stem "tst").
german_output_paths = {
    key: f"bicleaner.{name}/{split}.de-en.de"
    for key, split in zip(("train", "test", "dev"), ("train", "tst", "dev"))
}
english_output_paths = {
    key: f"bicleaner.{name}/{split}.de-en.en"
    for key, split in zip(("train", "test", "dev"), ("train", "tst", "dev"))
}

split_parallel_datasets(
    german_file_path=german_file_path,
    english_file_path=english_file_path,
    german_output_paths=german_output_paths,
    english_output_paths=english_output_paths,
)
