# Coding: Machine Translation with an RNN

 - Dataset: WMT17 zh-en; keep up to 5M high-quality sentence pairs after filtering (the current filter retains about 1.14M pairs)
 - Model: Seq2seq with an encoder–decoder architecture
 - GPU: 4090

In [4]:
# Download the data & select 5m high-quality pairs

from datasets import load_dataset
import re

# Load the complete training split of the WMT17 "zh-en" configuration.
# Each row is read below as x["translation"]["en"] / x["translation"]["zh"];
# the recorded cell output reports 25,134,743 pairs before filtering.
full_dataset = load_dataset("wmt/wmt17", "zh-en", split="train")

# Length & Ratio filter
def is_high_quality(x):
    """Decide whether one translation row passes the length/ratio filter.

    Args:
        x: A row shaped like ``{"translation": {"en": ..., "zh": ...}}``.

    Returns:
        True when both texts are non-empty, each is between 3 and 100
        characters long, the English/Chinese character-length ratio lies in
        [0.5, 2], and the Chinese text contains at least one character in the
        U+4E00..U+9FFF range; False otherwise.
    """
    pair = x["translation"]
    en, zh = pair["en"], pair["zh"]

    # Missing or empty text on either side disqualifies the pair.
    if not (en and zh):
        return False

    # Lengths are measured in characters, not tokens.
    en_len, zh_len = len(en), len(zh)
    if min(en_len, zh_len) < 3 or max(en_len, zh_len) > 100:
        return False

    # zh_len is at least 3 here, so the division is safe.
    if not 0.5 <= en_len / zh_len <= 2:
        return False

    # The target side must contain at least one CJK unified ideograph.
    return re.search(r'[\u4e00-\u9fff]', zh) is not None

# Run the quality filter over every pair, spread across 10 worker processes.
filtered_dataset = full_dataset.filter(is_high_quality, num_proc=10)

# Cap the working set at 5M pairs; this keeps the leading rows in their
# existing order (no shuffling happens before the cut).
keep_count = min(5_000_000, len(filtered_dataset))
dataset = filtered_dataset.select(range(keep_count))

# Report how many pairs survive each stage.
for label, stage in (
    ("Full Dataset Size: ", full_dataset),
    ("Filtered Dataset Size: ", filtered_dataset),
    ("Dataset Size: ", dataset),
):
    print(label, len(stage))

# Show 10 pairs drawn with a fixed shuffle seed, separated by a divider line.
sample = dataset.shuffle(seed=42).select(range(10))
divider = "-"*100
print(divider)
for row in sample:
    pair = row["translation"]
    print(pair["en"])
    print(pair["zh"])
    print(divider)

Full Dataset Size:  25134743
Filtered Dataset Size:  1141860
Dataset Size:  1141860
----------------------------------------------------------------------------------------------------
Zambia (7)
赞比亚(7)
----------------------------------------------------------------------------------------------------
15:00 to 18:00 Informal consultations (closed) Conference Room 5 (NLB)
下午3:00－6:00 非正式磋商(闭门会议) 第5会议室(北草坪会议大楼)
----------------------------------------------------------------------------------------------------
Spain
西班牙
----------------------------------------------------------------------------------------------------
Mr. Robert Morrison
Robert Morrison先生 加拿大自然资源部
----------------------------------------------------------------------------------------------------
This satisfied the kids, but not the husband.
"孩子们得到了满意的答案, 但她的丈夫却没有。
----------------------------------------------------------------------------------------------------
Shutaro Omura (Japan)
Shutaro Omura（日本）
---------------

In [7]:
# Create PyTorch Dataset and DataLoader for training

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

class TranslationDataset(Dataset):
    """Map-style PyTorch dataset over English -> Chinese translation pairs.

    Rows are tokenized lazily on access with a single shared tokenizer: the
    English text becomes the source sequence and the Chinese text the target.
    """

    def __init__(self, hf_dataset, tokenizer, max_length=100):
        """
        Args:
            hf_dataset: Indexable, sized collection whose rows look like
                ``{"translation": {"en": str, "zh": str}}`` (for example a
                HuggingFace dataset).
            tokenizer: Tokenizer used for both languages. It is called as
                ``tokenizer(text, max_length=..., padding='max_length',
                truncation=True, return_tensors='pt')`` and must return a
                mapping that contains an ``'input_ids'`` tensor.
            max_length: Length every sequence is padded or truncated to.
        """
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of translation pairs."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one string into a fixed-length tensor of token ids."""
        encoded = self.tokenizer(text,
                                 max_length=self.max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_tensors='pt')
        # Assumes the tokenizer returns input_ids with a leading batch axis of
        # size 1 for a single string. Remove only that axis: a bare squeeze()
        # would also drop the sequence axis when max_length == 1 and hand back
        # a 0-d tensor that cannot be batched with the other items.
        return encoded['input_ids'].squeeze(0)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx``.

        Returns:
            dict with
                ``'source_ids'``: token ids of the English text,
                ``'target_ids'``: token ids of the Chinese text,
                ``'source_text'``: the raw English string,
                ``'target_text'``: the raw Chinese string.
        """
        item = self.dataset[idx]
        en_text = item["translation"]["en"]
        zh_text = item["translation"]["zh"]

        return {
            'source_ids': self._encode(en_text),   # English (source)
            'target_ids': self._encode(zh_text),   # Chinese (target)
            'source_text': en_text,
            'target_text': zh_text
        }

def create_dataloaders(dataset, batch_size=32, num_workers=4, train_split=0.95, tokenizer=None):
    """
    Create train and validation DataLoaders from a HuggingFace dataset.

    Args:
        dataset: HuggingFace dataset with translation pairs; it must support
            ``len()`` and ``.select(indices)``.
        batch_size: Batch size for both DataLoaders.
        num_workers: Number of worker processes for data loading.
        train_split: Fraction of the rows assigned to the training set; the
            remainder becomes the validation set.
        tokenizer: Optional tokenizer shared by both datasets. When omitted,
            the "bert-base-multilingual-cased" tokenizer is loaded.

    Returns:
        train_dataloader, val_dataloader, train_dataset, val_dataset
    """

    # Split row indices rather than rows, with a fixed seed so the partition
    # is the same on every run.
    train_size = int(train_split * len(dataset))
    indices = list(range(len(dataset)))
    train_indices, val_indices = train_test_split(indices,
                                                  train_size=train_size,
                                                  random_state=42)

    # Materialise the two subsets of the HuggingFace dataset.
    train_dataset_hf = dataset.select(train_indices)
    val_dataset_hf = dataset.select(val_indices)

    # Fall back to the default multilingual tokenizer only when none is given.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

    # Wrap both subsets so they yield tokenized tensors.
    train_dataset = TranslationDataset(train_dataset_hf, tokenizer)
    val_dataset = TranslationDataset(val_dataset_hf, tokenizer)

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    # Settings common to both loaders; pinned memory is requested only when
    # CUDA is available.
    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )

    # Training batches are reshuffled each epoch; validation order stays fixed.
    train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

    print(f"Train DataLoader: {len(train_dataloader)} batches")
    print(f"Validation DataLoader: {len(val_dataloader)} batches")

    return train_dataloader, val_dataloader, train_dataset, val_dataset

def test_dataloader(dataloader):
    """Test the DataLoader by printing a sample batch"""
    print("\n" + "="*50)
    print("Sample batch from DataLoader:")
    print("="*50)
    
    for batch in dataloader:
        print(f"Batch size: {len(batch['source_text'])}")
        print(f"Source example: {batch['source_text'][0]}")
        print(f"Source tokens: {batch['source_ids'][0]}")
        print(f"Target example: {batch['target_text'][0]}")
        print(f"Target tokens: {batch['target_ids'][0]}")
        break

# Build the loaders, then print one sample batch from each as a sanity check.
# Only the first two return values (the DataLoaders) are needed here. Slicing
# avoids binding the unused datasets to `_`, which IPython uses for the last
# cell output and stops updating once user code assigns to it.
train_dataloader, val_dataloader = create_dataloaders(dataset)[:2]
test_dataloader(train_dataloader)
test_dataloader(val_dataloader)

Train dataset size: 1084767
Validation dataset size: 57093
Train DataLoader: 33899 batches
Validation DataLoader: 1785 batches

Sample batch from DataLoader:
Batch size: 32
Source example:  Thus, business fixed investment is expected to rise in 1996, albeit slowly.
Source tokens: tensor([  101, 33115,   117, 14155, 37770, 37933, 10124, 25973, 10114, 28710,
        10106, 10389,   117, 98892, 22235, 63088,   119,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   