# [3주차] 심화과제: Machine translation(기계 번역)

In [10]:
# !pip install tqdm boto3 requests regex sentencepiece sacremoses datasets safetensors transformers tokenizers matplotlib torchinfo pandas kagglehub

# [MY CODE] Language Translation (English-French) dataset 준비

## ✅ 1. 데이터 불러오기 & 확인

In [11]:
import pandas as pd

# Read the English/French sentence-pair CSV from the working directory.
eng_french_data = pd.read_csv('eng_-french.csv')

# Sanity check in a single call: dimensions, column labels, then the first rows.
print(
    eng_french_data.shape,
    eng_french_data.columns,
    eng_french_data.head(),
    sep='\n',
)

(175621, 2)
Index(['English words/sentences', 'French words/sentences'], dtype='object')
  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !


## ✅ 2. 훈련/테스트 셋 분리 (Train/Test Split)

In [12]:
from sklearn.model_selection import train_test_split

# Training data: 80%, test data: 20%. The fixed random_state keeps the split
# the same from run to run.
train_data, test_data = train_test_split(
    eng_french_data,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each split.
print(
    f"훈련 데이터 크기: {len(train_data)}",
    f"테스트 데이터 크기: {len(test_data)}",
    sep="\n",
)

훈련 데이터 크기: 140496
테스트 데이터 크기: 35125


## ✅ 3. T5 토크나이저 준비 & 토크나이징

In [16]:
import torch
from transformers import T5Tokenizer

# Pretrained T5 tokenizer; other checkpoint sizes: t5-base, t5-large.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

english_column = 'English words/sentences'
french_column = 'French words/sentences'


def _tokenize_column(frame, column):
    """Tokenize every entry of `frame[column]` with the shared settings.

    All calls pad to the longest sequence in the given column and truncate
    anything longer than 512 tokens.
    """
    return tokenizer(
        list(frame[column]),
        padding=True,
        truncation=True,
        max_length=512,
    )


# Tokenize the training and test inputs (English).
train_encodings = _tokenize_column(train_data, english_column)
test_encodings = _tokenize_column(test_data, english_column)

# Tokenize the labels (French).
train_labels = _tokenize_column(train_data, french_column)
test_labels = _tokenize_column(test_data, french_column)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## ✅ 4. 데이터셋 클래스로 변환 (PyTorch Dataset)

In [17]:
import torch

class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source sentences with tokenized targets.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each
            indexable by sample position (a tokenizer batch output or a plain
            dict of lists both work).
        labels: Mapping whose 'input_ids' entry holds the target token ids,
            aligned index-for-index with `encodings`.

    Raises:
        ValueError: If `encodings` and `labels` contain a different number of
            samples.

    NOTE(review): items are returned exactly as stored. If the mappings hold
    plain Python lists (no tensor conversion), PyTorch's default collate is
    documented to collate sequences position-wise, which would yield a list of
    per-position tensors instead of one (batch, seq) tensor — confirm the
    downstream training loop handles that.
    """

    def __init__(self, encodings, labels):
        # Fail early on misaligned inputs instead of hitting an IndexError
        # (or silently dropping targets) in the middle of an epoch.
        n_inputs = len(encodings['input_ids'])
        n_labels = len(labels['input_ids'])
        if n_inputs != n_labels:
            raise ValueError(
                f"encodings and labels must have the same number of samples "
                f"(got {n_inputs} and {n_labels})"
            )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        # Key access (not attribute access) so that plain dicts are supported,
        # consistent with how __getitem__ reads the same mapping.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the input ids, attention mask and target ids of sample `idx`."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels['input_ids'][idx],
        }

# Wrap each split's tokenized inputs and targets in a dataset object.
train_dataset = TranslationDataset(
    encodings=train_encodings,
    labels=train_labels,
)
test_dataset = TranslationDataset(
    encodings=test_encodings,
    labels=test_labels,
)

## ✅ 5. DataLoader 준비

In [18]:
from torch.utils.data import DataLoader

# Number of samples per batch, shared by both loaders.
batch_size = 64

# Training batches are drawn in shuffled order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset's original order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)