# English-to-French Translation Using GPT-2
This notebook fine-tunes GPT-2 for English-to-French translation and runs inference with Hugging Face `pipeline`.

In [None]:
# Install dependencies
!pip install transformers datasets accelerate -q

In [None]:
from datasets import load_dataset

# Load a parallel dataset (English-French)
dataset = load_dataset('opus_books', 'en-fr')

# The corpus only ships a 'train' split, so hold out 10% of it for evaluation.
# A fixed seed keeps the train/test partition identical across runs, which
# makes evaluation numbers comparable between experiments.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

print(dataset)

In [None]:
from transformers import AutoTokenizer

model_checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Padding to a fixed length requires a pad token; register '[PAD]' only when
# the loaded tokenizer does not already define one.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

def preprocess(examples):
    """Tokenize a batch of translation pairs into causal-LM training features.

    Every pair is rendered as
    ``"Translate English to French: <en> => <fr>"`` followed by the
    tokenizer's EOS token (when it has one), so the model can learn where a
    translation ends.

    Args:
        examples: A batch as passed by ``dataset.map(..., batched=True)``;
            ``examples['translation']`` is a list of dicts with ``'en'`` and
            ``'fr'`` keys.

    Returns:
        The tokenizer output, truncated/padded to 128 tokens, with an extra
        ``labels`` entry: a copy of ``input_ids`` in which padded positions
        are replaced by -100 so they are ignored by the loss.
    """
    eos = tokenizer.eos_token or ''
    texts = [
        f"Translate English to French: {pair['en']} => {pair['fr']}{eos}"
        for pair in examples['translation']
    ]
    encoded = tokenizer(texts, truncation=True, padding='max_length', max_length=128)

    # A causal LM only returns a loss when it is given labels. Mask with the
    # attention mask (not the pad id) so real tokens are never dropped.
    encoded['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

# Tokenize every split in batches and drop the raw text columns so that only
# the tokenizer's outputs remain; then expose the features as torch tensors.
tokenized_dataset = dataset.map(
    function=preprocess,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
tokenized_dataset.set_format(type='torch')

In [None]:
import inspect

from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
# The tokenizer may contain an added pad token, so size the embedding matrix
# to the tokenizer's vocabulary and record the pad id on the model config.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer). Pick
# whichever name the installed version accepts so the cell runs on both.
args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = 'eval_strategy' if 'eval_strategy' in args_params else 'evaluation_strategy'
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_key = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'

training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # For demo; increase for better results
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    **{eval_strategy_key: 'epoch'},
)

# With mlm=False the collator builds causal-LM `labels` from `input_ids` and
# sets pad positions to -100, so the model returns a loss for every batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    **{tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded later as a single checkpoint.
save_dir = './gpt2-translation'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the model and tokenizer saved in
# './gpt2-translation'.
translator = pipeline(
    task='text-generation',
    model='./gpt2-translation',
    tokenizer='./gpt2-translation',
)

def translate(text, max_new_tokens=60, return_full_text=True):
    """Generate a French translation of ``text`` with the fine-tuned model.

    The input is wrapped in the same prompt format used for training,
    ``"Translate English to French: <text> =>"``, and the model continues it.

    Args:
        text: English text to translate.
        max_new_tokens: Upper bound on the number of generated tokens. Unlike
            ``max_length``, this does not count the prompt, so long inputs
            still leave room for the translation.
        return_full_text: If True (default), the returned string contains the
            prompt followed by the generation; if False, only the generated
            continuation is returned.

    Returns:
        The ``generated_text`` of the first (and only) returned sequence.
    """
    prompt = f'Translate English to French: {text} =>'
    result = translator(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=return_full_text,
    )
    return result[0]['generated_text']

# Quick smoke test on two sample sentences.
for sentence in ('Hello, how are you?', 'I love learning new languages.'):
    print(translate(sentence))