# **Translation**
Translation is the task of converting a sequence of text from one language to another. Translation systems are most commonly used to translate text between languages, but they can also be used for speech, or for some combination in between such as text-to-speech or speech-to-text.

We shall fine-tune a pretrained translation model, "opus-mt-en-fr" from "Helsinki-NLP", using TensorFlow on the KDE4 dataset.

### **1. Install and Import Required Libraries**

In [None]:
!pip install datasets transformers[sentencepiece] evaluate sacrebleu

In [None]:
import tensorflow as tf
import numpy as np
import evaluate

from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer, pipeline
from datasets import load_dataset

### **2. Load Data**

In [None]:
# Load the English-French language pair of the KDE4 dataset.
raw_dataset = load_dataset(
    'kde4',
    lang1='en',
    lang2='fr',
)

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

### **3. Preprocess Data**

In [None]:
# Stage 1: sample 11,000 rows for training/validation and hold out 1,000 rows
# for testing. The fixed seed keeps the sampling reproducible.
split_dataset = raw_dataset['train'].train_test_split(
    train_size=11000,
    test_size=1000,
    seed=44,
)
test_dataset = split_dataset.pop('test')

# Stage 2: divide the 11,000-row pool into 10,000 training rows and
# 1,000 validation rows.
split_dataset = split_dataset['train'].train_test_split(
    train_size=10000,
    test_size=1000,
    seed=44,
)

# train_test_split names its second part 'test'; store it as 'validation'
# and re-attach the test rows held out in stage 1.
split_dataset['validation'] = split_dataset.pop('test')
split_dataset['test'] = test_dataset

In [None]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1000
    })
})

In [None]:
model_checkpoint = 'Helsinki-NLP/opus-mt-en-fr'

# No `return_tensors` argument here: the output tensor format is selected when
# the tokenizer is called or by the data collator, not when the tokenizer is loaded.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Helsinki-NLP/opus-mt-en-fr checkpoint only has PyTorch weights
# Library will automatically download and convert PyTorch weights on specifying from_pt=True
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

In [None]:
# Upper bound (in tokens) applied to both source and target sequences.
max_length = 128


def preprocess_function(examples):
  """Tokenize a batch of English/French translation pairs.

  Args:
    examples: A batch whose 'translation' entry is a sequence of mappings
      with 'en' (source) and 'fr' (target) keys.

  Returns:
    The tokenizer output for the English sentences, with the French
    sentences supplied as `text_target`; sequences are truncated to
    `max_length`.
  """
  pairs = examples['translation']
  source_texts = [pair['en'] for pair in pairs]
  target_texts = [pair['fr'] for pair in pairs]

  return tokenizer(
      source_texts,
      text_target=target_texts,
      max_length=max_length,
      truncation=True,
  )

In [None]:
# Tokenize every split in batches and drop the original columns so that only
# the tokenizer outputs remain.
tokenized_dataset = split_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=split_dataset['train'].column_names,
)

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# Collator used to assemble batches; configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors='tf',
)

# Training batches: shuffled, 32 examples each.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset['train'],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches: original order, 16 examples each.
tf_validation_dataset = model.prepare_tf_dataset(
    tokenized_dataset['validation'],
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
)

### **4. Compute Metrics before Fine-tuning the Model**

In [None]:
# SacreBLEU is the metric used to score the generated translations.
metric = evaluate.load('sacrebleu')

# Separate collator for generation that pads to a multiple of 128.
# NOTE(review): presumably this keeps input shapes uniform for the
# XLA-compiled generate function defined below -- confirm.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors='tf',
    pad_to_multiple_of=128,
)

# Test-split batches used for generation: original order, 8 examples each.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_dataset['test'],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

@tf.function(jit_compile=True)
def generate_with_xla(batch):
  """Run `model.generate` on one batch inside a JIT-compiled tf.function.

  Args:
    batch: Mapping providing 'input_ids' and 'attention_mask'.

  Returns:
    Whatever `model.generate` returns for the batch, with at most 128 new
    tokens generated per sequence.
  """
  input_ids = batch['input_ids']
  attention_mask = batch['attention_mask']
  return model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=128,
  )

def compute_metrics():
  """Score the current model on `tf_generate_dataset` with SacreBLEU.

  Generates a translation for every batch, decodes predictions and labels
  back to text, and computes one corpus-level score over all of them.

  Returns:
    A dict with a single key 'bleu' holding the metric's 'score' value.
  """
  hypotheses = []
  references = []

  for features, label_ids in tf_generate_dataset:
    generated_ids = generate_with_xla(features)
    hypothesis_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 is not a real token id, so replace it with the pad token id
    # before decoding the labels.
    label_array = label_ids.numpy()
    label_array = np.where(label_array == -100, tokenizer.pad_token_id, label_array)
    reference_texts = tokenizer.batch_decode(label_array, skip_special_tokens=True)

    # sacrebleu expects predictions as a flat list of sentences ...
    hypotheses += [text.strip() for text in hypothesis_texts]
    # ... and references as a list of lists (one list of references per sentence).
    references += [[text.strip()] for text in reference_texts]

  scores = metric.compute(predictions=hypotheses, references=references)
  return {'bleu': scores['score']}

In [None]:
# Baseline: BLEU of the pretrained checkpoint on the test split, before fine-tuning.
print(compute_metrics())

{'bleu': 21.958967427031844}


### **5. Fine-tune the Model**

In [None]:
num_epochs = 5
# Total optimizer steps: one per training batch, repeated for every epoch.
num_train_steps = num_epochs * len(tf_train_dataset)

# Optimizer with weight decay and a learning-rate schedule that starts at
# 5e-5 with no warmup steps.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed to compile().
# NOTE(review): presumably the model falls back to its own internal loss -- confirm.
model.compile(
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Switch the global Keras dtype policy to mixed_float16 before training.
# NOTE(review): `model` was created and compiled in earlier cells, and Keras
# documents that the global policy applies to layers created after it is set --
# confirm that mixed precision is actually in effect for this run.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

history = model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### **6. Compute Metrics after Fine-tuning the Model**

In [None]:
# BLEU on the same test split after fine-tuning, for comparison with the baseline.
print(compute_metrics())

{'bleu': 32.86650289193321}


### **7. Predict using the Fine-tuned Model**

In [None]:
# Wrap the fine-tuned model and its tokenizer in a translation pipeline.
translator = pipeline(
    'translation',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Translate one sample English sentence with the fine-tuned model.
translator('Default to expanded threads')

[{'translation_text': 'Par défaut pour les fils étendus'}]