In [8]:
!pip install transformers tensorflow datasets rouge-score nltk tf_keras


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
import tensorflow as tf

# Load the tokenizer and model from one checkpoint id so the vocabulary and
# the weights can never come from different checkpoints.
MODEL_CHECKPOINT = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [10]:
from datasets import load_dataset

# Fetch the German-English configuration of Helsinki-NLP/opus-100 and show
# two training rows (index 0 and index 30000) as a sanity check.
dataset = load_dataset('Helsinki-NLP/opus-100', 'de-en')
print(dataset['train'][0], dataset['train'][30000], sep="\n")


{'translation': {'de': 'Deine Habgier wird noch dein Tod sein.', 'en': "It's greed that it's gonna be the death of you, 'cause you..."}}
{'translation': {'de': 'Ihre Nachricht: Ich empfehle Erich von Däniken unter http://www.laendleticket.com/laendleticket/de/tickets/erich-von-daniken-feldkirchen-stadtsaal-411302/event.html?tellfriend&first', 'en': 'Your message: Check Erich von Däniken on http://www.laendleticket.com/laendleticket/en/tickets/erich-von-daniken-feldkirchen-stadtsaal-411302/event.html?videos&state'}}


In [12]:
# Preprocess the dataset for input into the model
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of translation pairs for English-to-German training.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            ``examples['translation']`` is a list of dicts holding an ``'en'``
            and a ``'de'`` string.
        max_length: Length every source and target sequence is truncated or
            padded to.

    Returns:
        The tokenizer output for the prompted English sentences, extended with
        two entries:

        * ``labels`` - token ids of the German targets (padded with the
          tokenizer's pad id; padding is NOT masked here).
        * ``decoder_input_ids`` - the labels shifted one position to the
          right behind the decoder start token.
    """
    pairs = examples['translation']
    inputs = [f'Translate from English to German: {pair["en"]}' for pair in pairs]
    targets = [pair['de'] for pair in pairs]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length", return_tensors="tf")

    # Tokenize targets (once; the decoder inputs are derived from them below)
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length", return_tensors="tf")
    label_ids = labels["input_ids"]
    model_inputs["labels"] = label_ids

    # Teacher forcing: at step t the decoder must see the target tokens < t and
    # predict token t. Feeding the unshifted targets would hand the decoder the
    # very token it is asked to predict. Drop the last position and prepend the
    # start token instead (T5 checkpoints use the pad token as decoder start).
    model_inputs["decoder_input_ids"] = tf.pad(
        label_ids[:, :-1],
        [[0, 0], [1, 0]],
        constant_values=tokenizer.pad_token_id,
    )

    return model_inputs


# Tokenize a fixed-size subset of each split (first 10000 train rows, first
# 1000 test rows).
train_split = dataset['train'].select(range(10000)).map(preprocess_data, batched=True)
test_split = dataset['test'].select(range(1000)).map(preprocess_data, batched=True)


def _as_tf_dataset(split, shuffle):
    """Wrap a tokenized split as a batched tf.data.Dataset of (features, labels).

    Args:
        split: A tokenized dataset that has the columns produced by
            ``preprocess_data``.
        shuffle: Whether the rows are shuffled.

    Returns:
        The result of ``split.to_tf_dataset`` with batches of 64.
    """
    return split.to_tf_dataset(
        columns=['input_ids', 'attention_mask', 'decoder_input_ids'],
        label_cols=['labels'],
        shuffle=shuffle,
        batch_size=64,
    )


train_dataset = _as_tf_dataset(train_split, shuffle=True)
# Evaluation data keeps its original order so that the batch scored later is
# the same one on every run.
test_dataset = _as_tf_dataset(test_split, shuffle=False)

print("done")

done


In [13]:
# Mark the "shared", "encoder" and "decoder" layers as non-trainable so the
# subsequent fit() call does not update their weights.
for frozen_layer_name in ("shared", "encoder", "decoder"):
    model.get_layer(frozen_layer_name).trainable = False

In [None]:
import keras
# Compile the model
print("Optimizer")
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

print("Loss")
# Targets are padded to a fixed length with the pad token. Without
# `ignore_class` every pad position would count as a prediction target and
# dominate the loss for short sentences. (`ignore_class` needs Keras >= 2.10.)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    ignore_class=tokenizer.pad_token_id,
)

print("Compiling")
model.compile(optimizer=optimizer, loss=loss_fn)

# Train the model
print("Training")
model.fit(train_dataset, epochs=1)

print("Complete")

Optimizer
Loss
Compiling
Training

In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load it from 'punkt'; recent releases load 'punkt_tab' instead, so
# fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

def translate(inputs):
    """Translate the first example of a batch with beam search.

    Args:
        inputs: A ``(features, labels)`` batch; ``inputs[0]`` is a mapping
            with an ``"input_ids"`` entry and, optionally, an
            ``"attention_mask"`` entry.

    Returns:
        The decoded translation (special tokens removed) of the first example.
    """
    features = inputs[0]

    # Only the first example is decoded below, so only that example is sent
    # through generation instead of beam-searching the whole batch.
    generate_kwargs = {}
    if "attention_mask" in features:
        # Inputs are padded to a fixed length; pass the mask explicitly so the
        # padding positions are not attended to.
        generate_kwargs["attention_mask"] = features["attention_mask"][:1]

    outputs = model.generate(
        features["input_ids"][:1],
        max_length=128,
        num_beams=4,
        early_stopping=True,
        **generate_kwargs,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Function to calculate ROUGE and BLEU scores
def calculate_scores(reference, hypothesis):
    """Score a single hypothesis against a single reference.

    Args:
        reference: The reference text.
        hypothesis: The text to be scored.

    Returns:
        A ``(rouge_scores, bleu_score)`` tuple: the result of
        ``RougeScorer.score`` for rouge1/rouge2/rougeL, and the sentence-level
        BLEU value computed with smoothing method 4.

    NOTE(review): ``use_stemmer=True`` and ``nltk.word_tokenize`` are used with
    their defaults, which look English-oriented, while this notebook scores
    German text - confirm that is intended.
    """
    # ROUGE works on the raw strings.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_result = scorer.score(reference, hypothesis)

    # BLEU works on token lists; the reference goes in as a one-element list
    # of references.
    bleu_result = sentence_bleu(
        [nltk.word_tokenize(reference)],
        nltk.word_tokenize(hypothesis),
        smoothing_function=SmoothingFunction().method4,
    )

    return rouge_result, bleu_result

# Take one batch from the test set, translate its first example and score
# that translation against the decoded reference labels of the same example.
batch = next(iter(test_dataset))
reference_text = tokenizer.decode(batch[1][0], skip_special_tokens=True)
translated_text = translate(batch)
rouge_scores, bleu_score = calculate_scores(reference_text, translated_text)

# One report line per item, followed by an empty line.
print(
    f"Reference: {reference_text}",
    f"Translation: {translated_text}",
    f"ROUGE Scores: {rouge_scores}",
    f"BLEU Score: {bleu_score}",
    sep="\n",
)
print()