In [1]:
import pandas as pd
import numpy as np
from transformers import TFAutoModelForSeq2SeqLM,DataCollatorForSeq2Seq,AutoTokenizer
from transformers import AdamWeightDecay
from torchinfo import summary
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Name of the pretrained English->French checkpoint used throughout the notebook.
checkpoint = "Helsinki-NLP/opus-mt-en-fr"

# The tokenizer and the TensorFlow seq2seq model are both resolved from that name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)




All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [3]:
# Sanity check: ask the NVIDIA driver which GPU(s) it can see and their current usage.
# NOTE(review): this reports driver/GPU state only; it does not by itself confirm
# that TensorFlow is using the GPU - verify separately if that matters.

!nvidia-smi

Tue Mar 12 11:26:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.26                 Driver Version: 546.26       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   43C    P4              14W /  40W |      0MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# NOTE: disabled legacy loader, kept for reference only. It read tab-separated
# sentence pairs from a local text file and collected the character sets of both
# sides; the notebook now gets its data from `load_dataset` instead.
# data_path = "D:\Datasets\spa.txt"


# input_texts = []
# target_texts = []
# input_characters = set()
# target_characters = set()
# with open(data_path, 'r', encoding='utf-8') as f:
#     lines = f.read().split('\n')
# for line in lines[: min(10000, len(lines) - 1)]:
#     input_text, target_text = line.split('\t')
#     # We use "tab" as the "start sequence" character
#     # for the targets, and "\n" as "end sequence" character.
#     # target_text = '\t' + target_text + '\n'
#     input_texts.append(input_text)
#     target_texts.append(target_text)
#     for char in input_text:
#         if char not in input_characters:
#             input_characters.add(char)
#     for char in target_text:
#         if char not in target_characters:
#             target_characters.add(char)

In [5]:
# NOTE: disabled legacy code, kept for reference only. It packed the
# `input_texts` / `target_texts` lists into a two-column DataFrame and wrote
# it to "eng_fre.csv".
# d = {'eng':input_texts,'fre':target_texts}

# dataset = pd.DataFrame(d)

# dataset.head()

# dataset.to_csv("eng_fre.csv")

In [6]:
# Load the 'en-fr' configuration of the OPUS-100 dataset from the Hugging Face Hub.
raw_dataset = load_dataset(path="Helsinki-NLP/opus-100", name="en-fr")

In [7]:
# Peek at the first two training records to see the raw record layout.
for row_idx in (0, 1):
    print(raw_dataset['train'][row_idx])

{'translation': {'en': 'The time now is 05:08 .', 'fr': 'The time now is 05:05 .'}}
{'translation': {'en': 'This Regulation shall enter into force on the seventh day following its publication in the Official Journal of the European Union.', 'fr': "Le présent règlement entre en vigueur le septième jour suivant celui de sa publication au Journal officiel de l'Union européenne."}}


In [8]:
# Quick check that the tokenizer encodes a plain English sentence.
tokenizer(text="testing if it is working or not")

{'input_ids': [5201, 235, 61, 32, 750, 57, 73, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Encode a French sentence as a *target* text. Passing `text_target=` selects
# the target-side tokenization and replaces the deprecated
# `tokenizer.as_target_tokenizer()` context manager.
print(tokenizer(text_target="Le présent règlement entre en vigueur le septième jour suivant celui de sa publication au Journal officiel de l'Union européenne."))

{'input_ids': [60, 662, 565, 164, 23, 1375, 19, 13112, 370, 1759, 901, 5, 146, 1642, 39, 3305, 4416, 5, 14, 6, 707, 455, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}




In [10]:
# Truncation limits (in tokens) for the source and target sides.
max_input_length = 128
max_target_length = 128

# Keys of each record's `translation` dict to read the source / target text from.
source_lang = 'en'
destination_lang = 'fr'


def preprocess_function(sentences):
    """Tokenize a batch of translation pairs into model inputs plus labels.

    Args:
        sentences: A batch mapping whose ``'translation'`` entry is a list of
            dicts, each holding text under ``source_lang`` and
            ``destination_lang``.

    Returns:
        The tokenizer output for the source texts, with an added ``'labels'``
        entry containing the ``input_ids`` of the tokenized target texts.
    """
    inputs = [pair[source_lang] for pair in sentences['translation']]
    targets = [pair[destination_lang] for pair in sentences['translation']]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-side settings; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager. The call is
    # kept separate so the target side can use its own truncation limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [11]:
# Smoke-test the preprocessing on the first two training examples.
preprocess_function(raw_dataset['train'][:2])

{'input_ids': [[35, 195, 453, 32, 6987, 37, 3140, 250, 0], [160, 788, 228, 3307, 208, 744, 30, 4, 15202, 613, 440, 96, 1642, 18, 4, 3972, 3305, 7, 4, 217, 546, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[35, 195, 81, 3022, 32, 6987, 37, 3187, 250, 0], [60, 662, 565, 164, 23, 1375, 19, 13112, 370, 1759, 901, 5, 146, 1642, 39, 3305, 4416, 5, 14, 6, 707, 455, 3, 0]]}

In [12]:
# Apply the preprocessing to the whole dataset, feeding it batches of examples.
tokenized_dataset = raw_dataset.map(function=preprocess_function, batched=True)

In [13]:
# Load the pretrained model from `checkpoint` again (same call as in the setup
# cell), rebinding `model` before the training cells below.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [14]:
# Training hyperparameters.
num_train_epochs = 1    # number of training epochs
batch_size = 16         # examples per batch for the train / validation datasets
learning_rate = 2e-5    # passed to the optimizer as `learning_rate`
weight_decay = 0.01     # passed to the optimizer as `weight_decay_rate`

In [16]:
# Collator that assembles tokenized examples into TensorFlow batches for training.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [17]:
# Same collator setup, but with `pad_to_multiple_of=128` for the generation dataset.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=128,
    return_tensors="tf",
)

In [18]:
# Training batches are shuffled.
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Evaluation data is left unshuffled: ordering is irrelevant for the validation
# loss, and `prepare_tf_dataset` defaults `drop_remainder` to the value of
# `shuffle`, so shuffling would silently drop a trailing partial batch.
validation_dataset = model.prepare_tf_dataset(
    tokenized_dataset['validation'],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

# Smaller batches (8) built with the generation collator; also unshuffled so
# that generated outputs line up with a stable example order across runs.
generation_dataset = model.prepare_tf_dataset(
    tokenized_dataset['validation'],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [19]:
# Build the optimizer from the hyperparameters defined earlier and attach it to
# the model. No explicit loss is passed to `compile()`.
optimizer = AdamWeightDecay(
    weight_decay_rate=weight_decay,
    learning_rate=learning_rate,
)
model.compile(optimizer=optimizer)

In [20]:
# Fine-tune on the training dataset, evaluating on the validation dataset.
# The epoch count comes from `num_train_epochs` (hyperparameter cell) instead
# of a duplicated literal, so changing it there takes effect here.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

In [None]:
# Persist the fine-tuned model to the local "tf_model/" directory.
model.save_pretrained(save_directory="tf_model/")

Model Testing

In [None]:
# Reload the tokenizer and the fine-tuned model for inference.
# Auto classes cannot be instantiated by calling them directly; they must be
# created through `from_pretrained`. "tf_model/" is the directory written by
# `model.save_pretrained` earlier - point this at your own model directory or
# Hub id if it differs.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

In [21]:
input_text = "Hello how are you"

# `return_tensors="tf"` makes the tokenizer return TensorFlow tensors that can be
# unpacked straight into `generate`. Note that `return_length` is a boolean flag
# (it adds a `length` entry to the output), not a length limit, so the 128-token
# cap is expressed with `max_length` + `truncation` instead.
tokenized = tokenizer([input_text], return_tensors="tf", truncation=True, max_length=128)
out = model.generate(**tokenized, max_length=128)

In [None]:
# Turn the generated token ids back into text, dropping special tokens.
# Decoding does not need the deprecated `as_target_tokenizer()` context manager.
print(tokenizer.decode(out[0], skip_special_tokens=True))