## Process

In [1]:
from IPython.display import Image 

## Libraries

In [2]:
# pandas
import pandas as pd
# pytorch
import torch
# transformers 
from transformers import MarianMTModel, MarianTokenizer, MBartForConditionalGeneration, MBart50TokenizerFast, M2M100ForConditionalGeneration, M2M100Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Source

#### hugging face

### https://huggingface.co/

#### Transformers

### https://huggingface.co/docs/transformers/index

#### pytorch

### https://pytorch.org/

In [3]:
# Select the compute device.
#
# Preference order: CUDA on an NVIDIA GPU (the best option), then the
# Apple Silicon (M series) MPS backend, and finally the plain CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)

print(device)

cuda


## Example

In [4]:
# Example DataFrame
# Alternative inline sample kept for reference. Note that its column is named
# "text", while the translation cells below read the "Texto" column.
#df = pd.DataFrame({"text": ["Hola, ¿Comó estás?", "Me gusta programar en python", "Por favor, me avisa que debo hacer.", "Estoy preocupado por esta situación"]})

# Load the texts to translate from a CSV file.
# NOTE(review): this is an absolute path on the author's machine; the notebook
# will not run elsewhere until it points to a local copy of Textos.csv.
df = pd.read_csv("C:/Users/andre/iCloudDrive/Documents/Tutoriales/1 Coding channel/6 Translate with MLL/Textos.csv")

# Preview the first five rows.
df.head(5)

Unnamed: 0,Texto
0,Los dispositivos móviles son esenciales en la ...
1,La computación en la nube facilita el acceso y...
2,La realidad virtual y aumentada ofrecen nuevas...
3,El aprendizaje automático permite a las máquin...
4,Los dispositivos móviles son esenciales en la ...


## Code to translate

### parameters function translate_open

* text: The input text to translate.
* model_name_b: The base name of the translation model to use. This variable is for loading the pre-trained model and its corresponding tokenizer.
* model_pr: Represents the module or class to load the pre-trained model.
* model_toke: Similar to model_pr, but refers to the module or class for loading the associated tokenizer. For example, it could be a reference to transformers.AutoTokenizer.
* include_src: A control flag naming the model family. When it is 'mBART' or 'M2M100', the source language is set on the tokenizer and the target language is forced during generation; with any other value (for example, for MarianMT) src and tgt are ignored.
* src: The source language code, used to specify the language from which the text will be translated.
* tgt: The target language code, used to specify the language to which the text will be translated.


### other parameters

* tokenizer: Tokenization refers to the process of breaking down text into smaller units, called tokens, which can be words, phrases, or even characters. This technique is essential for effectively analyzing and understanding text data
* return_tensors='pt' tells the tokenizer to return the tokens as PyTorch tensors
* padding=True ensures all sequences are the same length by adding padding tokens
* truncation=True is used to indicate that the input text should be truncated to fit the model's maximum input size
* max_length=200 specifies the maximum length of sequences after tokenizing
* with torch.no_grad(): This context is used to disable gradient calculation, useful when you're making predictions with the model and don't need to update its weights. This saves memory and speeds up operations by not storing operations needed for gradient calculation.
* model.generate is a function that performs the translation based on the provided inputs
* forced_bos_token_id argument specifies the mandatory start token for the generated text, using the language ID corresponding to the target language (tgt)
* forced_bos_token_id: forces the first generated token to be the target-language token. How that token ID is obtained depends on the tokenizer:
    * tokenizer.lang_code_to_id[tgt] the appropriate method for mBART.
    * tokenizer.get_lang_id(tgt), the appropriate method for M2M100 models.
* skip_special_tokens=True argument indicates that special tokens (like start, end, or padding tokens) should be skipped in the decoded output

In [5]:
# Single-entry cache: maps (model name, model class, tokenizer class, device)
# to the loaded (model, tokenizer) pair. translate_open is called once per row,
# so without it the checkpoint would be reloaded for every single text.
# Only the most recently used checkpoint is kept, to bound memory use.
_translate_open_cache = {}


def _pick_device():
    """Return the torch device to run on: CUDA if available, else Apple MPS, else CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _load_model_and_tokenizer(model_name, model_pr, model_toke, device):
    """Return the (model, tokenizer) pair for model_name, loading it only when not cached."""
    cache_key = (model_name, model_pr, model_toke, str(device))
    if cache_key not in _translate_open_cache:
        # Release the previously cached pair before loading a different one so
        # two large models are not held by the cache at the same time.
        _translate_open_cache.clear()
        model = model_pr.from_pretrained(model_name).to(device)
        tokenizer = model_toke.from_pretrained(model_name)
        _translate_open_cache[cache_key] = (model, tokenizer)
    return _translate_open_cache[cache_key]


def translate_open(text, model_name_b, model_pr, model_toke, include_src, src, tgt):
    """Translate `text` with a pre-trained sequence-to-sequence model.

    Parameters
    ----------
    text : the input text to translate.
    model_name_b : checkpoint name passed to `from_pretrained` for both the
        model and the tokenizer.
    model_pr : class used to load the model (must provide `from_pretrained`).
    model_toke : class used to load the tokenizer (must provide `from_pretrained`).
    include_src : model-family flag. 'mBART' and 'M2M100' enable source/target
        language handling; any other value disables it.
    src : source language code; only used for 'mBART' and 'M2M100'.
    tgt : target language code; only used for 'mBART' and 'M2M100'.

    Returns
    -------
    The first generated sequence, decoded with special tokens skipped.

    Notes
    -----
    The model and tokenizer of the most recently used checkpoint are cached, so
    repeated calls with the same checkpoint do not reload it.
    """
    device = _pick_device()
    model, tokenizer = _load_model_and_tokenizer(model_name_b, model_pr, model_toke, device)

    # Multilingual tokenizers need to know the source language before encoding.
    if include_src in ('mBART', 'M2M100'):
        tokenizer.src_lang = src

    # Inputs longer than 200 tokens are truncated.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=200)

    # Move the input tensors to the same device as the model.
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Multilingual models are told which language to produce by forcing the
    # first generated token; each tokenizer exposes the language ID differently.
    generate_kwargs = {}
    if include_src in ('mBART',):
        generate_kwargs['forced_bos_token_id'] = tokenizer.lang_code_to_id[tgt]
    elif include_src in ('M2M100',):
        generate_kwargs['forced_bos_token_id'] = tokenizer.get_lang_id(tgt)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        translated_ids = model.generate(**inputs, **generate_kwargs)

    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

## Model 1: mBART

### https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt

In [6]:
# Translate every row of the 'Texto' column from Spanish (es_XX) to English
# (en_XX) with mBART-50, then store the results as a new column.
list_Trans = [
    translate_open(
        text = df['Texto'][i],
        model_name_b = 'facebook/mbart-large-50-many-to-many-mmt',
        model_pr = MBartForConditionalGeneration,
        model_toke = MBart50TokenizerFast,
        include_src = 'mBART',
        src = 'es_XX',
        tgt = 'en_XX',
    )
    for i in range(len(df))
]

df["translated_text_mbart"] = list_Trans

In [7]:
# Display the image file downloadModelF.png at 871x235 pixels.
# NOTE(review): presumably a screenshot of the model download — confirm.
Image(url="downloadModelF.png", width=871, height=235)

In [8]:
# Preview the first rows, now including the translated_text_mbart column.
df.head()

Unnamed: 0,Texto,translated_text_mbart
0,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a..."
1,La computación en la nube facilita el acceso y...,Cloud computing facilitates data access and st...
2,La realidad virtual y aumentada ofrecen nuevas...,Virtual and augmented reality offers new immer...
3,El aprendizaje automático permite a las máquin...,Automated learning allows machines to improve ...
4,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a..."


## Model 2: MarianMT

### https://huggingface.co/Helsinki-NLP/opus-mt-es-en

In [9]:
# Translate every row of the 'Texto' column with the MarianMT Spanish-to-English
# checkpoint. include_src is neither 'mBART' nor 'M2M100', so translate_open
# ignores src and tgt; the values passed here are placeholders.
list_Trans = [
    translate_open(
        text = df['Texto'][i],
        model_name_b = 'Helsinki-NLP/opus-mt-es-en',
        model_pr = MarianMTModel,
        model_toke = MarianTokenizer,
        include_src = 'Maria',
        src = 'no',
        tgt = 'no',
    )
    for i in range(len(df))
]

df["translated_text_marianmt"] = list_Trans



In [10]:
# Preview the first rows, now including the translated_text_marianmt column.
df.head()

Unnamed: 0,Texto,translated_text_mbart,translated_text_marianmt
0,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a...",Mobile devices are essential in modern life. C...
1,La computación en la nube facilita el acceso y...,Cloud computing facilitates data access and st...,Cloud computing facilitates data access and st...
2,La realidad virtual y aumentada ofrecen nuevas...,Virtual and augmented reality offers new immer...,Virtual and augmented reality offer new immers...
3,El aprendizaje automático permite a las máquin...,Automated learning allows machines to improve ...,Machine learning allows machines to improve fr...
4,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a...",Mobile devices are essential in modern life. C...


## Model 3: M2M100

### https://huggingface.co/facebook/m2m100_1.2B

In [11]:
# Translate every row of the 'Texto' column from Spanish (es) to English (en)
# with M2M100 (1.2B), then store the results as a new column.
list_Trans = [
    translate_open(
        text = df['Texto'][i],
        model_name_b = 'facebook/m2m100_1.2B',
        model_pr = M2M100ForConditionalGeneration,
        model_toke = M2M100Tokenizer,
        include_src = 'M2M100',
        src = 'es',
        tgt = 'en',
    )
    for i in range(len(df))
]

df["translated_text_m2m100"] = list_Trans

In [12]:
# Preview the first rows, now including the translated_text_m2m100 column.
df.head()

Unnamed: 0,Texto,translated_text_mbart,translated_text_marianmt,translated_text_m2m100
0,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a...",Mobile devices are essential in modern life. C...,Mobile devices are essential in modern life. C...
1,La computación en la nube facilita el acceso y...,Cloud computing facilitates data access and st...,Cloud computing facilitates data access and st...,Cloud computing facilitates data access and st...
2,La realidad virtual y aumentada ofrecen nuevas...,Virtual and augmented reality offers new immer...,Virtual and augmented reality offer new immers...,Virtual and augmented reality offer immersive ...
3,El aprendizaje automático permite a las máquin...,Automated learning allows machines to improve ...,Machine learning allows machines to improve fr...,Machine learning allows machines to improve fr...
4,Los dispositivos móviles son esenciales en la ...,"Mobile devices are essential in modern life, a...",Mobile devices are essential in modern life. C...,Mobile devices are essential in modern life. c...


In [14]:
# Save the DataFrame, including the translation columns, to base_trans.xlsx in
# the current working directory. The DataFrame index is written as well, since
# no index argument is passed.
df.to_excel('base_trans.xlsx')

In [16]:
# Display the image file used_gpu.png at 711x930 pixels.
# NOTE(review): presumably a screenshot of GPU usage during translation — confirm.
Image(url="used_gpu.png", width=711, height=930)