##### Translation using facebook NLLB 200 distilled model 

Using model loading for easier definition of source and output languages.

Using as texts to translate the review titles from preprocessed dataset.

In [None]:
# imports
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import time

In [2]:
# Read the token from the JSON file
with open("..\.huggingface\config.json", "r") as file:
    token_data = json.load(file)

# Extract the token value
huggingface_token = token_data["huggingface_token"]


In [3]:
# Load dataset and convert to dataframe
filename = './data/preprocessed/dataset_shape(108,5).csv'
df = pd.read_csv(filename, delimiter=',', index_col=None, header=0)

# Separe english and non-english texts
df_non_en = df[df['language'] != 'en']

In [4]:
# Define the dictionary mapping languages to language codes
language_code_dict = {
    'en': 'eng_Latn',
    'it': 'ita_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'ja': 'jpn_Japn',
    'tr': 'tur_Latn',
    'pt': 'por_Latn'
}

In [5]:
def read_texts_and_language_codes(df):
    # Initialize lists to store texts and language codes
    texts = []
    language_codes = []
    
    # Iterate over rows of the dataframe
    for index, row in df.iterrows():
        # Append text to the list
        texts.append(row['review_title'])
        
        # Map language code to language code from dictionary and append to the list
        language_codes.append(language_code_dict[row['language']])
    
    return texts, language_codes


In [6]:

texts, language_codes = read_texts_and_language_codes(df_non_en)

In [7]:
# Start calculating running time
start_time = time.time()

# Definition of general variables for all
# Define the model checkpoint
model_checkpoint = "facebook/nllb-200-distilled-600M"

# Initialize model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, token=huggingface_token)

# Initialize lists to store translated texts
translated_texts = []

# Iterate through the texts list
i = 0 # Temporal iterator for accessing languages_codes list
for text in texts:
    # Define source language from language codes list
    src_lang = str(language_codes[i])

    # Initialize tokenizer for input language
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, token=huggingface_token, src_lang=src_lang)

    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Generate translation
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
    )
    
    # Decode translated tokens
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    
    # Append translated text to the list
    translated_texts.append(translated_text)

# Print translated texts
for original, translated in zip(texts, translated_texts):
    print("Original Text:", original)
    print("Translated Text:", translated)
    print()

# End timing
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print("Total time taken:", elapsed_time, "seconds")


the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


Original Text: PERFETTE!!
Translated Text: It's perfect!

Original Text: delusione
Translated Text: Disappointment

Original Text: Molto belle
Translated Text: Very beautiful

Original Text: Molto carine e comode
Translated Text: Very nice and comfortable

Original Text: Bellissime....peccato per il numero
Translated Text: That's very nice. Sorry about the number.

Original Text: Scarpe ben realizzate, venditore ottimo
Translated Text: Well made shoes, great seller

Original Text: Sehr bequemer Schuh
Translated Text: The Commission shall adopt delegated acts in accordance with Article 21 of this Regulation.

Original Text: Perfetto
Translated Text: It's perfect.

Original Text: Grandi
Translated Text: Large

Original Text: Stupende
Translated Text: It's amazing.

Original Text: かわいいけど、色々残念
Translated Text: ♪ I'm not sure ♪

Original Text: 軽くて履き心地がすごくいいです！
Translated Text: It's a very good thing!

Original Text: Schmerzen am Kind...
Translated Text: I'm not a good person.

Original Text

In [8]:
(translated_texts[:len(translated_texts)])

["It's perfect!",
 'Disappointment',
 'Very beautiful',
 'Very nice and comfortable',
 "That's very nice. Sorry about the number.",
 'Well made shoes, great seller',
 'The Commission shall adopt delegated acts in accordance with Article 21 of this Regulation.',
 "It's perfect.",
 'Large',
 "It's amazing.",
 "♪ I'm not sure ♪",
 "It's a very good thing!",
 "I'm not a good person.",
 'The Commission has already adopted a proposal for a directive on the protection of the environment.',
 'Very sensitive',
 "I'm not sure I'm gonna be able to do that.",
 "What's the matter?",
 "I'm not going to say that.",
 'Perfekt shchuche',
 'The Commission shall adopt a decision on the',
 "I'm not going to lie.",
 'Top of the line',
 'The Commission shall adopt delegated acts in accordance with Article 21 of this Regulation.',
 'Good design.',
 'great, my son loves it',
 'Opening of the upper part',
 'Product of the original',
 'Oh, my God, Winterstiefel!',
 "The Commission has already adopted a proposal