### Language translation using Transformers Pipeline



In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5Tokenizer, T5ForConditionalGeneration, MarianTokenizer, MarianMTModel
import pandas as pd

In [2]:
# Read the preprocessed reviews CSV (comma-separated, first row is the header,
# no column is used as the index) into the working DataFrame.
filename = './data/preprocessed/dataset_shape(108,5).csv'
og_df = pd.read_csv(filename, sep=',', header=0, index_col=None)

In [3]:
# How many reviews were detected for each language code
og_df.loc[:, 'language'].value_counts()

language
en    54
de    20
it    19
ja     6
es     5
fr     2
tr     1
pt     1
Name: count, dtype: int64

##### Translate German reviews (loading the model directly)

Step by step

In [4]:
# Keep only the text of the reviews whose detected language is German ('de')
is_german = og_df['language'].eq('de')
de_review = og_df.loc[is_german, 'review_text']
de_review

10    Der Schuh 👟gefällt mir sehr gut. Der ist super...
34    An sich wunderschöner Schuh, auspacken und dir...
35    Auf einem der Fotos steht auf der Sohle die gr...
48    Die Oberfläche ist sehr empfindlich und sieht ...
51    Eigentlich ein ganz toller und bequemer Schuh ...
52                          Komfort und perfekt schucse
53    Ein wunderschöner Lederschuh, der allerdings d...
54    Unser Tochter (20 M) zieht diese Schuhe sehr g...
55    Beste Schuhe von Superfit generell. Wir sind i...
56    Super, hochwertiger Kinderschuh. Meine Tochter...
67    Mein 4- jähriger Sohn liebt diese Stiefel und ...
68    Beck Gummistiefel begleiten uns nun schon eini...
69    Habe die Gummistiefel für meine beiden Kinder ...
70    Ich habe diese Gummistiefel nun das zweite Mal...
71    Die Gummistiefel sind sehr schön verarbeitet u...
72    Mein Sohn ist sehr zufrieden. Die Stiefel sind...
73    Sohnemann brauchte neue Gummistiefel und diese...
75    Die Stiefel sehen schön aus und die Blinkl

In [None]:
# Load the German -> English OPUS-MT checkpoint directly (no pipeline wrapper).
# NOTE: `tokenizer` and `model` are rebound by the later Spanish and Romance
# cells, so re-run this cell before re-running the German cells that follow.

src, trg = "de", "en"  # source and target language codes

model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
# Use the first German review as the single-sentence example and confirm
# that it is a plain Python string.
test_text = de_review.iat[0]
message = f"text is: {test_text} \nand type is: {type(test_text)}"
print(message)

text is: Der Schuh 👟gefällt mir sehr gut. Der ist super bequem und leicht. Binsehr zu frieden. Danke 
and type is: <class 'str'>


In [15]:
# How the tokenizer (an AutoTokenizer loaded from the pretrained checkpoint) is used:
#   1. it is called with one sentence (or a list of sentences);
#   2. it returns a dict-like encoding; the one printed below holds the token
#      ids ('input_ids') and the matching 'attention_mask'.

encoded_input = tokenizer(test_text)
print(encoded_input)

# Decoding the token ids gives the text back. The round trip is not exact:
# in the output below the emoji comes back as <unk> and an end-of-sequence
# marker (</s>) is appended.
token_ids = encoded_input['input_ids']
print(tokenizer.decode(token_ids))

{'input_ids': [119, 14784, 17, 1, 377, 18307, 278, 261, 519, 3, 119, 29, 3780, 9694, 10, 1847, 3, 8254, 6, 7936, 24, 17, 7415, 15, 3, 6174, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Der Schuh <unk> gefällt mir sehr gut. Der ist super bequem und leicht. Binsehr zu frieden. Danke</s>


In [16]:
# To feed the model, the text is wrapped in a list (a batch of one sentence)
# and the tokenizer is asked for PyTorch tensors instead of plain Python lists.
single_sentence_batch = [test_text]
batch = tokenizer(single_sentence_batch, return_tensors="pt")
print(batch)

{'input_ids': tensor([[  119, 14784,    17,     1,   377, 18307,   278,   261,   519,     3,
           119,    29,  3780,  9694,    10,  1847,     3,  8254,     6,  7936,
            24,    17,  7415,    15,     3,  6174,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}


In [17]:
# Translate the batch, then decode its only sequence, dropping special tokens
generated_ids = model.generate(**batch)
tokenizer.decode(generated_ids[0], skip_special_tokens=True)

'I like the shoe very well. It is super comfortable and light. Very happy. Thanks'

##### Translate Spanish reviews and check that the translations are correct

In [None]:
# Load the Spanish -> English OPUS-MT checkpoint directly.
# This rebinds `tokenizer` and `model`, replacing the ones loaded earlier.
src, trg = "es", "en"  # source and target language codes

model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [27]:
# Select the Spanish reviews to translate
es_review = og_df.loc[og_df['language']=='es', 'review_text']

# Collect the review texts into a plain list (the tokenizer is given a list of
# strings) and echo each source sentence.
test_text = es_review.tolist()
for text in test_text:
    print(f"text is: {text}")

print("--------------------------------------")
# Tokenize the whole list as one batch; padding and truncation give every
# sentence the same length so the ids can be stacked into tensors.
batch = tokenizer(test_text, padding=True, truncation=True, return_tensors="pt")

# Translate the batch, decode it ONCE (instead of re-decoding the full batch on
# every loop iteration), then print one translation per line.
generated_ids = model.generate(**batch)
translations = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
for translation in translations:
    print(translation)

text is: Muy cómodo para el Niño
text is: Muy lindos pero aprietan de la parte de Arriba
text is: Producto original, de buena calidad. Talla justa, Cumplió las expectativas.
text is: Resistentes, no entra agua los días de lluvia y con un diseño informal que tanto sirve para llevar con uniforme al colegio o para unos vaqueros en fin de semana. Perfectos
text is: Muy cómodos para trote o caminatas, los recomiendo
--------------------------------------
Very comfortable for the Child
Very nice but tight from the top part
Original product, good quality. Fair size, met expectations.
Resistant, no water enters the rainy days and with an informal design that serves so much to carry in uniform to the school or for jeans on weekend. Perfect
Very comfortable for jogging or hiking, I recommend them


##### Translate French reviews (using a pipeline as a high-level helper)

Note: only one of the two reviews detected as `fr` is actually French; the other ("No complains") is English and was misclassified.

In [4]:
# Keep only the text of the reviews whose detected language is French ('fr')
is_french = og_df['language'].eq('fr')
fr_review = og_df.loc[is_french, 'review_text']
fr_review

63                                         No complains
92    très belle  chaussure tient bien le pied avec ...
Name: review_text, dtype: object

In [5]:
# Show the only genuinely French review: it is the second row (position 1)
# of the selection; the first row is the misdetected English one.
fr_review.iloc[1]

'très belle  chaussure tient bien le pied avec un amortie parfait pour des kms de footing  a recommander'

In [None]:
# Build a ready-to-use translation pipeline: it bundles tokenization,
# generation and decoding for the French -> English OPUS-MT checkpoint.

model_checkpoint = "Helsinki-NLP/opus-mt-fr-en"
translator = pipeline(task="translation", model=model_checkpoint)

In [15]:
# Translate the genuine French review (second row of the selection)
french_text = fr_review.iloc[1]
translator(french_text)

[{'translation_text': 'very nice shoe holds well the foot with a perfect cushion for kms of footing to recommend'}]

In [36]:
# Inspect which concrete pipeline class the "translation" task produced
type(translator)

transformers.pipelines.text2text_generation.TranslationPipeline

T5-small model using a pipeline

In [13]:
# Try t5-small through the pipeline helper in both directions.
#
# French -> English: does not work; the output printed below is still French.
# NOTE(review): t5-small presumably only ships English -> German/French/Romanian
# translation presets, which would explain this - confirm on the model card.
# The pipeline gets its own name so it is not confused with the en -> fr one.
fr_en_translator = pipeline("translation_fr_to_en", model='t5-small')
print(fr_en_translator(fr_review.iloc[1], max_length=400))
# English -> French: works. The input is the English translation of the same
# review produced earlier by the OPUS-MT fr-en pipeline.
en_fr_translator = pipeline("translation_en_to_fr", model='t5-small')
print(en_fr_translator('very nice shoe holds well the foot with a perfect cushion for kms of footing to recommend'))

[{'translation_text': "Très belle chaussure tient bien le pied avec un amortie parfait pour des kms de footing a recommander a recommander en recommandant l'adoption d'une recommandation."}]
[{'translation_text': 'très jolie chaussure tient bien le pied avec un coussin parfait pour les kilomètres de pied à recommander'}]


Loading T5-small model and using T5 tokenizer 

In [None]:
# Load the pre-trained t5-small checkpoint and its tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# The French review is encoded as-is, without any task prefix.
# NOTE(review): T5 checkpoints are normally prompted with a task prefix such as
# "translate English to French: " - confirm whether one is needed here.
input_text = fr_review.iloc[1]
input_ids = t5_tokenizer.encode(input_text, return_tensors='pt')

# Generate with the default generation settings and decode the first (only)
# sequence, dropping special tokens.
# NOTE(review): the stored output stops mid-sentence, presumably because of the
# default maximum generation length - confirm.
outputs = t5_model.generate(input_ids)
first_sequence = outputs[0]
output_text = t5_tokenizer.decode(first_sequence, skip_special_tokens=True)

In [14]:
# Print output text
print(output_text)

Très belle chaussure tient bien le pied avec un amortie parfait pour des kms


(Loading T5 directly does not work either: the output is a truncated copy of the French input, not an English translation.)

#### Romance languages translation to English

Using the OPUS-MT model and the MarianMT tokenizer. The inputs are batches of texts in different languages.

In [None]:
# Load the Romance-languages -> English OPUS-MT model and its tokenizer.
# This rebinds `model`, `tokenizer` and `model_checkpoint` from earlier cells.
model_checkpoint = 'Helsinki-NLP/opus-mt-roa-en'
model = MarianMTModel.from_pretrained(model_checkpoint)
tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

# Define input texts: one French and two Spanish reviews in the same batch.
# NOTE(review): the ">>en<<" target-language prefix is presumably only needed
# for checkpoints with several target languages - confirm for this one.
input_text = [
    ">>en<< très belle  chaussure tient bien le pied avec un amortie parfait pour des kms de footing  a recommander",
    # Adjacent string literals are joined at compile time. The previous
    # backslash continuation inside the literal embedded the next line's
    # indentation (12 spaces) into the sentence.
    ">>en<< Resistentes, no entra agua los días de lluvia y con un diseño informal que tanto sirve para llevar con uniforme "
    "al colegio o para unos vaqueros en fin de semana.",
    ">>en<< Muy lindos pero aprietan de la parte de Arriba"
]

# Pad the sentences to a common length so they form one tensor batch, then translate
encoded = tokenizer(input_text, return_tensors="pt", padding=True)
translated = model.generate(**encoded)


In [None]:
# Decode every generated sequence back to text, dropping special tokens
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

#### Evaluate models using METEOR and BLEU