# Notebook to produce backtranslated versions of training texts

In [1]:
%load_ext autoreload
%autoreload 2

## Load resources

In [2]:
from data import load_data

train, val, test = load_data()

In [3]:
full = train.append(val)

In [4]:
full

Unnamed: 0,name,description
25291,fabric tote bag,tote bag in a combination of colours. braided ...
5328,knit cardigan with ruffle trims,knit cardigan with a round neck. featuring lon...
28974,mercurised glass soap dish,mercurised glass soap dish.
10697,joggers,relaxed fit trousers made of a linen blend. fe...
8656,ribbed knit cardigan,"cardigan with round neckline, long sleeves and..."
...,...,...
31096,water lily voile dress,children's dress featuring a water lily print ...
9682,striped sweatshirt,long sleeve hoodie. button fastening on the yo...
2355,ruffled t-shirt trf,round neck t-shirt with short sleeves and a ru...
31358,pine cone and sleigh bells napkin holders (pac...,"napkin holder with faux twigs, pine cone, slei..."


Compute number of characters to translate

In [5]:
sum([len(x) for x in full["description"]])

4579038

## Backtranslation tests using MarianMT

In [None]:
from transformers import MarianMTModel, MarianTokenizer
src_text = [
    '>>fra<< this is a sentence in english that we want to translate to french',
    '>>por<< This should go to portuguese',
    '>>esp<< And this to Spanish'
]

model_name = 'Helsinki-NLP/opus-mt-en-roa'
tokenizer = MarianTokenizer.from_pretrained(model_name)
print(tokenizer.supported_language_codes)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# ["c'est une phrase en anglais que nous voulons traduire en français",
# 'Isto deve ir para o português.',
# 'Y esto al español']

In [6]:
src_text = list(full["description"][:8])

In [7]:
src_text

['tote bag in a combination of colours. braided exterior in a combination of materials. shoulder straps with a decorative stud. lined interior with pocket and zip purse. magnetic clasp closure.height x length x width 26.3 x 38.5 x 14.5 cm. / 10.3 x 15.1 x 5.7″',
 'knit cardigan with a round neck. featuring long sleeves with ruffled cuffs, a button-up front and ribbed trims.',
 'mercurised glass soap dish.',
 'relaxed fit trousers made of a linen blend. featuring an elastic drawstring waistband, front pockets and a rear patch pocket.',
 'cardigan with round neckline, long sleeves and button-up fastening.',
 'relaxed fit bermuda shorts featuring an elastic drawstring waist, side pockets and rear patch pocket.',
 'v-neck top featuring wide straps, an elastic waist and ruffled hem.',
 't-shirt with a round neckline, long sleeves and a front print with velvet detail.']

In [11]:
from itertools import chain
from tqdm import tqdm
import torch
from transformers import MarianMTModel, MarianTokenizer

from model import splitevery

def backtranslate(texts, source_lang="en", target_lang="es", batch_size=8):
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Prepare tokenizer and model for forward translation
    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(torch_device)
    # Prepare tokenizer and model for backward translation
    inverse_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
    inverse_tokenizer = MarianTokenizer.from_pretrained(inverse_model_name)
    inverse_model = MarianMTModel.from_pretrained(inverse_model_name).to(torch_device)
    
    # Split input texts in sentences
    splitted = [text.split(".") for text in texts]
    
    # Backtranslate in batches
    splitted_backtranslations = []
    for batch in tqdm(splitevery(chain(*splitted), batch_size)):
        translated = model.generate(**tokenizer.prepare_seq2seq_batch(batch, return_tensors="pt").to(torch_device))
        tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        translated = inverse_model.generate(**inverse_tokenizer.prepare_seq2seq_batch(tgt_text, return_tensors="pt").to(torch_device))
        bck_text = [inverse_tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        splitted_backtranslations.extend(bck_text)
        
    # Restore original texts
    backtranslations = []
    i = 0
    for splitted_text in splitted:
        backtranslations.append(" . ".join(splitted_backtranslations[i:i+len(splitted_text)]))
        i += len(splitted_text)
        
    return backtranslations

In [14]:
src_text = list(full["description"][:8])
backtranslations = backtranslate(src_text, target_lang="es")
backtranslations

4it [00:22,  5.69s/it]


['Tote bag in a color combination . external braided in combination of materials . shoulder straps with decorative taco . interior lined with pocket and wallet with zipper . magnetic closure . height x length x width 26 . 3 x 38 . 5 x 14 . 5 cm . 1 / 10 . 3 x 15 . 1 x 5 . 7′′',
 'knitted cardigan with round neck . with long sleeves with steering cuffs, a buttoned front and ribbed ribs . - No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no.',
 'Mercurized glass soap . - No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no.',
 'relaxed fit pants made of a linen mixture . with an elastic cord waistband, front pockets and a back pocket . - No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no

In [15]:
list(zip(src_text, backtranslations))

[('tote bag in a combination of colours. braided exterior in a combination of materials. shoulder straps with a decorative stud. lined interior with pocket and zip purse. magnetic clasp closure.height x length x width 26.3 x 38.5 x 14.5 cm. / 10.3 x 15.1 x 5.7″',
  'Tote bag in a color combination . external braided in combination of materials . shoulder straps with decorative taco . interior lined with pocket and wallet with zipper . magnetic closure . height x length x width 26 . 3 x 38 . 5 x 14 . 5 cm . 1 / 10 . 3 x 15 . 1 x 5 . 7′′'),
 ('knit cardigan with a round neck. featuring long sleeves with ruffled cuffs, a button-up front and ribbed trims.',
  'knitted cardigan with round neck . with long sleeves with steering cuffs, a buttoned front and ribbed ribs . - No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no.'),
 ('mercurised glass soap dish.',
  'Mercurized glass soap . -

Poor quality translations, specially when compared against deepL

## Try with T5

In [None]:
from transformers import T5ForConditionalGeneration

t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

In [None]:
from transformers import AutoTokenizer

t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
src_text = list(full["description"][:8])

In [None]:
generated = t5.generate(**t5_tokenizer.prepare_seq2seq_batch(["translate English to German: " + x for x in src_text], return_tensors="pt"))

In [None]:
generated

In [None]:
tgt_text = [t5_tokenizer.decode(t, skip_special_tokens=True) for t in generated]

In [None]:
tgt_text

In [None]:
generated = t5.generate(**t5_tokenizer.prepare_seq2seq_batch(["translate German to English: " + x for x in src_text], return_tensors="pt"))
backtranslations = [t5_tokenizer.decode(t, skip_special_tokens=True) for t in generated]

In [None]:
backtranslations

Doesn't work in reversal...

## Paraphrasing with Pegasus fine-tuned model

In [1]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [7]:
def get_response(input_text, num_return_sequences):
    batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest', return_tensors="pt").to(torch_device)
    translated = model.generate(**batch, num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [8]:
src_text = full["description"][0]
paraphrased = []
for txt in src_text.split("."):
    paraphrased.append(get_response(txt, 8))

In [9]:
print(src_text)

jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.


In [10]:
print(paraphrased)

[['A jacket made of a technical fabric.', 'The jacket is made of a technical fabric.', 'A jacket made of technical fabric.', 'The jacket was made of a technical fabric.', 'A jacket made from a technical fabric.', 'There is a jacket made of a technical fabric.', 'A jacket is made of a technical fabric.', 'A jacket made of fabric.'], ['There is a high collar and long sleeves.', 'The collar and sleeves are long.', 'There are long sleeves and a high collar.', 'The collar is high and the sleeves are long.', 'A high collar and long sleeves.', 'Long sleeves and a high collar.', 'high collar and long sleeves.', 'High collar and long sleeves.'], ['There are front pockets.', 'There are pockets in front of them.', 'There are pockets in the front.', 'The front pockets have something in them.', 'Front pockets.', 'The front pockets have something on them.', 'The front pockets are large.', 'The front pockets have something inside them.'], ['ribbed trims.', 'ribbed trims', 'ribbed trims are ribbed.', 

Kind of meh

## Backtranslation using BackTranslate (Google Translate)

In [6]:
from BackTranslation import BackTranslation
trans = BackTranslation()
result = trans.translate('hello', src='en', tmp = 'es')
print(result.result_text)
# 'Hello there'

[nltk_data] Downloading package punkt to
[nltk_data]     /home/alvaro.barbero/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Hello


In [11]:
%%time
src_text = full["description"][0]
backtranslation = trans.translate(src_text, src='en', tmp = 'es')

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 189 ms


In [15]:
backtranslation.tran_text

'Chaqueta hecha de un tejido técnico con textura.Cuello alto y mangas largas.Bolsillos delanteros.Recortes acanalados.frente al frente.'

In [10]:
print(src_text)
print(backtranslation.result_text)

jacket made of a technical fabric with texture. high collar and long sleeves. front pockets. ribbed trims. zip-up front.
Jacket made of a technical fabric with tall texture and long sleeves. Front Checks.Recrupletes. Frente.


In [20]:
from time import sleep

def backtranslate(texts, backtranslator, source_lang="en", target_lang="es"):
    backtranslations = []
    for text in texts:
        try:
            backtranslation = backtranslator.translate(text, src=source_lang, tmp=target_lang)
            backtranslations.append(backtranslation.result_text)
        except:
            sleep(1)
    return backtranslations

In [21]:
%%time
backtranslate(full["description"][0:10], trans)

CPU times: user 144 ms, sys: 4 ms, total: 148 ms
Wall time: 4.55 s


['Tote bag in a combination of colors. Externsion braided in a combination of materials. Shoulder corridor with a decorative stallion.Interior aligned with pocket and zip bag. Magnetic close.Height x Length x Width 26.3 x 38.5 x 14.5 cm10.3 x 15.1 x 5.7 "',
 'Cardigan knitted with a round neck. With long sleeve with fists with flyers, a front and corrugated ornaments with buttons.',
 'Mercurred glass soap dish.',
 'Relaxed adjustment pants made from a linen mix. With an elastic lace waist, front pockets and a rear pocket.',
 'Cardigan with round neckline, long sleeves and buttons close.',
 'Bermuda shorts in a relaxed shape with an elastic lace waist, side pockets and rear pocket.',
 'V-neck with wide straps, an elastic waist and a hem with ruffles.',
 'T-shirt with a round neckline, long sleeves and a front print with velvet detail.',
 'Round neck cap with pleated balloon sleeves falling under the elbow and elastic ornaments.',
 'Rubber water resistant boots.Available in black, in kha

In [22]:
full["description"][0:10]

25291    tote bag in a combination of colours. braided ...
5328     knit cardigan with a round neck. featuring lon...
28974                          mercurised glass soap dish.
10697    relaxed fit trousers made of a linen blend. fe...
8656     cardigan with round neckline, long sleeves and...
16606    relaxed fit bermuda shorts featuring an elasti...
19297    v-neck top featuring wide straps, an elastic w...
11892    t-shirt with a round neckline, long sleeves an...
1856     round neck top featuring pleated balloon sleev...
23616    rubberised water resistant boots. available in...
Name: description, dtype: object