# Back translation

# Load dataset

In [1]:
from datasets import load_dataset

# Load the 'rotten_tomatoes' dataset and bind each of its three splits
# to its own module-level name.
dataset = load_dataset('rotten_tomatoes')
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Peek at the first 20 training rows to see what the raw review text looks like
train_dataset[:20]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
  'effective but too-tepid biopic',
  'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .',
  "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",
  'the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .',
  'offers that rare combination of entertainment and education .',
  'perhaps no picture ever made has more literally showed that the road to hell is paved with good intentions .',


# Test translation

In [3]:
import translators as ts
# _ = ts.preaccelerate_and_speedtest()

In [4]:
# List the translation backends the library exposes
print(ts.translators_pool)
# Smoke test: send one word from English ('en') to Chinese ('zh') through the 'bing' backend
print(ts.translate_text("HELLO", from_language='en', to_language='zh', translator='bing'))

['alibaba', 'apertium', 'argos', 'baidu', 'bing', 'caiyun', 'cloudTranslation', 'deepl', 'elia', 'google', 'hujiang', 'iciba', 'iflytek', 'iflyrec', 'itranslate', 'judic', 'languageWire', 'lingvanex', 'niutrans', 'mglip', 'mirai', 'modernMt', 'myMemory', 'papago', 'qqFanyi', 'qqTranSmart', 'reverso', 'sogou', 'sysTran', 'tilde', 'translateCom', 'translateMe', 'utibet', 'volcEngine', 'yandex', 'yeekit', 'youdao']
你好


# Loop through the dataset and perform back translation

- Choose a language that is close to English (e.g. French) so that less information is lost in the round trip

Mandarin loses information (in the example below, "the rock" comes back as "This stone"):
```
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
这块石头注定要成为 21 世纪的新“柯南”，他将比阿诺德·施瓦辛格、让-克劳德·范·达姆或史蒂文·西格尔更引起轰动。
This stone is destined to be the new "Conan" of the 21st century, and he will be more sensational than Arnold· Schwarzenegger, Jean-Claude· Van · Dam or Steven · Siegel.
```

In [5]:
def clean_text(sentence):
    """Normalise a sentence so that two texts can be compared word by word.

    The text is lower-cased, every punctuation character (underscores
    included) is replaced by a space, and runs of whitespace are collapsed
    into single spaces with nothing leading or trailing.

    Args:
        sentence: The raw text to normalise.

    Returns:
        The normalised string; empty when the input holds nothing but
        punctuation and/or whitespace.
    """
    # Imported locally because the notebook only imports `re` in a later
    # cell; this keeps the function usable as soon as its own cell has run.
    import re

    # Case folding
    sentence = sentence.lower()
    # Remove all punctuation. `\w` also matches "_", so the underscore has to
    # be listed explicitly or it would survive the substitution.
    sentence = re.sub(r'[^\w\s]|_', ' ', sentence)
    # Remove extra whitespace (split/join also trims both ends)
    return ' '.join(sentence.split())

def word_level_edit_distance(sentence1, sentence2):
    """Return the Levenshtein distance between two sentences, counted in words.

    Both sentences are split on whitespace; the result is the minimum number
    of whole-word insertions, deletions and substitutions needed to turn the
    first word sequence into the second.

    Args:
        sentence1: The source sentence.
        sentence2: The target sentence.

    Returns:
        The word-level edit distance as a non-negative integer.
    """
    source_tokens = sentence1.split()
    target_tokens = sentence2.split()

    # Row 0 of the DP table: turning an empty prefix into the first j target
    # words costs j insertions.
    previous_row = list(range(len(target_tokens) + 1))

    for row_number, source_token in enumerate(source_tokens, start=1):
        # Column 0: turning the first `row_number` source words into nothing
        # costs that many deletions.
        current_row = [row_number]
        for column, target_token in enumerate(target_tokens, start=1):
            if source_token == target_token:
                # Matching words carry the diagonal cost over unchanged.
                cost = previous_row[column - 1]
            else:
                cost = 1 + min(
                    previous_row[column],       # deletion
                    current_row[column - 1],    # insertion
                    previous_row[column - 1],   # substitution
                )
            current_row.append(cost)
        previous_row = current_row

    return previous_row[-1]

In [6]:
# Sanity check: 'world' -> 'worlds' is one substitution and 'am' is one insertion, so this should display 2
word_level_edit_distance('hello world i a', 'hello worlds i am a')

2

In [7]:
import re
import pandas as pd
from tqdm import tqdm

word_level_edit_distance_threshold = 5 # At least need 5 words to be different

# Accepted samples are collected in a plain list and turned into a DataFrame
# once, after the loop. Enlarging a DataFrame through `.loc[len(df)]` on every
# iteration is far more expensive than appending to a list. If the loop is
# interrupted, the rows gathered so far are still available in
# `augmented_records`.
augmented_records = []
for row in tqdm(train_dataset, total=len(train_dataset), desc="Augmenting Dataset"):
    sentence = row['text']
    label = row['label']

    # Translate to another language
    translated_sentence = ts.translate_text(sentence, from_language='en', to_language='fr', translator='bing')

    # Translate back to english
    translated_back_sentence = ts.translate_text(translated_sentence, from_language='fr', to_language='en', translator='bing')

    # Compare on cleaned text so differences in case or punctuation alone do
    # not count. clean_text already returns text without leading/trailing
    # whitespace, so no extra strip() is needed.
    cleaned_original = clean_text(sentence)
    cleaned_back_translated = clean_text(translated_back_sentence)

    # Identical after cleaning: the round trip added nothing new, discard.
    # (Cheap shortcut that avoids running the edit-distance DP.)
    if cleaned_original == cleaned_back_translated:
        continue

    # Discard near-duplicates too, so samples with only minor changes do not
    # inflate the dataset.
    if word_level_edit_distance(cleaned_original, cleaned_back_translated) < word_level_edit_distance_threshold:
        continue

    # Keep the back-translated text as returned by the translator (not the
    # cleaned version), paired with the original label.
    augmented_records.append({
        'text': translated_back_sentence,
        'label': label
    })

# `columns=` keeps the expected schema even when no sample was accepted.
augmented_dataset = pd.DataFrame(augmented_records, columns=['text', 'label'])

Augmenting Dataset: 100%|██████████| 8530/8530 [1:06:50<00:00,  2.13it/s]


# Save augmented dataset

In [8]:
# Number of back-translated samples that passed both filters in the augmentation loop
len(augmented_dataset)

4405

In [12]:
# Materialise the original training split as a DataFrame
train_df = pd.DataFrame(dataset['train'])

# Add to original dataset: original rows first, then the augmented rows;
# ignore_index=True renumbers the combined rows from 0
original_augmented_dataset = pd.concat([train_df, augmented_dataset], ignore_index=True)

# Save in csv (index=False keeps the row index out of the file)
original_augmented_dataset.to_csv('augmented_combined_train_dataset.csv', index=False)

In [14]:
# Total row count after combining: original training rows plus augmented rows
len(original_augmented_dataset)

12935