In [None]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
def load_model_pair(src_lang, tgt_lang):
    """Load the Helsinki-NLP OPUS-MT tokenizer and model for one direction.

    Args:
        src_lang: Source language code used in the checkpoint name (e.g. "en").
        tgt_lang: Target language code used in the checkpoint name (e.g. "fr").

    Returns:
        A ``(tokenizer, model)`` tuple. The model has been moved to the
        notebook-level ``device``.
    """
    checkpoint = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    # Tokenizer first, then the weights (same order as before).
    mt_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    mt_model = MarianMTModel.from_pretrained(checkpoint)
    mt_model = mt_model.to(device)
    return mt_tokenizer, mt_model


In [None]:
def translate_batch(texts, tokenizer, model, batch_size=16):
    """Translate a sequence of sentences with ``model``, ``batch_size`` at a time.

    Args:
        texts: Sliceable sequence of strings to translate (e.g. a list).
        tokenizer: Tokenizer matching ``model``; called on each batch and used
            to decode the generated ids.
        model: Seq2seq model exposing ``generate`` and a ``device`` attribute
            (as Hugging Face ``PreTrainedModel`` subclasses do).
        batch_size: Number of sentences per forward pass. Must be >= 1.

    Returns:
        A list of translated strings, in the same order as ``texts``.
        An empty ``texts`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make the range below empty and silently return [],
    # so reject bad batch sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Send the inputs to wherever the model's weights actually live instead of
    # relying on a notebook-level `device` global that may not match the model.
    target_device = model.device

    results = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        # padding=True pads to the longest sentence in the batch;
        # truncation=True cuts over-long inputs instead of failing on them.
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(target_device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model.generate(**inputs)
        translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        results.extend(translated)
    return results


In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# --- Back-translation augmentation: English -> French -> English ---
# Read the labelled training set; only the "sentence" and "label" columns are used.
df = pd.read_csv("training.csv")
original_sentences = df["sentence"].tolist()
labels = df["label"].tolist()

# One Marian checkpoint per translation direction.
en2fr_tok, en2fr_model = load_model_pair("en", "fr")
fr2en_tok, fr2en_model = load_model_pair("fr", "en")

print("Translating English to French")
sentences_fr = translate_batch(original_sentences, en2fr_tok, en2fr_model)

print("Translating French to English")
back_translated = translate_batch(sentences_fr, fr2en_tok, fr2en_model)

# Stack the originals on top of their paraphrases; every paraphrase inherits
# the label of the sentence it was generated from.
df_aug = pd.DataFrame(
    {
        "sentence": [*original_sentences, *back_translated],
        "label": [*labels, *labels],
    }
)

# Keep the first occurrence of each distinct sentence (originals come first,
# so they win over identical paraphrases) and renumber the rows.
df_aug = df_aug.drop_duplicates(subset=["sentence"])
df_aug = df_aug.reset_index(drop=True)

df_aug.to_csv("training_augmented.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translating English to French


  0%|          | 0/6382 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

100%|██████████| 6382/6382 [1:21:58<00:00,  1.30it/s]


Translating French to English


100%|██████████| 6382/6382 [1:46:30<00:00,  1.00s/it]


In [None]:
# add id to each entry of df_aug
# Give every row of the augmented frame a sequential integer id (0 .. n-1).
n_rows = len(df_aug)
df_aug["id"] = range(0, n_rows)


Unnamed: 0,sentence,label,id
0,Those 2 drinks are part of the HK culture and ...,negative,0
1,I was told by the repair company that was doin...,negative,1
2,It is there to give them a good time .,neutral,2
3,Like leafing through an album of photos accomp...,negative,3
4,Johnny was a talker and liked to have fun.,positive,4


In [None]:
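# One-time environment setup, kept commented out so a normal run does not
# reinstall the package or re-download the corpora. Uncomment on a fresh
# runtime if `contractions` or the NLTK data below are missing.
#!pip install contractions
#import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')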


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# --- Back-translation augmentation that remembers where each row came from ---
df = pd.read_csv("training.csv")
original_sentences = df["sentence"].tolist()
labels = df["label"].tolist()
# The row index of the source file doubles as a stable id for each sentence.
original_ids = df.index.tolist()

en2fr_tok, en2fr_model = load_model_pair("en", "fr")
fr2en_tok, fr2en_model = load_model_pair("fr", "en")

print("Translating English to French...")
sentences_fr = translate_batch(original_sentences, en2fr_tok, en2fr_model)

print("Translating French to English...")
back_translated = translate_batch(sentences_fr, fr2en_tok, fr2en_model)

# Untouched training rows.
df_original = pd.DataFrame(
    {
        "original_id": original_ids,
        "sentence": original_sentences,
        "label": labels,
        "is_augmented": False,
    }
)

# Paraphrased rows: same ids and labels as their sources, with only the text
# and the flag swapped. Keeping original_id lets a later train/test split keep
# a sentence and its paraphrase on the same side, avoiding data leakage.
df_augmented = df_original.assign(sentence=back_translated, is_augmented=True)

# Originals first, then paraphrases; drop rows whose text already appeared
# (the first occurrence is kept) and renumber.
df_aug = (
    pd.concat([df_original, df_augmented], ignore_index=True)
    .drop_duplicates(subset=["sentence"])
    .reset_index(drop=True)
)

df_aug.to_csv("training_augmented_with_reference.csv", index=False)




Translating English to French...


100%|██████████| 6382/6382 [1:23:35<00:00,  1.27it/s]


Translating French to English...


100%|██████████| 6382/6382 [1:48:24<00:00,  1.02s/it]
