<a href="https://colab.research.google.com/github/anvitakashikar/Multi-task-Model/blob/main/Multitask_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the necessary libraries
!pip install transformers tensorflow nltk

# Import libraries
import tensorflow as tf
from transformers import MarianMTModel, MarianTokenizer
from concurrent.futures import ThreadPoolExecutor
import random
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # Downloading WordNet's required multilingual resources

from nltk.corpus import wordnet
nltk.download('wordnet')




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Hyperparameters
# NOTE(review): BATCH_SIZE, EPOCHS and LEARNING_RATE are not referenced by any
# cell visible in this notebook; presumably reserved for a training loop — confirm.
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 5e-5
# Passed as `max_length` (with truncation enabled) to the tokenizers in back_translate.
MAX_LENGTH = 128

# Initialize translation model and tokenizer
# Already-loaded (model, tokenizer) pairs keyed by (src_lang, tgt_lang), so the
# same checkpoint is not re-instantiated on every call.
_TRANSLATION_MODEL_CACHE = {}


def load_translation_model(src_lang, tgt_lang):
    """Return the MarianMT model and tokenizer for a language pair.

    The checkpoint name is built as ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``.
    The first call for a given pair loads the model and tokenizer with
    ``from_pretrained``; later calls for the same pair return the very same
    objects from an in-memory cache instead of loading them again.

    Args:
        src_lang: Source language code used in the checkpoint name (e.g. "en").
        tgt_lang: Target language code used in the checkpoint name (e.g. "fr").

    Returns:
        A ``(model, tokenizer)`` tuple.

    Note:
        The cache is a plain dict. Two threads asking for an uncached pair at
        the same time may both load it; the result is still correct, one load
        is simply redundant.
    """
    cache_key = (src_lang, tgt_lang)
    cached = _TRANSLATION_MODEL_CACHE.get(cache_key)
    if cached is None:
        model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
        model = MarianMTModel.from_pretrained(model_name)
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        cached = (model, tokenizer)
        _TRANSLATION_MODEL_CACHE[cache_key] = cached
    return cached


In [4]:
# Back-translation function
def _translate_once(text, model, tokenizer):
    """Translate ``text`` with one model/tokenizer pair and return the decoded string."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    generated = model.generate(**inputs)
    # Only the first generated sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, src_lang="en", tgt_lang="fr", back_lang="en"):
    """Paraphrase ``text`` by translating it to a pivot language and back.

    The text is translated ``src_lang -> tgt_lang`` and the result is then
    translated ``tgt_lang -> back_lang``. Models and tokenizers are obtained
    from ``load_translation_model``; inputs are truncated to ``MAX_LENGTH``.

    Args:
        text: The text to augment.
        src_lang: Language code of ``text``.
        tgt_lang: Pivot language code.
        back_lang: Language code to translate back into.

    Returns:
        The back-translated string. An empty or whitespace-only string is
        returned unchanged, without loading any model.
    """
    # Nothing to translate: skip the model loads and generation entirely.
    if isinstance(text, str) and not text.strip():
        return text

    # Load translation models
    model_src_tgt, tokenizer_src_tgt = load_translation_model(src_lang, tgt_lang)
    model_tgt_back, tokenizer_tgt_back = load_translation_model(tgt_lang, back_lang)

    # Translate to the pivot language, then back
    tgt_text = _translate_once(text, model_src_tgt, tokenizer_src_tgt)
    back_text = _translate_once(tgt_text, model_tgt_back, tokenizer_tgt_back)

    return back_text


In [5]:
import random
from nltk.corpus import wordnet

def _pick_synonym(word):
    """Return the first WordNet lemma that differs from ``word``, or None if there is none."""
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # Replace underscores with spaces for multi-word synonyms
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != word.lower():
                return candidate
    return None


def synonym_replacement(text, n=2):
    """Replace up to ``n`` randomly chosen words of ``text`` with WordNet synonyms.

    Word order is preserved: positions to replace are drawn at random, but the
    sentence itself is never shuffled. A word only counts as replaced when
    WordNet offers a lemma that is actually different from it, so words with
    no usable synonym are skipped and another position is tried.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of words to replace. Zero or a negative value
            leaves the text unchanged.

    Returns:
        The words of ``text`` joined by single spaces, with at most ``n`` of
        them substituted.
    """
    words = text.split()
    new_words = words[:]

    # Randomise WHICH positions are tried, not the order of the sentence.
    positions = list(range(len(words)))
    random.shuffle(positions)

    replaced = 0
    for position in positions:
        if replaced >= n:
            break
        synonym = _pick_synonym(words[position])
        if synonym is not None:
            new_words[position] = synonym
            replaced += 1
    return ' '.join(new_words)



In [11]:
# Function to run augmentations in parallel
def parallel_augmentation(texts, src_lang="en", tgt_lang="fr", back_lang="en"):
    """Run back-translation and synonym replacement for every text on a thread pool.

    Args:
        texts: Texts to augment.
        src_lang: Source language forwarded to ``back_translate``.
        tgt_lang: Pivot language forwarded to ``back_translate``.
        back_lang: Final language forwarded to ``back_translate``.

    Returns:
        A list holding the back-translated version of each text (in input
        order) followed by the synonym-replaced version of each text (in
        input order).
    """
    with ThreadPoolExecutor() as pool:
        pending = []
        # Queue every back-translation job first ...
        for sample in texts:
            pending.append(pool.submit(back_translate, sample, src_lang, tgt_lang, back_lang))
        # ... then every synonym-replacement job.
        for sample in texts:
            pending.append(pool.submit(synonym_replacement, sample))
        # Collect in submission order; a failed job re-raises here.
        augmented = [job.result() for job in pending]
    return augmented

# Batch Processing Example
def batch_process_texts(texts, batch_size=8, src_lang="en", tgt_lang="fr", back_lang="en"):
    """Augment ``texts`` in consecutive batches via ``parallel_augmentation``.

    Args:
        texts: Sequence of texts to augment (must support slicing and ``len``).
        batch_size: Number of texts handed to ``parallel_augmentation`` per call.
        src_lang: Source language forwarded to ``parallel_augmentation``.
        tgt_lang: Pivot language forwarded to ``parallel_augmentation``.
        back_lang: Final language forwarded to ``parallel_augmentation``.

    Returns:
        A flat list with the results of every batch, concatenated in batch order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # one silently yields no batches; reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        results.extend(parallel_augmentation(batch_texts, src_lang, tgt_lang, back_lang))
    return results


In [13]:
!pip install transformers
from transformers import TFT5ForConditionalGeneration, MarianMTModel

# Model Creation
# Loads the "t5-base" checkpoint through the TensorFlow T5 class.
# NOTE(review): the variable name suggests a summarization model; no cell visible here uses it — confirm.
model_summary = TFT5ForConditionalGeneration.from_pretrained("t5-base")
# Loads the "Helsinki-NLP/opus-mt-en-fr" Marian checkpoint.
# NOTE(review): back_translate obtains its own models via load_translation_model
# and does not reference this instance.
model_translation = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [14]:
# Demo: run a single sentence through back_translate and print both versions.
# Sample text to test
text = "This is another test sentence."

# Define languages for back translation
src_lang = "en"    # Source language
tgt_lang = "fr"    # Target language for first translation
back_lang = "en"   # Language to translate back into

# Perform back translation (src_lang -> tgt_lang -> back_lang)
translated_text = back_translate(text, src_lang=src_lang, tgt_lang=tgt_lang, back_lang=back_lang)

# Print the original and back-translated text
print("Original Text:", text)
print("Back-Translated Text:", translated_text)




Original Text: This is another test sentence.
Back-Translated Text: That's another test phrase.
