# Machine Translation

### Dowloading and Saving Model.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-1.3B"
save_directory = "./nllb-200-distilled-1.3B"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Save the tokenizer and model locally
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


### Translation Frame Work.

In [None]:
import torch
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import math
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

save_directory = "./nllb-200-distilled-1.3B"
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

def translate(text, src_lang, tgt_lang):

    inputs = tokenizer(text, return_tensors="pt")
    translated_tokens = model.generate(**inputs,forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    return translated_text

In [None]:
src_text = "There's a hot new club in town that everyone is itching to gain entrance into."
src_lang = "eng_Latn" 
tgt_lang = "tel_Telu"

translated_text = translate(src_text, src_lang, tgt_lang)
print(translated_text)

### For longer texts

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Define the path to the model
model_id = "facebook/nllb-200-distilled-1.3B"
local_model_path = model_id

# Load the tokenizer and model from the local directory
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(local_model_path).to(device)

def translate_text(text, src_lang, tgt_lang, max_length=512):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="longest").to(device)
    translated_tokens = model.generate(
        inputs["input_ids"],
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
        num_beams=4,
        early_stopping=True
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

def chunk_text(text, max_tokens=512):
    tokens = tokenizer.encode(text, return_tensors="pt")[0]
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i + max_tokens]
        chunks.append(tokenizer.decode(chunk))
    return chunks

def iterative_translation(text, src_lang, tgt_lang, target_word_count=100):
    while True:
        chunks = chunk_text(text)
        translations = [translate_text(chunk, src_lang, tgt_lang) for chunk in chunks]
        combined_translation = " ".join(translations)
        
        word_count = len(combined_translation.split())
        if word_count <= target_word_count:
            return combined_translation
        
        text = combined_translation

Using device: cuda




In [11]:
# Language usage - https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200
src_text = """There's a hot new club in town that everyone is itching to gain entrance into. 
The entry is free, and there are no membership fees or exclusive conditions. 
Literally all you have to do, is wake up and show up. The majority of us already know the importance of starting our day right, 
but getting up at 5 am might sound a little too "alarming." However, if you want to maximize your productivity, 
and show up with the right energy and mindset for what matters, then you might want to join the 5 am Club. 
The 5 am Club, is a self-help parable that shows us how to embrace a revolutionary morning routine, that delivers results. 
World-famous productivity and leadership expert, Robin Sharma teaches us how to use the first 
hour of our day to harness our creative capacity, protect our sanity, and drive personal growth. 
Author Robin Sharma believes that when we rise at 5 am, when the world is quiet and devoid of energy-sapping distractions, 
this is when we learn to master ourselves. """
src_lang = "eng_Latn"
tgt_lang = "hin_Deva"

translated_text = iterative_translation(src_text, src_lang, tgt_lang)
print(translated_text)


KeyboardInterrupt: 

In [4]:
import gc
del model
gc.collect()
print("Model has been removed from the device and memory is freed.")

NameError: name 'model' is not defined