# Download Modules

In [None]:
!pip install transformers sentencepiece datasets

In [None]:
from datasets import load_dataset
# from google.colab import drive
from IPython.display import display
# from IPython.html import widgets
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

# Apply seaborn's default theme so later matplotlib plots pick it up.
sns.set()

In [None]:
from datasets import DatasetDict, Dataset

# File paths
# One English file and one Manipuri file per split (train / valid / test).
# read_data() pairs the two files of a split line by line.
train_english_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-train-en.txt"
train_manipuri_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-train-mni.txt"
valid_english_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-valid-en.txt"
valid_manipuri_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-valid-mni.txt"
test_english_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-test-en.txt"
test_manipuri_path = "/kaggle/input/wmtdata/IndicNECorp1.0/English-Manipuri/parallel/en-mni-test-mni.txt"

# Function to read and process data
def read_data(english_path, manipuri_path):
    """Load a line-aligned English/Manipuri corpus into a `Dataset`.

    Line *i* of ``english_path`` is paired with line *i* of
    ``manipuri_path``.

    Args:
        english_path: Path to the UTF-8 text file with English sentences.
        manipuri_path: Path to the UTF-8 text file with Manipuri sentences.

    Returns:
        A `Dataset` with a single ``translation`` column; each row is a dict
        ``{'en': <english line>, 'mn': <manipuri line>}``.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    def _read_lines(path):
        # Iterating the file splits on line endings only and, unlike
        # read().split('\n'), does not produce a spurious empty last entry
        # when the file ends with a newline (which used to add an empty
        # sentence pair to every split).
        with open(path, 'r', encoding='utf-8') as f:
            return [line.rstrip('\n') for line in f]

    english_sentences = _read_lines(english_path)
    manipuri_sentences = _read_lines(manipuri_path)

    # Ensure both files have the same number of sentences. Raised explicitly
    # (not via assert) so the check survives `python -O`.
    if len(english_sentences) != len(manipuri_sentences):
        raise ValueError(
            "The number of sentences in both files should match "
            f"({len(english_sentences)} != {len(manipuri_sentences)}).")

    # Create the dataset
    data = {
        'translation': [
            {'en': en, 'mn': mni}
            for en, mni in zip(english_sentences, manipuri_sentences)
        ]
    }
    return Dataset.from_dict(data)

# Read each split from its pair of parallel text files
train_dataset = read_data(train_english_path, train_manipuri_path)
valid_dataset = read_data(valid_english_path, valid_manipuri_path)
test_dataset = read_data(test_english_path, test_manipuri_path)

# Bundle the three splits under their conventional names
splits_by_name = {
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset,
}
dataset_dict = DatasetDict(splits_by_name)

# Show the split sizes/columns as a quick sanity check
print(dataset_dict)

In [None]:
# Display training rows 4 and 5 to sanity-check the loaded pairs.
dataset_dict['train'][4:6]

In [None]:
from datasets import DatasetDict, Dataset
import re
import unicodedata
import pandas as pd

# Function to clean text
def clean_text(text):
    """Return *text* with markup and non-printable characters removed.

    Steps, in order: drop ``<...>`` tags, turn newline/tab/carriage-return
    into spaces, drop every remaining character whose Unicode general
    category starts with "C" (control, format, ...), and strip surrounding
    whitespace.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Replace whitespace control characters with a space *before* the
    # category filter below. They are category "Cc" themselves, so filtering
    # first deleted them outright and glued neighbouring words together.
    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
    # Remove invisible characters
    # NOTE(review): category "Cf" also covers ZWJ/ZWNJ (U+200D/U+200C), which
    # may be meaningful in Bengali-script text - confirm dropping them is
    # intended.
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    return text.strip()

# Function to clean the translation pair
def clean_translation(example):
    """Clean both sides of one sentence pair in place and return the example.

    Applies `clean_text` to ``example['translation']['en']`` and
    ``example['translation']['mn']``.
    """
    pair = example['translation']
    for lang in ('en', 'mn'):
        pair[lang] = clean_text(pair[lang])
    return example

# Clean both sides of every sentence pair, one split at a time
for split in ('train', 'validation', 'test'):
    cleaned_split = dataset_dict[split].map(clean_translation)
    dataset_dict[split] = cleaned_split

# Materialise each cleaned split as a pandas DataFrame for de-duplication
df_train, df_validation, df_test = (
    dataset_dict[name].to_pandas()
    for name in ('train', 'validation', 'test')
)

# Function to remove duplicates in a DataFrame
def remove_duplicates(df):
    """Return *df* without repeated (en, mn) sentence pairs.

    The first occurrence of each pair is kept and the original index labels
    are preserved. The input frame is not modified.

    Args:
        df: DataFrame with a ``translation`` column holding dicts with
            ``'en'`` and ``'mn'`` keys.

    Returns:
        A new DataFrame containing only the first occurrence of each pair.
    """
    # Compare pairs as (en, mn) tuples. Concatenating the two strings made
    # different pairs collide ('ab' + 'c' == 'a' + 'bc'), and writing the
    # helper column onto `df` mutated the caller's frame.
    pair_keys = df['translation'].apply(lambda pair: (pair['en'], pair['mn']))
    is_repeat = pair_keys.duplicated().to_numpy()
    return df[~is_repeat]

# Remove duplicated sentence pairs
df_train = remove_duplicates(df_train)
df_validation = remove_duplicates(df_validation)
df_test = remove_duplicates(df_test)

# Convert back to Dataset.
# preserve_index=False stops the (now gappy) pandas index from being stored
# as an extra '__index_level_0__' column. Previously that column was removed
# afterwards for train/validation only, so the test split kept it.
for split, frame in (('train', df_train),
                     ('validation', df_validation),
                     ('test', df_test)):
    dataset_dict[split] = Dataset.from_pandas(frame, preserve_index=False)

# Print to verify
print(dataset_dict)


In [None]:
# Shuffle the training split once with a fixed seed (reproducible order) and
# store the result back into the dict; the last line displays the dict.
dataset_dict['train'] = shuffled_dataset = dataset_dict['train'].shuffle(seed=42)
dataset_dict

In [None]:
# from datasets import DatasetDict
# split_datasets = DatasetDict({
#     'train': dataset_dict['train'],
#     'test': dataset_dict2['test'],
#     'validation': dataset_dict3['valid']
# })
# split_datasets

In [None]:
# Use 'google/mt5-small' for non-pro Colab users
model_repo = 'google/mt5-base'
# File the fine-tuned weights (state dict) are saved to / restored from
model_path = '/kaggle/working/Trans/mt5_translation.pt'
# Token length every training sequence is padded/truncated to (see
# transform_batch)
max_seq_len = 20

# Load Tokenizer & Model

In [None]:
# Tokenizer that belongs to the pretrained checkpoint named by model_repo.
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [None]:
# Model description: https://huggingface.co/google/mt5-base
# Load the pretrained seq2seq weights and move them to the GPU in one step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo).cuda()

# Overview and Quick Test

In [None]:
# Smoke test of the pretrained (not yet fine-tuned) model: encode a prompt,
# generate with default settings and turn the generated ids back into text.
# NOTE(review): '<mn>' is only registered as a special token in a later cell
# (tokenizer.add_special_tokens), so at this point it is presumably split
# like ordinary text.
token_ids = tokenizer.encode(
    '<mn> This will be translated to Japanese! (hopefully)',
    return_tensors='pt').cuda()
print(token_ids)

model_out = model.generate(token_ids)
print(model_out)

# ids -> token strings -> one detokenized string
output_text = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(model_out[0]))
print(output_text)

# Steps
1. Load the pretrained model and tokenizer
2. Load dataset
3. Transform dataset into input (entails a minor model change)
4. Train/finetune the model on our dataset
5. Test the model

# Test Tokenizer

In [None]:
# Show how the tokenizer splits a sample sentence: first the ids, then the
# token strings those ids stand for.
example_input_str = '<mn> This is just a test nbuig.'
# example_input_str = 'これは普通のテスト'
input_ids = tokenizer.encode(example_input_str, return_tensors='pt')
print('Input IDs:', input_ids)

# input_ids has a leading batch dimension of 1; [0] selects the sequence
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print('Tokens:', tokens)

In [None]:
# vocab_src_length = len(tokenizer.get_vocab())
# print(f"Length of vocabulary: {vocab_src_length}")

In [None]:
# special_tokens = tokenizer.all_special_tokens
# special_token_ids = tokenizer.convert_tokens_to_ids(special_tokens)

In [None]:
# for token, token_id in zip(special_tokens, special_token_ids):
#     print(f"Token: {token} (ID: {token_id})")

In [None]:
# sorted(tokenizer.vocab.items(), key=lambda x: x[1])

# Prepare Dataset

In [None]:
# `split_datasets` only exists in a commented-out cell, so referencing it
# raised NameError. The cleaned, de-duplicated splits live in `dataset_dict`.
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']

In [None]:
# Display the first training example.
train_dataset[0]

In [None]:
# Maps each language code used as a key in the dataset's 'translation' dicts
# to the tag that encode_input_str() prepends to the source text to request
# that language as the output.
LANG_TOKEN_MAPPING = {
    'en': '<en>',
    'mn': '<mn>',
}

In [None]:
# Register the language tags as special tokens, then resize the model's
# embedding matrix to the enlarged vocabulary.
special_tokens_dict = {
    'additional_special_tokens': [*LANG_TOKEN_MAPPING.values()],
}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  """Encode a source sentence, prefixed with the target-language tag.

  Args:
    text: Source sentence.
    target_lang: Key into `lang_token_map` naming the desired output language.
    tokenizer: Tokenizer whose `encode` method is used.
    seq_len: Length the ids are padded/truncated to.
    lang_token_map: Mapping from language code to tag token.

  Returns:
    The first (only) sequence of the encoded batch.
  """
  tagged_text = lang_token_map[target_lang] + text

  encoded_batch = tokenizer.encode(
      text=tagged_text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len)

  # encode() returns a batch holding a single sequence; hand back that row
  return encoded_batch[0]

def encode_target_str(text, tokenizer, seq_len,
                      lang_token_map=LANG_TOKEN_MAPPING):
  """Encode a target sentence, padded/truncated to `seq_len` ids.

  `lang_token_map` is accepted for signature symmetry with
  `encode_input_str` but is not used here.

  Returns:
    The first (only) sequence of the encoded batch.
  """
  encode_options = dict(
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len)
  return tokenizer.encode(text=text, **encode_options)[0]

def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_len=128):
  """Turn one translation dict into an (input ids, target ids) pair.

  A random ordered pair of distinct languages is drawn from
  `lang_token_map`; the first is the source, the second the target.

  Args:
    translations: Dict mapping language code to sentence.
    lang_token_map: Mapping from language code to tag token.
    tokenizer: Tokenizer passed on to the encode helpers.
    seq_len: Length both sequences are padded/truncated to.

  Returns:
    `(input_token_ids, target_token_ids)`, or None when either sentence
    is None.
  """
  # Pick the translation direction at random
  available_langs = list(lang_token_map.keys())
  source_lang, target_lang = np.random.choice(
      available_langs, size=2, replace=False)

  source_text = translations[source_lang]
  reference_text = translations[target_lang]

  # Skip pairs with a missing side
  if source_text is None or reference_text is None:
    return None

  return (
      encode_input_str(
          source_text, target_lang, tokenizer, seq_len, lang_token_map),
      encode_target_str(
          reference_text, tokenizer, seq_len, lang_token_map),
  )

def transform_batch(batch, lang_token_map, tokenizer):
  """Encode a raw batch into stacked input and target id tensors on the GPU.

  Args:
    batch: Mapping with a 'translation' entry holding one translation dict
      per example.
    lang_token_map: Mapping from language code to tag token.
    tokenizer: Tokenizer passed on to `format_translation_data`.

  Returns:
    `(batch_input_ids, batch_target_ids)`; examples for which
    `format_translation_data` returns None are left out.
  """
  # Encode lazily, in order, with the module-level max_seq_len
  encoded_pairs = (
      format_translation_data(
          translation_set, lang_token_map, tokenizer, max_seq_len)
      for translation_set in batch['translation'])
  usable_pairs = [pair for pair in encoded_pairs if pair is not None]

  # Give each sequence a leading batch axis so the rows can be concatenated
  input_rows = [source_ids.unsqueeze(0) for source_ids, _ in usable_pairs]
  target_rows = [target_ids.unsqueeze(0) for _, target_ids in usable_pairs]

  return torch.cat(input_rows).cuda(), torch.cat(target_rows).cuda()

def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  """Yield encoded batches from a freshly shuffled copy of `dataset`.

  Each yielded item is the result of `transform_batch` on up to
  `batch_size` consecutive examples of the shuffled data.
  """
  shuffled = dataset.shuffle()
  batch_starts = range(0, len(shuffled), batch_size)
  for start in batch_starts:
    stop = start + batch_size
    yield transform_batch(shuffled[start:stop], lang_token_map, tokenizer)

In [None]:
# Encode the first training pair (seq_len falls back to the function's
# default of 128 here) and print both sides as token strings.
in_ids, out_ids = format_translation_data(
    train_dataset[0]['translation'], LANG_TOKEN_MAPPING, tokenizer)
print(' '.join(tokenizer.convert_ids_to_tokens(in_ids)))
print(' '.join(tokenizer.convert_ids_to_tokens(out_ids)))

# Pull one batch of 8 from the generator and check the tensor shapes.
data_gen = get_data_generator(train_dataset, LANG_TOKEN_MAPPING, tokenizer, 8)
data_batch = next(data_gen)
print('Input shape:', data_batch[0].shape)
print('Output shape:', data_batch[1].shape)

# Train/Finetune mT5

In [None]:
import os
# Create the checkpoint directory; exist_ok avoids an error on re-runs.
os.makedirs('/kaggle/working/Trans/', exist_ok=True)

In [None]:
import os
# Resume from an earlier checkpoint when one is present in the working dir.
directory_path = '/kaggle/working/Trans/'
print("Contents of the directory:", os.listdir(directory_path))

model_path = os.path.join(directory_path, 'mt5_translation.pt')
print("Model path:", model_path)

checkpoint_missing = not os.path.exists(model_path)
if checkpoint_missing:
    print("File not found at the specified path.")
else:
    model.load_state_dict(torch.load(model_path))


In [None]:
# Training hyperparameters
n_epochs = 16
batch_size = 16
print_freq = 50          # log the running loss every `print_freq` batches
checkpoint_freq = 1000   # evaluate + save every `checkpoint_freq` batches
lr = 5e-4
# Batches per epoch (the last one may be smaller) and overall step count,
# which the LR scheduler needs up front.
n_batches = int(np.ceil(len(train_dataset) / batch_size))
total_steps = n_epochs * n_batches
# Warm up over the first 1% of all steps
n_warmup_steps = int(total_steps * 0.01)

In [None]:
# Optimizer
# `transformers.AdamW` is deprecated in favour of PyTorch's implementation,
# reachable through the already imported `torch.optim`. eps and weight_decay
# are set to the values `transformers.AdamW` used by default, so the update
# rule stays the same.
optimizer = optim.AdamW(
    model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# Linear warm-up for n_warmup_steps, then linear decay over total_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, n_warmup_steps, total_steps)

In [None]:
# Per-batch training losses; appended to by the training loop and used for
# the running average in the log line and for the loss plot.
losses = []

In [None]:
def eval_model(model, gdataset, max_iters=8):
  """Return the mean loss of `model` over a few batches of `gdataset`.

  Batches come from `get_data_generator` with the module-level `tokenizer`
  and `batch_size`.

  Args:
    model: Seq2seq model to evaluate.
    gdataset: Dataset to draw evaluation batches from.
    max_iters: Maximum number of batches to evaluate.

  Returns:
    Mean batch loss, or NaN when no batch was evaluated.
  """
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  pad_id = tokenizer.pad_token_id
  eval_losses = []

  # Evaluate with dropout off and without building autograd graphs, then
  # put the model back into whatever mode the caller had it in.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for i, (input_batch, label_batch) in enumerate(test_generator):
        if i >= max_iters:
          break

        # Keep padding out of both the encoder attention and the loss
        # (-100 is the label value the loss ignores).
        model_out = model(
            input_ids=input_batch,
            attention_mask=(input_batch != pad_id).long(),
            labels=label_batch.masked_fill(label_batch == pad_id, -100))
        eval_losses.append(model_out.loss.item())
  finally:
    model.train(was_training)

  if not eval_losses:
    # np.mean([]) would also give NaN, but with a RuntimeWarning
    return float('nan')
  return np.mean(eval_losses)

In [None]:
# Fine-tuning loop: one pass over freshly shuffled, randomly-directed
# batches per epoch, with periodic logging and checkpointing.
for epoch_idx in range(n_epochs):
  # from_pretrained() hands the model back in eval mode; switch to training
  # mode so dropout is active while fine-tuning.
  model.train()

  # Randomize data order
  data_generator = get_data_generator(train_dataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)

  for batch_idx, (input_batch, label_batch) \
      in tqdm_notebook(enumerate(data_generator), total=n_batches):
    optimizer.zero_grad()

    # Every sequence is padded to a fixed length: hide the padding from the
    # encoder, and set padded label positions to -100 so the loss skips them.
    attention_mask = (input_batch != tokenizer.pad_token_id).long()
    labels = label_batch.masked_fill(
        label_batch == tokenizer.pad_token_id, -100)

    # Forward pass (via __call__ rather than .forward so module hooks run)
    model_out = model(
        input_ids=input_batch,
        attention_mask=attention_mask,
        labels=labels)

    # Calculate loss and update weights
    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))

    # Periodic checkpoint, reporting the current loss on the test split
    if (batch_idx + 1) % checkpoint_freq == 0:
      test_loss = eval_model(model, test_dataset)
      print('Saving model with test loss of {:.3f}'.format(test_loss))
      torch.save(model.state_dict(), model_path)

# Final save once all epochs are done
torch.save(model.state_dict(), model_path)

In [None]:
!pip install huggingface_hub

In [None]:
# Export model and tokenizer side by side into ./Trans2.
model.save_pretrained('./Trans2')
tokenizer.save_pretrained('./Trans2')

In [None]:
import shutil
# Zip the exported directory into mt5modelFinal.zip for download.
# NOTE(review): assumes ./Trans2 (written above) resolves to
# /kaggle/working/Trans2, i.e. the notebook runs from /kaggle/working.
shutil.make_archive('mt5modelFinal', 'zip', '/kaggle/working/Trans2')

In [None]:
from IPython.display import FileLink
# Render a clickable link to the archive in the notebook output.
FileLink(r'mt5modelFinal.zip')

In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None makes
# login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from datasets import DatasetDict, Dataset
from huggingface_hub import HfApi, Repository

# Upload model and tokenizer to the same Hub repository.
repo_name = "ABHIiiii1/mt5-Finetuned-Bi-En-Mn-trans2"
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

In [None]:
# Graph the loss

# Moving average over `window_size` consecutive batch losses. The `+ 1`
# includes the final full window, which the previous loop bound skipped.
window_size = 50
smoothed_losses = [
    np.mean(losses[i:i + window_size])
    for i in range(len(losses) - window_size + 1)
]

# The first 100 smoothed points are left out of the plot
plt.plot(smoothed_losses[100:])

# Manual Testing

In [None]:
test_sentence = dataset_dict['validation'][0]['translation']['en']
print('Raw input text:', test_sentence)

# Encode to the same length used during training (max_seq_len).
# model.config.max_length is generate()'s default *output* length, not an
# input-length setting, so it should not drive input truncation.
input_ids = encode_input_str(
    text = test_sentence,
    target_lang = 'mn',
    tokenizer = tokenizer,
    seq_len = max_seq_len,
    lang_token_map = LANG_TOKEN_MAPPING)
# Add the batch dimension and move to the GPU
input_ids = input_ids.unsqueeze(0).cuda()

print('Truncated input text:', tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(input_ids[0])))

In [None]:
# Beam search with 10 beams, returning 3 candidate sequences; print each
# candidate decoded without special tokens.
output_tokens = model.generate(input_ids, num_beams=10, num_return_sequences=3)
# print(output_tokens)
for token_set in output_tokens:
  print(tokenizer.decode(token_set, skip_special_tokens=True))

In [None]:
#@title Slick Blue Translate
# The form's language choices now match the keys of LANG_TOKEN_MAPPING; the
# previous option list ("en", "ja", "zh") did not contain the default "mn".
input_text = 'A surfboarder ran into a shark' #@param {type:"string"}
output_language = 'mn' #@param ["en", "mn"]

# Encode to the training sequence length (max_seq_len) instead of
# model.config.max_length, which is generate()'s default output length.
input_ids = encode_input_str(
    text = input_text,
    target_lang = output_language,
    tokenizer = tokenizer,
    seq_len = max_seq_len,
    lang_token_map = LANG_TOKEN_MAPPING)
input_ids = input_ids.unsqueeze(0).cuda()

output_tokens = model.generate(input_ids, num_beams=20, length_penalty=0.2)
print(input_text + '  ->  ' + \
      tokenizer.decode(output_tokens[0], skip_special_tokens=True))

# BLEU

In [None]:
def translate(input_text, output_language, model, tokenizer, lang_token_map):
    """Translate one sentence into `output_language` with beam search.

    Args:
        input_text: Source sentence.
        output_language: Key into `lang_token_map` naming the target language.
        model: Fine-tuned seq2seq model (expected on the GPU).
        tokenizer: Tokenizer matching `model`.
        lang_token_map: Mapping from language code to tag token.

    Returns:
        The best beam, decoded without special tokens.
    """
    # Encode to the training sequence length (module-level max_seq_len).
    # model.config.max_length is generate()'s default output length and is
    # unrelated to how inputs were padded/truncated during training.
    input_ids = encode_input_str(
        text = input_text,
        target_lang = output_language,
        tokenizer = tokenizer,
        seq_len = max_seq_len,
        lang_token_map = lang_token_map)
    # Add the batch dimension and move to the GPU
    input_ids = input_ids.unsqueeze(0).cuda()
    output_tokens = model.generate(input_ids, num_beams=20, length_penalty=0.2)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

In [None]:
input_text = 'ৱাহংবসি করিনো হায়বসি করিনো লৈ'
# Translate the sample sentence above into English.
output_language = 'en'
print(translate(input_text, output_language, model, tokenizer, LANG_TOKEN_MAPPING))

In [None]:
# Translate an English sample sentence into Manipuri ('mn').
input_text = 'hello, how are you?'
output_language = 'mn'
print(translate(input_text, output_language, model, tokenizer, LANG_TOKEN_MAPPING))

In [None]:
# Validation split used for the scored translations below.
val_data = dataset_dict['validation']

In [None]:
def translate_texts(translator, dataset, src_lang='mn', tgt_lang='en'):
    """Translate every example of `dataset` and collect the references.

    Uses the module-level `model`, `tokenizer` and `LANG_TOKEN_MAPPING`.

    Args:
        translator: Callable invoked as
            ``translator(text, tgt_lang, model, tokenizer, LANG_TOKEN_MAPPING)``.
        dataset: Iterable of examples with a ``translation`` dict.
        src_lang: Key of the source sentence (default: Manipuri).
        tgt_lang: Key of the reference sentence and requested output
            language (default: English).

    Returns:
        ``(tgt_texts, trans_texts)``: reference sentences and model
        translations, in dataset order.
    """
    tgt_texts, trans_texts = [], []

    for data in dataset:
        pair = data['translation']
        translated_text = translator(
            pair[src_lang], tgt_lang, model, tokenizer, LANG_TOKEN_MAPPING)
        tgt_texts.append(pair[tgt_lang])
        trans_texts.append(translated_text)

    return tgt_texts, trans_texts

In [None]:
# Manipuri -> English over the validation split. The previous line called
# `dataset_dict(...)`, but a DatasetDict is not callable; the helper that
# takes (translator, dataset) is translate_texts.
tgt_texts2, trans_texts2 = translate_texts(translate, val_data)

In [None]:
file_name = "Bi_Mn-En_pred1.txt"

# One prediction per line. Written explicitly as UTF-8 because the scoring
# cell reads the file back as UTF-8 and the platform default may differ.
with open(file_name, "w", encoding="utf-8") as file:
    for item in trans_texts2:
        file.write("%s\n" % item)

In [None]:
file_name = "Bi_Mn-En_tgt1.txt"

# One reference per line. Written explicitly as UTF-8 because the scoring
# cell reads the file back as UTF-8 and the platform default may differ.
with open(file_name, "w", encoding="utf-8") as file:
    for item in tgt_texts2:
        file.write("%s\n" % item)

In [None]:
def translate_texts(translator, dataset, src_lang='en', tgt_lang='mn'):
    """Translate every example of `dataset` and collect the references.

    Uses the module-level `model`, `tokenizer` and `LANG_TOKEN_MAPPING`.

    Args:
        translator: Callable invoked as
            ``translator(text, tgt_lang, model, tokenizer, LANG_TOKEN_MAPPING)``.
        dataset: Iterable of examples with a ``translation`` dict.
        src_lang: Key of the source sentence (default: English).
        tgt_lang: Key of the reference sentence and requested output
            language (default: Manipuri).

    Returns:
        ``(tgt_texts, trans_texts)``: reference sentences and model
        translations, in dataset order.
    """
    tgt_texts, trans_texts = [], []

    for data in dataset:
        pair = data['translation']
        translated_text = translator(
            pair[src_lang], tgt_lang, model, tokenizer, LANG_TOKEN_MAPPING)
        tgt_texts.append(pair[tgt_lang])
        trans_texts.append(translated_text)

    return tgt_texts, trans_texts

In [None]:
# English -> Manipuri over the validation split (uses the translate_texts
# definition directly above).
tgt_texts1, trans_texts1 = translate_texts(translate, val_data)

In [None]:
file_name = "Bi_En-Mn_pred.txt"

# One prediction per line. These are non-ASCII Manipuri sentences, so the
# encoding is fixed to UTF-8 (matching how the scoring cell reads the file)
# instead of relying on the platform default.
with open(file_name, "w", encoding="utf-8") as file:
    for item in trans_texts1:
        file.write("%s\n" % item)

In [None]:
file_name = "Bi_En-Mn_tgt.txt"

# One reference per line. These are non-ASCII Manipuri sentences, so the
# encoding is fixed to UTF-8 (matching how the scoring cell reads the file)
# instead of relying on the platform default.
with open(file_name, "w", encoding="utf-8") as file:
    for item in tgt_texts1:
        file.write("%s\n" % item)

In [None]:
import evaluate

# Metrics for the Manipuri -> English output
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# NOTE(review): these paths point at a local Downloads folder, not at the
# files written earlier in this notebook - confirm where scoring is run.
with open("C:/Users/Asus/Downloads/Bi_Mn-En_pred1.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open("C:/Users/Asus/Downloads/Bi_Mn-En_tgt1.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction gets a list of references (here exactly one)
new_ref = [[sent] for sent in ref]

# Print BLEU, chrF and TER, in that order
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

In [None]:
# Inspect a few predictions (lines 1-4).
pred[1:5]

In [None]:
# Inspect the matching references (lines 1-4).
new_ref[1:5]

In [None]:
import evaluate

# Metrics for the English -> Manipuri output
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# NOTE(review): these paths point at a local Downloads folder, not at the
# files written earlier in this notebook - confirm where scoring is run.
with open("C:/Users/Asus/Downloads/Bi_En-Mn_pred.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open("C:/Users/Asus/Downloads/Bi_En-Mn_tgt.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction gets a list of references (here exactly one)
new_ref = [[sent] for sent in ref]

# Print BLEU, chrF and TER, in that order
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

In [None]:
# Inspect a few predictions (lines 1-4).
pred[1:5]

In [None]:
# Inspect the matching references (lines 1-4).
new_ref[1:5]