## Installing Dependencies

In [1]:
! pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org <package_name>
! pip install datasets sacrebleu torch transformers sentencepiece transformers[sentencepiece]
! pip install accelerate -U

/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: ` pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org <package_name>'
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Required Imports

In [3]:
import warnings
import numpy as np
import pandas as pd

import torch
import transformers

from datasets import Dataset
from datasets import load_metric

from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

warnings.filterwarnings("ignore")

## Constants

In [4]:
BATCH_SIZE = 16
BLEU = "bleu"
ENGLISH = "en"
ENGLISH_TEXT = "english_text"
EPOCH = "epoch"
INPUT_IDS = "input_ids"
GEN_LEN = "gen_len"
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-hi-en"
MODEL_NAME = MODEL_CHECKPOINT.split("/")[-1]
LABELS = "labels"
PREFIX = ""
HINDI = "hi"
HINDI_TEXT = "hindi_text"
SCORE = "score"
SOURCE_LANG = "hi"
TARGET_LANG = "en"
TRANSLATION = "translation"
UNNAMED_COL = "Unnamed: 0"

## Helper Functions

In [5]:
def postprocess_text(preds: list, labels: list) -> tuple:
    """Performs post processing on the prediction text and labels"""

    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def prep_data_for_model_fine_tuning(source_lang: list, target_lang: list) -> list:
    """Takes the input data lists and converts into translation list of dicts"""

    data_dict = dict()
    data_dict[TRANSLATION] = []

    for sr_text, tr_text in zip(source_lang, target_lang):
        temp_dict = dict()
        temp_dict[HINDI] = sr_text
        temp_dict[ENGLISH] = tr_text

        data_dict[TRANSLATION].append(temp_dict)

    return data_dict


def generate_model_ready_dataset(dataset: list, source: str, target: str,
                                 model_checkpoint: str,
                                 tokenizer: AutoTokenizer):
    """Makes the data training ready for the model"""

    preped_data = []

    for row in dataset:
        inputs = PREFIX + row[source]
        targets = row[target]

        model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH,
                                 truncation=True, padding=True)

        model_inputs[TRANSLATION] = row

        # setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=MAX_INPUT_LENGTH,
                                 truncation=True, padding=True)
            model_inputs[LABELS] = labels[INPUT_IDS]

        preped_data.append(model_inputs)

    return preped_data



def compute_metrics(eval_preds: tuple) -> dict:
    """computes bleu score and other performance metrics """

    metric = load_metric("sacrebleu")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {BLEU: result[SCORE]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result[GEN_LEN] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [6]:
def create_parallel_corpus(file1_path, file2_path, output_path, delimiter='\t'):
    # Read the contents of the text files
    with open(file1_path, 'r', encoding='utf-8') as f1, open(file2_path, 'r', encoding='utf-8') as f2:
        sentences1 = f1.readlines()
        sentences2 = f2.readlines()

    # Ensure the number of sentences in both files is the same
    if len(sentences1) != len(sentences2):
        raise ValueError("The number of sentences in the input files does not match.")

    # Write aligned sentences to the output file
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for sent1, sent2 in zip(sentences1, sentences2):
            outfile.write(sent1.strip() + delimiter + sent2.strip() + '\n')

# Example usage:
file1_path = '/content/drive/MyDrive/hindi_452.txt'
file2_path = '/content/drive/MyDrive/English_452.txt'
output_path = '/content/drive/MyDrive/parallel_corpus.txt'
# output_path = '/content/drive/MyDrive/parallel_corpus_452_.txt'


create_parallel_corpus(file1_path, file2_path, output_path)

## Loading and Preparing The Dataset

In [7]:
translation_data = pd.read_csv(output_path,sep='\t', header= None)
# translation_data = translation_data.drop([UNNAMED_COL], axis=1)
translation_data

Unnamed: 0,0,1
0,वो वरिष्ठ प्रशासन अधिकारी है.,He/She is a senior administrative officer.
1,वो स्थानीय कांग्रेसी है.,He/She is a local Congress party member.
2,वो संसद के सदस्य है.,He/She is a member of parliament.
3,वो वरिष्ठ कॉर्पोरेट कार्यकारी है.,He/She is a senior corporate executive.
4,वो प्रबंधन का समर्थन प्रबंधक है.,He/She is a management support manager.
...,...,...
96,वो अपने परिवार के साथ खुशियों का मनाती है।,She celebrates happiness with her family.
97,वो अपनी बेटी को संगीत कक्षा में डाल रही है।,She is enrolling her daughter in music class.
98,वो अपने पति के साथ फिल्म देखने जा रही है।,She is going to watch a movie with her husband.
99,वो अपने परिवार का ख्याल रखती है।,She takes care of her family.


## Train, Test & Validation Split of Data

In [8]:
# translation_data = translation_data[:53]
translation_data.shape

(101, 2)

In [9]:
X = translation_data[0]
y = translation_data[1]

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.10,
                                                    shuffle=True,
                                                    random_state=100)

print("INITIAL X-TRAIN SHAPE: ", x_train.shape)
print("INITIAL Y-TRAIN SHAPE: ", y_train.shape)
print("X-TEST SHAPE: ", x_test.shape)
print("Y-TEST SHAPE: ", y_test.shape)

INITIAL X-TRAIN SHAPE:  (90,)
INITIAL Y-TRAIN SHAPE:  (90,)
X-TEST SHAPE:  (11,)
Y-TEST SHAPE:  (11,)


In [11]:
#x_train = x_train[:1]
#y_train = y_train[:1]

In [12]:
x_train.shape
y_train.shape

(90,)

In [13]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=100)

print("FINAL X-TRAIN SHAPE: ", x_train.shape)
print("FINAL Y-TRAIN SHAPE: ", y_train.shape)
print("X-VAL SHAPE: ", x_val.shape)
print("Y-VAL SHAPE: ", y_val.shape)

FINAL X-TRAIN SHAPE:  (72,)
FINAL Y-TRAIN SHAPE:  (72,)
X-VAL SHAPE:  (18,)
Y-VAL SHAPE:  (18,)


## Load Tokenizer from AutoTokenizer Class

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

## Prepare the model ready dataset

In [15]:
training_data = prep_data_for_model_fine_tuning(x_train.values, y_train.values)

validation_data = prep_data_for_model_fine_tuning(x_val.values, y_val.values)

test_data = prep_data_for_model_fine_tuning(x_test.values, y_test.values)

In [None]:
print(training_data)

In [16]:
train_data = generate_model_ready_dataset(dataset=training_data[TRANSLATION],
                                          tokenizer=tokenizer,
                                          source=HINDI,
                                          target=ENGLISH,
                                          model_checkpoint=MODEL_CHECKPOINT)

validation_data = generate_model_ready_dataset(dataset=validation_data[TRANSLATION],
                                               tokenizer=tokenizer,
                                               source=HINDI,
                                               target=ENGLISH,
                                               model_checkpoint=MODEL_CHECKPOINT)

test_data = generate_model_ready_dataset(dataset=test_data[TRANSLATION],
                                               tokenizer=tokenizer,
                                               source=HINDI,
                                               target=ENGLISH,
                                               model_checkpoint=MODEL_CHECKPOINT)

In [17]:
train_df = pd.DataFrame.from_records(train_data)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  72 non-null     object
 1   input_ids       72 non-null     object
 2   labels          72 non-null     object
 3   translation     72 non-null     object
dtypes: object(4)
memory usage: 2.4+ KB


In [18]:
validation_df = pd.DataFrame.from_records(validation_data)
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  18 non-null     object
 1   input_ids       18 non-null     object
 2   labels          18 non-null     object
 3   translation     18 non-null     object
dtypes: object(4)
memory usage: 704.0+ bytes


In [19]:
test_df = pd.DataFrame.from_records(test_data)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  11 non-null     object
 1   input_ids       11 non-null     object
 2   labels          11 non-null     object
 3   translation     11 non-null     object
dtypes: object(4)
memory usage: 480.0+ bytes


## Convert dataframe to Dataset Class object

In [20]:
train_dataset = Dataset.from_pandas(train_df)
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 72
})

In [21]:
validation_dataset = Dataset.from_pandas(validation_df)
validation_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 18
})

In [22]:
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 11
})

## Load model, Create Model Training Args and Data Collator

In [23]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [24]:
pip install accelerate -U



In [25]:
model_args = Seq2SeqTrainingArguments(
    f"{MODEL_NAME}-finetuned-{SOURCE_LANG}-to-{TARGET_LANG}",
    evaluation_strategy=EPOCH,
    learning_rate=2e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    max_grad_norm=1.0,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Fine Tuning the Model


In [27]:
trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.917975,51.7761,9.8889
2,No log,0.802549,59.0088,9.0
3,No log,0.765793,62.5002,8.9444


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

TrainOutput(global_step=15, training_loss=0.7680944442749024, metrics={'train_runtime': 84.8009, 'train_samples_per_second': 2.547, 'train_steps_per_second': 0.177, 'total_flos': 682204594176.0, 'train_loss': 0.7680944442749024, 'epoch': 3.0})

## Saving the Fine Tuned Transformer

In [29]:
trainer.save_model("FineTunedTransformer")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}


## Perform Translation on Test Datset

In [30]:
test_results = trainer.predict(test_dataset)

In [31]:
print("Test Bleu Score: ", test_results.metrics["test_bleu"])

Test Bleu Score:  55.7909


## Generate Prediction Sentences

In [32]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61127, 512, padding_idx=61126)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61127, 512, padding_idx=61126)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [33]:
predictions = []
test_input = test_dataset[TRANSLATION]

for input_text in tqdm(test_input):
    source_sentence = input_text[HINDI]
    encoded_source = tokenizer(source_sentence,
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
    encoded_source.to(device)  # Move input tensor to the same device as the model

    translated = model.generate(**encoded_source)

    predictions.append([tokenizer.decode(t, skip_special_tokens=True) for t in translated][0])

# Move the model back to CPU if needed
model.to("cpu")

100%|██████████| 11/11 [00:13<00:00,  1.21s/it]


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61127, 512, padding_idx=61126)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61127, 512, padding_idx=61126)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [34]:
y_true_en = []
y_true_pt = []

for input_text in tqdm(test_input):
    y_true_pt.append(input_text[HINDI])
    y_true_en.append(input_text[ENGLISH])

100%|██████████| 11/11 [00:00<00:00, 18171.46it/s]


In [35]:
output_df = pd.DataFrame({"y_true_port": y_true_pt, "y_true_eng": y_true_en, "predicted_text": predictions})
output_df

Unnamed: 0,y_true_port,y_true_eng,predicted_text
0,वह वीवर है.,He/She is a weaver.,He/She is a television manager.
1,वो अपने परिवार के साथ खुशियों का मनाता है,He celebrates happiness with his family.,She celebrates happiness with her family.
2,वे अन्वेषक है.,He/She is an investigator.,He/She is a manager.
3,वो अपनी बहन को एक किताब दे रही है।,She is giving a book to her sister.,She is giving a book to her sister.
4,वह वेयरहाउस है.,It is a warehouse.,He/She is a manager.
5,वे कार्यालय सहायक है.,He/She is an office assistant.,He/She is the office assistant.
6,वह वॉचमेकर है.,He/She is a watchmaker.,He/She is a watchman.
7,वे क्रेडिट विश्लेषक है.,He/She is a credit analyst.,He/She is a credit reporter.
8,वहाँ उसने अपने बच्चों के साथ बालकमी खेली,"There, she played hopscotch with her children.","There, she played with her children."
9,वे ग्राहक एजेंट है.,He/She is a customer agent.,He/She is a client agent.


In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("FineTunedTransformer")
model = AutoModelForSeq2SeqLM.from_pretrained("FineTunedTransformer")

# Example usage: Translate a sentence
# sentence1 = "वह एक डॉक्टर है"
# sentence2 = "वह पानी पी रही है"

# sentence1 = "उसकी बेटी उसके साथ गाना गाती है	"
# sentence2 = "वो एक इंजीनियर है"

# sentence1 = "वो मैकेनिक है"
# sentence2 = "वो खाना बना रही है"

sentence1 = ""
sentence2 = "वो एक नर्स है"







input1 = tokenizer(sentence1, return_tensors="pt")
translated1 = model.generate(**input1)
translated_sentence1 = tokenizer.decode(translated1[0], skip_special_tokens=True)

input2 = tokenizer(sentence2, return_tensors="pt")
translated2 = model.generate(**input2)
translated_sentence2 = tokenizer.decode(translated2[0], skip_special_tokens=True)

print("Translated sentence:", translated_sentence1)
print("Translated sentence:", translated_sentence2)


Translated sentence: He/She is a singer.
Translated sentence: He/She is a nurse.
