In [1]:
!pip install datasets transformers[sentencepiece] sacrebleu sacremoses -q

# Import Libraries

In [2]:
import os 
import sys
import transformers
import tensorflow as tf
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

2024-07-14 15:38:29.389747: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-14 15:38:29.389867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-14 15:38:29.539539: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Initialize pretrained model
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# Load Dataset

In [4]:
# Load dataset from huggingface
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

Downloading readme:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [9]:
raw_datasets['test'][0]

{'translation': {'en': 'A black box in your car?',
  'hi': 'आपकी कार में ब्लैक बॉक्स?'}}

# Preprocess Dataset

In [10]:
# load tokenizer from pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
# tokenizing english sentences
tokenizer(raw_datasets['test'][0]['translation']['en'])

{'input_ids': [238, 3078, 3613, 21, 85, 3869, 22, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# tokenizing hindi sentences
tokenizer(text_target=raw_datasets['test'][0]['translation']['hi'])

{'input_ids': [1281, 3444, 11, 20501, 8323, 22, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [13]:
raw_datasets['test'][0]['translation']['hi']

'आपकी कार में ब्लैक बॉक्स?'

In [14]:
raw_datasets['test'][0]['translation']['en']

'A black box in your car?'

In [15]:
# define preprocessing function

max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"

def preprocess(data):
    inputs = [ex[source_lang] for ex in data['translation']]
    targets = [ex[target_lang] for ex in data['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding='max_length', truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [16]:
# check preprocessing
preprocess(raw_datasets['train'][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949], [32643, 28541, 36253, 0, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 619

In [None]:
# apply preprocessing to the datasets
tokenized_datasets = raw_datasets.map(preprocess, batched=True)

In [18]:
# Load the pretrained model
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.summary()

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  76381184  
                                                                 
 final_logits_bias (BiasLay  multiple                  61950     
 er)                                                             
                                                                 
Total params: 76443134 (291.61 MB)
Trainable params: 76381184 (291.37 MB)
Non-trainable params: 61950 (241.99 KB)
_________________________________________________________________


In [28]:
# Define parameters
batch_size = 8
num_samples = 100000  # Number of samples to select
learning_rate = 2e-5
weight_decay = 0.01
num_epochs = 10

In [29]:
# Prepare data collators
data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="tf")
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="tf", pad_to_multiple_of=128)

# Prepare datasets
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Select 100k samples using take() method
train_dataset = train_dataset.take(num_samples // batch_size)

# Shuffle the dataset again after taking the samples
train_dataset = train_dataset.shuffle(buffer_size=num_samples)

validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=generation_data_collator,
)


In [30]:
# Define and compile the model 
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.summary()

# Define optimizer
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Model: "tf_marian_mt_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  76381184  
                                                                 
 final_logits_bias (BiasLay  multiple                  61950     
 er)                                                             
                                                                 
Total params: 76443134 (291.61 MB)
Trainable params: 76381184 (291.37 MB)
Non-trainable params: 61950 (241.99 KB)
_________________________________________________________________


In [None]:
# Train the model
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_epochs)

# Save the model
model.save_pretrained("langGPT/")

Epoch 1/10
Epoch 2/10
Epoch 3/10

# Model Testing

In [5]:
# Load the pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("langGPT/")

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at langGPT/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [4]:
# check generation

input_text = "My name is Srikar V"

input_ids = tokenizer(input_text, return_tensors="tf").input_ids
outputs = model.generate(input_ids)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['मेरा नाम श्रीकर वी है।']


# BLEU Score Evaluation

In [38]:
# use test dataset
test_dataset = raw_datasets['test'][:500]

In [37]:
test_dataset

Dataset({
    features: ['translation'],
    num_rows: 2507
})

In [34]:
# define translation function
def translate_texts(texts):
    input_ids = tokenizer(texts, return_tensors="tf", padding=True, truncation=True, max_length=128).input_ids
    outputs = model.generate(input_ids)
    translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(translations)
    return translations

In [39]:
# calculate BLEU score

import sacrebleu

references = []
hypotheses = []

for data in tqdm(test_dataset['translation']):
    source_text = data['en']  # Replace 'en' with the appropriate key
    reference_text = data['hi']  # Replace 'hi' with the appropriate key
    
    references.append([reference_text])
    hypotheses.append(translate_texts([source_text])[0])

# Calculate the BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, references)

print(f"BLEU score: {bleu.score}")

  0%|          | 0/500 [00:00<?, ?it/s]

['आपकी कार में काला बॉक्स?']
['जैसा कि अमेरिका के सड़क योजनाकारों के लिए नकदी का प्रयास करते हैं, बहुत से लोग एक छोटे काले बक्से में एक समाधान देखने लगे हैं जो आपकी कार के दमन बोर्ड द्वारा अच्छी तरह से फिट किया गया है।']
['यह यंत्र, जो हर मील का एक मोटरकार ड्राइव का ट्रैक करता है और ब्यूरो को सूचना भेजता है, वह वाशिंगटन में विवादास्पद प्रयास और राज्य नियोजन कार्यालयों में अमेरिका की प्रमुख सड़कों के वित्तपोषण के लिए पुरानी प्रणाली को अधिरोपित करने के लिए एक विवादपूर्ण प्रयास का केंद्र है।']
['राजमार्ग योजना का आमतौर पर नीरस क्षेत्र अचानक तीव्र बहस और रंगीन संयोजनों को जन्म देता है।']
['लिबर्टीगरों ने सरकार के साथ सहयोग किया है कि वह सरकार को उन छोटे-छोटे बॉक्सों का उपयोग करने की अनुमति दे, जिनसे कि आप मीलों की दूरी तय कर सकें, और संभवत: जहां आप उन्हें चला रहे हों वहां कर बिल बनाने के लिए सूचना का उपयोग करें।']
['चाय की पार्टी बहुत खराब होती है।']
['अमेरिकी नागरिक मुक्ति संघ के संबंध में भी बहुत चिंता की जाती है, जो विभिन्न प्रकार की गोपनीयतापूर्ण मुद्दों को जन्म देता है।']
['और जबकि का