# Task 1: Translation Model Fine-tuning and Deployment

# 1. Imports


In [4]:
import csv
import pandas as pd
pd.set_option('display.max_colwidth', None)
from datasets import load_dataset

# 2. Selecting Datasets


In [5]:
# Path to the dataset files
en_file_path = "../Datasets/WikiMatrix/WikiMatrix.en-hi.en"
hi_file_path = "../Datasets/WikiMatrix/WikiMatrix.en-hi.hi"
output_file_path = "wikimatrix_en_hi.csv"

In [11]:
# Read the files and write to CSV
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file, open(output_file_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["source", "target"])  # Write header
    for en_sentence, hi_sentence in zip(en_file, hi_file):
        writer.writerow([en_sentence.strip(), hi_sentence.strip()])


In [12]:
raw_data = pd.read_csv(output_file_path)
raw_data.tail(5)

Unnamed: 0,source,target
231455,Latin is also an option which can be taken from Year 7.,7 सात वर्ष के बाद खाते से आंशिक निकासी भी की जा सकती है।
231456,Siege Weapons of the Far East: AD 300–1300.,इतिहासकार इसे ईसा पूर्व 3300-1300 का काल मानते हैं।
231457,"Five points in general position suffice to provide these five pieces of information, while four points do not.","उपर्युक्त पाँच स्वयं तथ्यों में से चार तो इतने सरल तथा सप्ष्ट हैं कि इन्हें सिद्ध करना अपने हाथ को अपना सिद्ध करने के बराबर है, परन्तु पाँचवाँ स्वयंतथ्‌य स्वयंसिद्ध सा प्रतीत नहीं होता।"
231458,"Prior to his career in professional wrestling Hellwig was an amateur bodybuilder, competing in a number of NPC contests and winning the 1984 NPC Mr. Georgia crown.","हेलविग अपने पेशेवर कुश्ती कैरियर से पहले शौकिया रूप से एक बॉडी बिल्डर थे, और कई एनपीसी प्रतियोगिताओं में उन्होंने प्रतिस्पर्धा की और 1984 एनपीसी श्री जॉर्जिया का ताज जीता था।"
231459,,


In [13]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231460 entries, 0 to 231459
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   source  231459 non-null  object
 1   target  231459 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [8]:
# Extract three random rows with a new random state
random_rows = raw_data.sample(n=7, random_state=66)
random_rows

Unnamed: 0,source,target
77578,"Salicylic acid has the formula C6H4(OH)COOH, where the OH group is ortho to the carboxyl group.",इसका अणुसूत्र C6H4(OH)COOH है जहाँ OH समूह कार्बोक्सिल समूह के आर्थो है।
187609,"Aware of the plan, the Meccan caravan eluded the Muslims.","योजना से अवगत, मक्का कारवां ने मुस्लिमों को छोड़ दिया।"
11759,"After the perceived poor performance in battles in Malaya and Burma in 1942, it was decided that the existing infantry divisions were over–mechanised.",1942 में मलाया और बर्मा में लड़ी जाने वाली लड़ाइयों में कथित खराब प्रदर्शन के बाद यह निर्णय लिया गया कि मौजूदा इन्फैन्ट्री डिवीजन जरूरत से ज्यादा मशीनीकृत थे।
74899,"A stable population, one that has had constant crude birth and death rates for such a long period of time that the percentage of people in every age class remains constant, or equivalently, the population pyramid has an unchanging structure.","स्थाई जनसंख्या, एक ऐसी जनसंख्या होती है जिसकी अशोधित जन्म और मृत्यु दर, इतने लंबे समय तक स्थिर बनी रहती है कि प्रत्येक उम्र वर्ग में लोगों का प्रतिशत स्थिर बना रहता है, या समतुल्य रूप से जनसंख्या पिरामिड की संरचना अपरिवर्तनीय होती है।"
203262,"Amra Ram (Hindi: अमरा राम, born 5 August 1955) is an Indian politician and farmer leader, who was the President of All India Kishan Sabha from July 2013 to October 2017.",अमरा राम (जन्म: 5 अगस्त 1955) एक भारतीय राजनतिज्ञ तथा किसान नेता हैं जो कि जुलाई 2013 से ऑल इण्डिया किसान महासभा के अध्यक्ष हैं।
52746,"Unfortunately, in part because of a backlash from promoters over the South Africa incident, other European shows were cancelled.","दुर्भाग्यवश, दक्षिण अफ्रीका की घटना से नाराज प्रोमोटरों की प्रतिक्रियास्वरूप, अन्य यूरोपीय शो रद्द कर दिये गये।"
98577,The break-up of Czechoslovakia resulted in the Czech Republic and Slovakia participating for the first time.,चेकोस्लोवाकिया का टूटना चेक गणराज्य और स्लोवाकिया में पहली बार भाग लिया।


In [14]:
raw_data.isnull().sum()

source    1
target    1
dtype: int64

In [20]:
# Read the files and write to CSV
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file, open(output_file_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["input", "target"])  # Write header
    
    for en_sentence, hi_sentence in zip(en_file, hi_file):
        # Strip the sentences and check if they are not empty
        en_sentence = en_sentence.strip()
        hi_sentence = hi_sentence.strip()
        
        # Skip writing to the CSV if either sentence is empty or None
        if en_sentence and hi_sentence:
            writer.writerow([en_sentence, hi_sentence])


In [21]:
# Load the dataset
dataset = load_dataset("csv", data_files={"train": "wikimatrix_en_hi.csv"})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 231459
    })
})

In [23]:
print(dataset["train"][10])
print(dataset["train"][11])
print(dataset["train"][12])

{'input': '"Has Microsoft Ever Read the History Books?".', 'target': '" क्या MICROSOFT ने कभी इतिहास की पुस्तकों को पढ़ा है?'}
{'input': 'Thanks-a-Lot Bear (voiced by Melissa Mable) – A care bear who is thankful for anything.', 'target': '(८) यात्री कर : मनु ने जलमार्ग पर, जबकि शुक्र ने थलमार्ग पर कर का उल्लेख किया है।'}
{'input': '"Launch Event Details – When did the Rovers Launch?".', 'target': 'मंगल ग्रह के मिशन की सूची "Launch Event Details – When did the Rovers Launch?'}


In [24]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

In [32]:
from transformers import pipeline

tokenizer = pipeline('text2text-generation', model="google-t5/t5-small")
inputs = tokenizer(sample["input"])
targets = tokenizer(sample["target"])


Device set to use cuda:0


In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Create a new tokenizer
tokenizer = Tokenizer(models.BPE())

# Pre-tokenization (splitting into words, characters, etc.)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on your dataset
trainer = trainers.BpeTrainer(vocab_size=50000, min_frequency=2, special_tokens=["<unk>", "<s>", "</s>", "<pad>"])
tokenizer.train_from_iterator(your_dataset_iterator, trainer)

# Save tokenizer
tokenizer.save("custom_tokenizer.json")


In [29]:
sample = dataset["train"][12]
input = tokenizer(sample["input"])
target = tokenizer(sample["target"])


print(tokenizer.convert_ids_to_tokens(input["input_ids"]))
print(tokenizer.convert_ids_to_tokens(target["input_ids"]))

['▁"', 'La', 'un', 'ch', '▁Event', '▁Details', '▁', '–', '▁When', '▁did', '▁the', '▁Rover', 's', '▁Launch', '?"', '.', '</s>']
['▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁"', 'La', 'un', 'ch', '▁Event', '▁Details', '▁', '–', '▁When', '▁did', '▁the', '▁Rover', 's', '▁Launch', '?', '</s>']


In [46]:
from transformers import AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
tokenizer = AutoTokenizer.from_pretrained(model)

sample = raw_datasets["train"][12]
inputs = tokenizer(sample["input"])
with tokenizer.as_target_tokenizer():
    targets = tokenizer(sample["target"])


print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

OSError: Incorrect path_or_model_id: 'T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1-23): 23 x T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (final_layer_norm): T5LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerCrossAttention(
            (EncDecAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1-23): 23 x T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerCrossAttention(
            (EncDecAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (final_layer_norm): T5LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_head): Linear(in_features=1024, out_features=32128, bias=False)
)'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [42]:
from transformers import AutoTokenizer

# Load the tokenizer
model_checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sample data from the dataset
sample = dataset["train"][12]  # Replace with an actual sample index
input_text = "translate Hindi to English: " + sample["Hindi"]
target_text = sample["English"]

# Tokenize inputs and targets
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(target_text, return_tensors="pt", padding=True, truncation=True)

# Convert token IDs back to tokens
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
target_tokens = tokenizer.convert_ids_to_tokens(targets["input_ids"][0])

# Print the results for inspection
print("Original Hindi Input:", input_text)
print("Input Tokens:", input_tokens)
print("Decoded Input Text:", tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
print("\nOriginal English Target:", target_text)
print("Target Tokens:", target_tokens)
print("Decoded Target Text:", tokenizer.decode(targets["input_ids"][0], skip_special_tokens=False))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Original Hindi Input: translate Hindi to English: मंगल ग्रह के मिशन की सूची "Launch Event Details – When did the Rovers Launch?
Input Tokens: ['▁translate', '▁Hindi', '▁to', '▁English', ':', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁"', 'La', 'un', 'ch', '▁Event', '▁Details', '▁', '–', '▁When', '▁did', '▁the', '▁Rover', 's', '▁Launch', '?', '</s>']
Decoded Input Text: translate Hindi to English: <unk> <unk> <unk> <unk> <unk> <unk> "Launch Event Details – When did the Rovers Launch?</s>

Original English Target: "Launch Event Details – When did the Rovers Launch?".
Target Tokens: ['▁"', 'La', 'un', 'ch', '▁Event', '▁Details', '▁', '–', '▁When', '▁did', '▁the', '▁Rover', 's', '▁Launch', '?"', '.', '</s>']
Decoded Target Text: "Launch Event Details – When did the Rovers Launch?".</s>


In [43]:
print(tokenizer.special_tokens_map)  # Check special tokens (e.g., <pad>, <unk>, etc.)
print(tokenizer.vocab_size)  # Check tokenizer vocabulary size


{'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53

In [34]:
# Read the files and write to CSV
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file, open(output_file_path, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["English", "Hindi"])  # Write header
    
    for en_sentence, hi_sentence in zip(en_file, hi_file):
        # Strip the sentences and check if they are not empty
        en_sentence = en_sentence.strip()
        hi_sentence = hi_sentence.strip()
        
        # Skip writing to the CSV if either sentence is empty or None
        if en_sentence and hi_sentence:
            writer.writerow([en_sentence, hi_sentence])


In [37]:
# Load the dataset
dataset = load_dataset("csv", data_files={"train": "wikimatrix_en_hi.csv"})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 231459
    })
})

In [38]:
print(dataset["train"][10])
print(dataset["train"][11])
print(dataset["train"][12])

{'English': '"Has Microsoft Ever Read the History Books?".', 'Hindi': '" क्या MICROSOFT ने कभी इतिहास की पुस्तकों को पढ़ा है?'}
{'English': 'Thanks-a-Lot Bear (voiced by Melissa Mable) – A care bear who is thankful for anything.', 'Hindi': '(८) यात्री कर : मनु ने जलमार्ग पर, जबकि शुक्र ने थलमार्ग पर कर का उल्लेख किया है।'}
{'English': '"Launch Event Details – When did the Rovers Launch?".', 'Hindi': 'मंगल ग्रह के मिशन की सूची "Launch Event Details – When did the Rovers Launch?'}


In [48]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load your custom dataset (CSV file with "Hindi" and "English" columns)

# Pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

# Tokenization function
def preprocess_function(examples):
    inputs = ["translate Hindi to English: " + ex for ex in examples["Hindi"]]
    targets = examples["English"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [41]:
# Preprocess the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Inspect the first example
for idx in range(3):  # Print first 3 examples as a sample
    print(f"Original Hindi: {dataset['train'][idx]['Hindi']}")
    print(f"Original English: {dataset['train'][idx]['English']}")

    # Tokenized inputs
    tokenized_input = tokenized_datasets["train"][idx]["input_ids"]
    print(f"Tokenized Input IDs: {tokenized_input}")
    print(f"Decoded Input: {tokenizer.decode(tokenized_input, skip_special_tokens=False)}")

    # Tokenized outputs
    tokenized_label = tokenized_datasets["train"][idx]["labels"]
    print(f"Tokenized Label IDs: {tokenized_label}")
    print(f"Decoded Label: {tokenizer.decode(tokenized_label, skip_special_tokens=False)}")

    print("=" * 50)


Map:   0%|          | 0/231459 [00:00<?, ? examples/s]

Original Hindi: अपने परवरदिगार का नाम ले कर पढ़ो, जिसने (दुनिया को) पैदा ‎किया।
Original English: Recite in the name of your Lord who created—Created man from a clinging substance.
Tokenized Input IDs: [13959, 25763, 12, 1566, 10, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 3, 2, 41, 2, 3, 2, 61, 3, 2, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded Input: translate Hindi to English: <unk> <unk> <unk> <unk> <unk> <unk> <unk>, <unk> (<unk> <unk>) <unk> <unk></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [56]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    model_inputs = tokenizer(examples["inputs"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["targets"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function, batched=True, remove_columns=["inputs", "targets"]
)

ValueError: Column to remove ['targets', 'inputs'] not in the dataset. Current columns in the dataset: ['English', 'Hindi']

In [49]:
from transformers import DataCollatorForSeq2Seq

# Data collator ensures padding in batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_strategy="epoch",
)


In [52]:
from datasets import DatasetDict

# Split the "train" set into train and validation sets
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)  # 10% for validation
tokenized_datasets = DatasetDict({
    "train": split_datasets["train"],
    "validation": split_datasets["test"]
})

# Verify the new structure
print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 208313
    })
    validation: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 23146
    })
})


In [53]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [51]:
from datasets import DatasetDict

# Split the dataset into train and validation
train_valid_split = tokenized_datasets["train"].train_test_split(test_size=0.1)  # Adjust the test_size as needed

# Now the dataset will contain both train and validation
train_valid_split

# Update trainer to use the validation split
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_valid_split["train"],
    eval_dataset=train_valid_split["test"],  # Use the validation split here
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()


  trainer = Seq2SeqTrainer(


  0%|          | 0/39060 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.05 GiB is allocated by PyTorch, and 395.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [54]:
# Load the T5 model
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()


  trainer = Seq2SeqTrainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 126.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.05 GiB is allocated by PyTorch, and 395.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_t5_hindi_english")
tokenizer.save_pretrained("./fine_tuned_t5_hindi_english")

# Evaluate the model
results = trainer.evaluate()
print(results)


In [None]:
from transformers import pipeline

# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_t5_hindi_english")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_t5_hindi_english")

# Create a translation pipeline
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# Translate a Hindi sentence
result = translator("नमस्ते दुनिया")
print(result)


In [None]:
import datasets
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

pipe = pipeline("translation", model="google-t5/t5-small", device=0)
dataset = datasets.load_dataset("superb", name="asr", split="test")

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
    print(out)
    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
    # {"text": ....}
    # ....

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# Suppose we have the following 2 training examples:
input_sequence_1 = "Welcome to NYC"
output_sequence_1 = "Bienvenue à NYC"

input_sequence_2 = "HuggingFace is a company"
output_sequence_2 = "HuggingFace est une entreprise"

# encode the inputs
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    [output_sequence_1, output_sequence_2],
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from transformers import AutoTokenizer


# Tokenize the dataset
def preprocess_function(examples):
    inputs = [example["source"] for example in examples]
    targets = [example["target"] for example in examples]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=512, truncation=True)
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
print(dataset)  # Check the dataset structure
print(dataset["train"][0])  # Check the first example in the dataset

In [None]:


# Read the files
with open(en_file_path, "r", encoding="utf-8") as en_file, open(hi_file_path, "r", encoding="utf-8") as hi_file:
    en_sentences = en_file.readlines()
    hi_sentences = hi_file.readlines()

# Combine into a list of dictionaries (source: English, target: Hindi)
dataset = [{"source": en.strip(), "target": hi.strip()} for en, hi in zip(en_sentences, hi_sentences)]

# Verify the dataset structure
print(dataset[0])  # Should print something like {'source': '...', 'target': '...'}

In [None]:
from datasets import Dataset

# Convert the list of dictionaries into a Hugging Face Dataset
hf_dataset = Dataset.from_list(dataset)

# Verify the dataset structure
print(hf_dataset)
print(hf_dataset[0])  # Should print something like {'source': '...', 'target': '...'}

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer

# Tokenize the dataset
def preprocess_function(examples):
    inputs = [example["source"] for example in examples]
    targets = [example["target"] for example in examples]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=512, truncation=True)
    return model_inputs

tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)

# Verify the tokenized dataset
print(tokenized_dataset)
print(tokenized_dataset[0])  # Should print tokenized inputs and targets