In [3]:
!pip install transformers datasets
!pip install accelerate -u
!pip install transformers[torch]

import pandas as pd
from datasets import Dataset

# Load MLQA dataset
mlqa_df = pd.read_csv('/content/mlqa_hindi.csv')  # Replace with your actual path
mlqa_dataset = Dataset.from_pandas(mlqa_df)

# Load XQuAD dataset
xquad_df = pd.read_csv('/content/xquad.csv')  # Replace with your actual path
xquad_dataset = Dataset.from_pandas(xquad_df)

# Verify the structure of the datasets
print(mlqa_dataset)
print(xquad_dataset)



Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u
Dataset({
    features: ['context', 'question', 'answer_text', 'answer_start', 'language'],
    num_rows: 5425
})
Dataset({
    features: ['context', 'question', 'answer_text', 'answer_start', 'language'],
    num_rows: 1190
})


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess the datasets
def preprocess_function(examples):
    inputs = [f"question: {q} context: {c}" for q, c in zip(examples["question"], examples["context"])]
    targets = [a for a in examples["answer_text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

mlqa_encoded = mlqa_dataset.map(preprocess_function, batched=True)
xquad_encoded = xquad_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mlqa_encoded,
    eval_dataset=xquad_encoded,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./results_xquad")
tokenizer.save_pretrained("./results_xquad")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/5425 [00:00<?, ? examples/s]



Map:   0%|          | 0/1190 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
100,0.1245,0.097567
200,0.072,0.042551
300,0.04,0.032826
400,0.0302,0.032538
500,0.0384,0.032578
600,0.0409,0.031618
700,0.0289,0.032403
800,0.0395,0.031005
900,0.0279,0.03225
1000,0.0308,0.030415


('./results_xquad/tokenizer_config.json',
 './results_xquad/special_tokens_map.json',
 './results_xquad/spiece.model',
 './results_xquad/added_tokens.json')

In [21]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./results_xquad")
tokenizer = T5Tokenizer.from_pretrained("./results_xquad")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def generate_answer(context, question):
    # Prepare the input text
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)

    # Generate the answer
    outputs = model.generate(
        inputs["input_ids"],
        max_length=50,
        num_beams=5,
        length_penalty=2.0,
        early_stopping=True
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer



# Add more test cases if needed
test_cases = [
    {
        "context": "‡§™‡•à‡§Ç‡§•‡§∞‡•ç‡§∏ ‡§ï‡•Ä ‡§°‡§ø‡•û‡•á‡§®‡•ç‡§∏ ‡§®‡•á ‡§≤‡•Ä‡§ó ‡§Æ‡•á‡§Ç ‡§ï‡•á‡§µ‡§≤ 308 ‡§Ö‡§Ç‡§ï ‡§¶‡§ø‡§è ‡§î‡§∞ ‡§õ‡§†‡•á ‡§∏‡•ç‡§•‡§æ‡§® ‡§™‡§∞ ‡§∞‡§π‡•á ‡§ú‡§¨‡§ï‡§ø 24 ‡§á‡§®‡•ç‡§ü‡§∞‡§∏‡•á‡§™‡•ç‡§∂‡§® ‡§î‡§∞ ‡§ö‡§æ‡§∞ ‡§™‡•ç‡§∞‡•ã ‡§¨‡§æ‡§â‡§≤‡§∞‡•ã‡§Ç ‡§ï‡•á ‡§∏‡§æ‡§•‡•§",
        "question": "‡§™‡•à‡§Ç‡§•‡§∞‡•ç‡§∏ ‡§°‡§ø‡•û‡•á‡§Ç‡§∏ ‡§®‡•á ‡§ï‡§ø‡§§‡§®‡•á ‡§Ö‡§Ç‡§ï ‡§¶‡§ø‡§è?"
    },
    {
        "context":"‡§á‡§Ç‡§ó‡•ç‡§≤‡•à‡§Ç‡§° ‡§î‡§∞ ‡§ú‡§∞‡•ç‡§Æ‡§®‡•Ä ‡§Æ‡•á‡§Ç ‡§ï‡§æ‡§®‡•Ç‡§® ‡§î‡§∞ ‡§¶‡§∞‡•ç‡§∂‡§® ‡§ï‡§æ ‡§Ö‡§ß‡•ç‡§Ø‡§Ø‡§® ‡§ï‡§∞‡§§‡•á ‡§π‡•Å‡§è, ‡§á‡§ï‡§¨‡§æ‡§≤ ‡§ë‡§≤ ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§≤‡•Ä‡§ó ‡§ï‡•Ä ‡§≤‡§Ç‡§¶‡§® ‡§∂‡§æ‡§ñ‡§æ ‡§ï‡§æ ‡§∏‡§¶‡§∏‡•ç‡§Ø ‡§¨‡§® ‡§ó‡§Ø‡§æ‡•§ ‡§µ‡§π 1908 ‡§Æ‡•á‡§Ç ‡§≤‡§æ‡§π‡•å‡§∞ ‡§µ‡§æ‡§™‡§∏ ‡§Ü ‡§ó‡§Ø‡§æ‡•§ ‡§ï‡§æ‡§®‡•Ç‡§®‡•Ä ‡§Ö‡§≠‡•ç‡§Ø‡§æ‡§∏ ‡§î‡§∞ ‡§¶‡§æ‡§∞‡•ç‡§∂‡§®‡§ø‡§ï ‡§ï‡§µ‡§ø‡§§‡§æ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§Ö‡§™‡§®‡•á ‡§∏‡§Æ‡§Ø ‡§ï‡•ã ‡§µ‡§ø‡§≠‡§æ‡§ú‡§ø‡§§ ‡§ï‡§∞‡§§‡•á ‡§π‡•Å‡§è, ‡§á‡§ï‡§¨‡§æ‡§≤ ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§≤‡•Ä‡§ó ‡§Æ‡•á‡§Ç ‡§∏‡§ï‡•ç‡§∞‡§ø‡§Ø ‡§∞‡§π‡•á‡•§ ",
        "question": "‡§á‡§ï‡§¨‡§æ‡§≤ ‡§ï‡§ø‡§∏ ‡§µ‡§∞‡•ç‡§∑ ‡§≤‡§æ‡§π‡•å‡§∞ ‡§≤‡•å‡§ü‡•á?"
    },
    {
        "context": "‡§¨‡•ç‡§∞‡•ã‡§Ç‡§ï‡•ã‡§∏ ‡§®‡•á ‡§ñ‡•á‡§≤ ‡§ï‡•á ‡§Ö‡§Ç‡§§‡§ø‡§Æ ‡§§‡•Ä‡§® ‡§Æ‡§ø‡§®‡§ü ‡§Æ‡•á‡§Ç 11 ‡§Ö‡§Ç‡§ï ‡§¨‡§®‡§æ‡§ï‡§∞ ‡§°‡§ø‡§µ‡•Ä‡§ú‡§®‡§≤ ‡§∞‡§æ‡§â‡§Ç‡§° ‡§Æ‡•á‡§Ç ‡§™‡§ø‡§ü‡•ç‡§∏‡§¨‡§∞‡•ç‡§ó ‡§∏‡•ç‡§ü‡•Ä‡§≤‡§∞‡•ç‡§∏ ‡§ï‡•ã 23‚Äì16 ‡§∏‡•á ‡§π‡§∞‡§æ‡§Ø‡§æ‡•§ ‡§´‡§ø‡§∞ ‡§â‡§®‡•ç‡§π‡•ã‡§Ç‡§®‡•á ‡§è‡§è‡§´‡§∏‡•Ä ‡§ö‡•à‡§Æ‡•ç‡§™‡§ø‡§Ø‡§®‡§∂‡§ø‡§™ ‡§ó‡•á‡§Æ ‡§Æ‡•á‡§Ç ‡§®‡•ç‡§Ø‡•Ç ‡§á‡§Ç‡§ó‡•ç‡§≤‡•à‡§Ç‡§° ‡§ï‡•á 2-‡§Ö‡§Ç‡§ï ‡§ï‡•á ‡§∞‡•Ç‡§™‡§æ‡§Ç‡§§‡§∞‡§£ ‡§ï‡•á ‡§™‡•ç‡§∞‡§Ø‡§æ‡§∏ ‡§ï‡•á ‡§è‡§ï ‡§™‡§æ‡§∏ ‡§ï‡•ã ‡§á‡§Ç‡§ü‡§∞‡§∏‡•á‡§™‡•ç‡§ü ‡§ï‡§∞‡§ï‡•á ‡§Æ‡•å‡§ú‡•Ç‡§¶‡§æ ‡§∏‡•Å‡§™‡§∞ ‡§¨‡§æ‡§â‡§≤ XLIX ‡§ö‡•à‡§Ç‡§™‡§ø‡§Ø‡§® ‡§®‡•ç‡§Ø‡•Ç ‡§á‡§Ç‡§ó‡•ç‡§≤‡•à‡§Ç‡§° ‡§™‡•à‡§ü‡•ç‡§∞‡§ø‡§ì‡§ü‡•ç‡§∏ ‡§ï‡•ã 17 ‡§∏‡•á‡§ï‡§Ç‡§° ‡§∞‡§π‡§§‡•á ‡§π‡•Å‡§è 20‚Äì18 ‡§∏‡•á ‡§π‡§∞‡§æ‡§Ø‡§æ‡•§ ‡§∏‡•Ä‡§ú‡§º‡§® ‡§ï‡•á ‡§¶‡•å‡§∞‡§æ‡§® ‡§Æ‡•à‡§®‡§ø‡§Ç‡§ó ‡§ï‡•Ä ‡§á‡§Ç‡§ü‡§∞‡•ç‡§∏‡•á‡§™‡•ç‡§∂‡§® ‡§ï‡•Ä ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ‡§ì‡§Ç ‡§ï‡•á ‡§¨‡§æ‡§µ‡§ú‡•Ç‡§¶, ‡§µ‡•á ‡§Ö‡§™‡§®‡•á ‡§¶‡•ã ‡§™‡•ç‡§≤‡•á‡§ë‡§´‡§º ‡§ó‡•á‡§Æ ‡§Æ‡•á‡§Ç ‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ó‡•á‡§Æ ‡§®‡§π‡•Ä‡§Ç ‡§π‡§æ‡§∞‡•á‡•§",
        "question": "‡§¨‡•ç‡§∞‡•ã‡§Ç‡§ï‡•ã‡§∏ ‡§î‡§∞ ‡§∏‡•ç‡§ü‡•Ä‡§≤‡§∞‡•ç‡§∏ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§ñ‡•á‡§≤ ‡§ï‡§æ ‡§Ö‡§Ç‡§§‡§ø‡§Æ ‡§∏‡•ç‡§ï‡•ã‡§∞ ‡§ï‡•ç‡§Ø‡§æ ‡§•‡§æ?"
    },
]

for case in test_cases:
    context = case["context"]
    question = case["question"]
    answer = generate_answer(context, question)
    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Context: ‡§™‡•à‡§Ç‡§•‡§∞‡•ç‡§∏ ‡§ï‡•Ä ‡§°‡§ø‡•û‡•á‡§®‡•ç‡§∏ ‡§®‡•á ‡§≤‡•Ä‡§ó ‡§Æ‡•á‡§Ç ‡§ï‡•á‡§µ‡§≤ 308 ‡§Ö‡§Ç‡§ï ‡§¶‡§ø‡§è ‡§î‡§∞ ‡§õ‡§†‡•á ‡§∏‡•ç‡§•‡§æ‡§® ‡§™‡§∞ ‡§∞‡§π‡•á ‡§ú‡§¨‡§ï‡§ø 24 ‡§á‡§®‡•ç‡§ü‡§∞‡§∏‡•á‡§™‡•ç‡§∂‡§® ‡§î‡§∞ ‡§ö‡§æ‡§∞ ‡§™‡•ç‡§∞‡•ã ‡§¨‡§æ‡§â‡§≤‡§∞‡•ã‡§Ç ‡§ï‡•á ‡§∏‡§æ‡§•‡•§
Question: ‡§™‡•à‡§Ç‡§•‡§∞‡•ç‡§∏ ‡§°‡§ø‡•û‡•á‡§Ç‡§∏ ‡§®‡•á ‡§ï‡§ø‡§§‡§®‡•á ‡§Ö‡§Ç‡§ï ‡§¶‡§ø‡§è?
Answer: 308

Context: ‡§á‡§Ç‡§ó‡•ç‡§≤‡•à‡§Ç‡§° ‡§î‡§∞ ‡§ú‡§∞‡•ç‡§Æ‡§®‡•Ä ‡§Æ‡•á‡§Ç ‡§ï‡§æ‡§®‡•Ç‡§® ‡§î‡§∞ ‡§¶‡§∞‡•ç‡§∂‡§® ‡§ï‡§æ ‡§Ö‡§ß‡•ç‡§Ø‡§Ø‡§® ‡§ï‡§∞‡§§‡•á ‡§π‡•Å‡§è, ‡§á‡§ï‡§¨‡§æ‡§≤ ‡§ë‡§≤ ‡§á‡§Ç‡§°‡§ø‡§Ø‡§æ ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§≤‡•Ä‡§ó ‡§ï‡•Ä ‡§≤‡§Ç‡§¶‡§® ‡§∂‡§æ‡§ñ‡§æ ‡§ï‡§æ ‡§∏‡§¶‡§∏‡•ç‡§Ø ‡§¨‡§® ‡§ó‡§Ø‡§æ‡•§ ‡§µ‡§π 1908 ‡§Æ‡•á‡§Ç ‡§≤‡§æ‡§π‡•å‡§∞ ‡§µ‡§æ‡§™‡§∏ ‡§Ü ‡§ó‡§Ø‡§æ‡•§ ‡§ï‡§æ‡§®‡•Ç‡§®‡•Ä ‡§Ö‡§≠‡•ç‡§Ø‡§æ‡§∏ ‡§î‡§∞ ‡§¶‡§æ‡§∞‡•ç‡§∂‡§®‡§ø‡§ï ‡§ï‡§µ‡§ø‡§§‡§æ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§Ö‡§™‡§®‡•á ‡§∏‡§Æ‡§Ø ‡§ï‡•ã ‡§µ‡§ø‡§≠‡§æ‡§ú‡§ø‡§§ ‡§ï‡§∞‡§§‡•á ‡§π‡•Å‡§è, ‡§á‡§ï‡§¨‡§æ‡§≤ ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§