In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch




In [2]:
# Load the cleaned train/test datasets.
# DATA_DIR is the only machine-specific value in this cell: point it at the
# folder that holds the cleaned CSV files when running on another machine.
DATA_DIR = 'C:/Users/Adnan'
train_df = pd.read_csv(f'{DATA_DIR}/cleaned_train_dataset.csv')
test_df = pd.read_csv(f'{DATA_DIR}/cleaned_test_dataset.csv')

In [3]:
# Prepare the dataset for T5
def _with_t5_columns(df):
    """Return a new frame carrying the ``input_text``/``target_text`` columns.

    Rows whose ``processed_question`` or ``processed_answer`` is missing are
    dropped first. Otherwise the NaN survives the string concatenation and is
    later turned into the literal text "nan" by the ``str(...)`` conversion
    done at tokenization time, i.e. it would be trained on as real data.

    The input frame is not modified, so re-running the cell is idempotent.
    """
    usable = df.dropna(subset=['processed_question', 'processed_answer'])
    # Reset to a plain 0..n-1 index so later row-wise splitting stays simple.
    usable = usable.reset_index(drop=True)
    return usable.assign(
        input_text='question: ' + usable['processed_question'],
        target_text=usable['processed_answer'],
    )


train_df = _with_t5_columns(train_df)
test_df = _with_t5_columns(test_df)

In [4]:
# Split the dataset into smaller subsets
num_splits = 4  # Define the number of splits


def _split_frame(df, n_parts):
    """Cut ``df`` row-wise into ``n_parts`` consecutive, near-equal chunks.

    Chunk sizes follow the same rule as ``np.array_split``: the first
    ``len(df) % n_parts`` chunks receive one extra row. Plain ``iloc`` slicing
    is used instead of ``np.array_split(df, ...)`` because the latter relies
    on ``DataFrame.swapaxes``, which newer pandas releases deprecate.

    Raises:
        ValueError: if ``n_parts`` is not a positive integer.
    """
    if n_parts <= 0:
        raise ValueError('number of sections must be larger than 0.')
    base, extra = divmod(len(df), n_parts)
    chunks, start = [], 0
    for part in range(n_parts):
        stop = start + base + (1 if part < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop
    return chunks


train_subsets = _split_frame(train_df, num_splits)
test_subsets = _split_frame(test_df, num_splits)


In [5]:
# Build the tokenizer shared by every cell below from the named checkpoint.
TOKENIZER_CHECKPOINT = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Define a function to tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: mapping with ``'input_text'`` and ``'target_text'`` entries,
            each an iterable of values (one per example). Values are passed
            through ``str`` before tokenization.

    Returns:
        The tokenizer output for the inputs with an extra ``'labels'`` entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100 so the loss ignores them; without this the model is
        trained to predict pad tokens for most of each 512-long target.
    """
    input_texts = [str(text) for text in examples['input_text']]
    target_texts = [str(text) for text in examples['target_text']]

    model_inputs = tokenizer(input_texts, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(target_texts, max_length=512, truncation=True, padding='max_length')

    # -100 is the label value the loss skips; mask every padded target slot.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Fine-tune one independent t5-small model per (train, test) subset pair and
# keep each trained model in `models` (index i matches the subset index).
models = []

for i, (train_subset, test_subset) in enumerate(zip(train_subsets, test_subsets)):
    # Wrap only the two text columns as Hugging Face datasets, then tokenize.
    train_dataset = Dataset.from_pandas(train_subset[['input_text', 'target_text']])
    test_dataset = Dataset.from_pandas(test_subset[['input_text', 'target_text']])

    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    test_tokenized = test_dataset.map(tokenize_function, batched=True)

    # Start every subset from the same pretrained checkpoint.
    model = T5ForConditionalGeneration.from_pretrained('t5-small')

    training_args = TrainingArguments(
        output_dir=f'./results_{i}',
        num_train_epochs=1,  # Reduced epochs
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs_{i}',
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # Enable fp16 only if GPU is available
        save_steps=1000,
        save_total_limit=2,
        # Load batches in the main process. Worker processes started from a
        # notebook kernel are a common cause of training that never reaches
        # its first step (notably on Windows), and the data is already
        # tokenized, so extra workers would bring little benefit here.
        dataloader_num_workers=0
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Results for model {i}: {eval_results}")

    # Persist model and tokenizer side by side so the folder is self-contained.
    model.save_pretrained(f'./t5-chatbot-model_{i}')
    tokenizer.save_pretrained(f'./t5-chatbot-model_{i}')
    models.append(model)

Map:   0%|          | 0/8461 [00:00<?, ? examples/s]

Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

Step,Training Loss


In [None]:
def generate_response(question, model, tokenizer, **generate_kwargs):
    """Generate an answer string for ``question`` with a fine-tuned model.

    Args:
        question: the question text; it is prefixed with ``'question: '`` to
            match the format used for the training inputs.
        model: object exposing ``device`` and ``generate(input_ids, ...)``.
        tokenizer: object exposing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        **generate_kwargs: optional extra arguments forwarded unchanged to
            ``model.generate`` (for example a longer output length limit).
            With none given, ``generate`` is called exactly as before.

    Returns:
        The decoded text of the first generated sequence, without special
        tokens.
    """
    input_text = 'question: ' + question
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # The tokenizer returns CPU tensors, while a model that has just been
    # trained may live on a GPU; put the inputs on the model's device so
    # generation does not fail with a device mismatch.
    input_ids = input_ids.to(model.device)
    output = model.generate(input_ids, **generate_kwargs)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Demo: answer one sample question using the model trained on subset 0.
best_model = models[0]
best_tokenizer = tokenizer
print(
    generate_response(
        "What is the capital of France?",
        best_model,
        best_tokenizer,
    )
)