In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments




In [2]:
# Load the cleaned train/test CSVs.
# The directory defaults to the original location but can be redirected by
# setting the CHATBOT_DATA_DIR environment variable, so the notebook is not
# tied to one machine's absolute path.
DATA_DIR = Path(os.environ.get('CHATBOT_DATA_DIR', 'C:/Users/Adnan'))
train_df = pd.read_csv(DATA_DIR / 'cleaned_train_dataset.csv')
test_df = pd.read_csv(DATA_DIR / 'cleaned_test_dataset.csv')


In [3]:
def build_input_text(df):
    """Return one 'question: ... answer: ...' string per row of ``df``.

    Reads the ``processed_question`` and ``processed_answer`` columns.
    Missing values are replaced by an empty string and every value is cast
    to ``str`` before concatenation, so a NaN or numeric cell cannot turn the
    whole row into NaN (or raise) when the pieces are joined.  No rows are
    dropped, so the result has the same length and index as ``df``.
    """
    question = df['processed_question'].fillna('').astype(str)
    answer = df['processed_answer'].fillna('').astype(str)
    return 'question: ' + question + ' answer: ' + answer


# Same prompt format for both splits, built by the same helper.
train_df['input_text'] = build_input_text(train_df)
test_df['input_text'] = build_input_text(test_df)

In [4]:
def split_frame(df, n_parts):
    """Split ``df`` row-wise into ``n_parts`` consecutive DataFrames.

    Chunk sizes follow the ``np.array_split`` rule: the first
    ``len(df) % n_parts`` chunks get one extra row.  Slicing with ``iloc``
    keeps each piece a DataFrame without routing through numpy's
    ``swapaxes`` call, which pandas has deprecated for DataFrames.

    Raises:
        ValueError: if ``n_parts`` is not positive.
    """
    if n_parts <= 0:
        raise ValueError('number sections must be larger than 0.')
    base, extra = divmod(len(df), n_parts)
    sizes = [base + 1] * extra + [base] * (n_parts - extra)

    chunks = []
    start = 0
    for size in sizes:
        chunks.append(df.iloc[start:start + size])
        start += size
    return chunks


# Split the dataset into smaller subsets
num_splits = 10  # Define the number of splits
train_subsets = split_frame(train_df[['input_text']], num_splits)
test_subsets = split_frame(test_df[['input_text']], num_splits)

In [5]:
# Load the GPT-2 tokenizer and reuse its end-of-sequence token for padding,
# so that padded batches can be produced later on.
BASE_CHECKPOINT = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_func(temp):
    """Tokenize a batch of examples and attach causal-LM labels.

    Args:
        temp: A batch mapping whose ``'input_text'`` entry is an iterable of
            texts (each item is passed through ``str`` before tokenizing).

    Returns:
        The tokenizer output, padded/truncated to 512 tokens per example,
        with an added ``'labels'`` entry.  Labels copy ``input_ids`` on real
        tokens and are ``-100`` on padding positions.
    """
    input_texts = [str(text) for text in temp['input_text']]
    model_inputs = tokenizer(input_texts, max_length=512, truncation=True, padding='max_length')

    # Without labels the LM head model returns no loss and Trainer cannot
    # train.  The pad token is the EOS token here, so padding is identified
    # through the attention mask rather than by token id; -100 is the index
    # the loss ignores.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask'])
    ]
    return model_inputs


In [7]:
# Fine-tune one GPT-2 model per (train, test) subset pair.
#
# The complete loop body lives in this single cell.  Previously only the
# tokenization lines were inside the loop and the training code sat in a
# separate, orphaned indented cell, so it could at most see the variables
# leaked by the final iteration.
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
MAX_WARMUP_STEPS = 500

models = []

for i, (train_subset, test_subset) in enumerate(zip(train_subsets, test_subsets)):
    train_dataset = Dataset.from_pandas(train_subset)
    test_dataset = Dataset.from_pandas(test_subset)

    train_tokenized = train_dataset.map(tokenize_func, batched=True)
    test_tokenized = test_dataset.map(tokenize_func, batched=True)

    # Each iteration starts again from the pretrained checkpoint, so the
    # resulting models are independent of one another.
    # NOTE(review): if the intent was to keep training ONE model across all
    # subsets, load the model once before the loop instead - confirm.
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # A fixed 500-step warm-up is longer than the whole single-epoch run when
    # a subset has fewer than 500 * batch-size rows (the recorded run had
    # roughly 3.4k rows per subset), leaving the learning rate still ramping
    # up when training stops.  Cap warm-up at about 10% of the epoch.
    steps_per_epoch = max(1, len(train_tokenized) // TRAIN_BATCH_SIZE)
    warmup_steps = min(MAX_WARMUP_STEPS, steps_per_epoch // 10)

    training_args = TrainingArguments(
        output_dir=f'./results_{i}',
        num_train_epochs=1,  # Reduced epochs for quicker training
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir=f'./logs_{i}',
        logging_steps=20,
        save_steps=1000,
        save_total_limit=2,
        dataloader_num_workers=4,  # Use multiple workers for data loading
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Results for model {i}: {eval_results}")

    save_dir = f'./gpt2-chatbot-model_{i}'
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Park the finished model on the CPU so the trained models kept in
    # `models` do not accumulate in accelerator memory across iterations.
    models.append(model.to('cpu'))

In [None]:
def generate_response(question, model, tokenizer):
    """Generate the model's answer to ``question``.

    Args:
        question: The question text; it is wrapped in the
            ``'question: ... answer:'`` prompt format.
        model: A causal LM exposing ``generate`` and ``device``.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        The decoded continuation produced after the prompt, stripped of
        surrounding whitespace.
    """
    prompt = 'question: ' + question + ' answer:'
    encoded = tokenizer(prompt, return_tensors='pt')

    # Inputs must sit on the same device as the model; after training the
    # model may be on an accelerator while fresh tensors are created on CPU.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=512,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens.  Decode only what
    # comes after them instead of splitting the text on 'answer:', which
    # would discard part of the reply whenever the reply itself contains
    # that marker.
    prompt_length = input_ids.shape[-1]
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [None]:
# Smoke-test generation with the first model in `models`; no comparison of
# evaluation results is made when picking it.
best_model, best_tokenizer = models[0], tokenizer
sample_answer = generate_response("What is the capital of France?", best_model, best_tokenizer)
print(sample_answer)