# In [None]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Step 1: Load CSV File
csv_file_path = "/content/Conversation.csv"  # Replace with your actual file path
df = pd.read_csv(csv_file_path)

# Preview the first rows so the CSV structure can be inspected by eye
print(df.head())

# Actually verify that the 'question' and 'answer' columns exist, and fail
# with a clear message instead of a bare KeyError further down.
required_columns = ["question", "answer"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{csv_file_path} is missing required column(s): {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )

# Step 2: Prepare the Dataset
# Empty CSV cells are loaded as NaN (a float), which would break the string
# concatenation done during tokenization. Drop incomplete rows and make sure
# every remaining value is a plain string. `df` itself is left untouched.
clean_df = df.dropna(subset=required_columns)
questions = clean_df['question'].astype(str).tolist()
answers = clean_df['answer'].astype(str).tolist()

# Row-oriented view of the training pairs (one dict per example)
training_data = [{"question": q, "answer": a} for q, a in zip(questions, answers)]

# Convert to Hugging Face Dataset format (column-oriented)
dataset = Dataset.from_dict({"question": questions, "answer": answers})

# Step 3: Initialize GPT-2 Tokenizer and Model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a padding token, so reuse the EOS token for padding.
# This adds no new token to the vocabulary; the resize call below just keeps
# the embedding matrix in sync with the tokenizer length.
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Step 4: Tokenize the Dataset (concatenate question and answer)
def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each example is rendered as ``"<question> <eos> <answer> <eos>"``, then
    padded or truncated to 128 tokens.

    Args:
        examples: A batch mapping with ``'question'`` and ``'answer'`` keys,
            each holding a list of strings (the shape ``Dataset.map`` passes
            when ``batched=True``).

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) with an
        added ``labels`` entry. Without ``labels`` the model returns no loss
        and ``Trainer.train()`` cannot run.
    """
    eos = tokenizer.eos_token

    # The EOS between question and answer acts as the separator; the trailing
    # EOS lets the model learn where an answer ends.
    prompts = [
        q + " " + eos + " " + a + " " + eos
        for q, a in zip(examples['question'], examples['answer'])
    ]

    # Tokenize the list of concatenated strings
    encodings = tokenizer(prompts, padding="max_length", truncation=True, max_length=128)

    # Labels are the input ids themselves (the model shifts them internally).
    # Padding positions are set to -100 so the loss ignores them. The
    # attention mask is used to find padding because the pad token shares its
    # id with EOS, so comparing against pad_token_id would also mask real EOS.
    encodings["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
    ]
    return encodings

# Apply tokenization to the dataset (batched: tokenize_function receives
# lists of questions/answers rather than single rows)
train_dataset = dataset.map(tokenize_function, batched=True)

# Step 5: Fine-Tune the Model
# Set training arguments for fine-tuning
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,              # keep only the two most recent checkpoints
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Step 6: Generate Response with the Fine-Tuned Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training leaves the model in train mode, and generate() does not switch it
# back. Put it in eval mode so dropout is disabled during inference.
model.eval()

def generate_response(input_text, max_new_tokens=100, do_sample=True):
    """Generate an answer to ``input_text`` with the fine-tuned model.

    The prompt is built as ``"<question> <eos>"``, i.e. the same
    question/separator prefix used when the training examples were
    tokenized, so the model continues with the answer part.

    Args:
        input_text: The user's question.
        max_new_tokens: Upper bound on the number of generated tokens
            (the prompt length is not counted).
        do_sample: If True, sample with ``top_k=50``; if False, decode
            greedily (deterministic).

    Returns:
        The generated answer as a string, without the echoed prompt and
        without special tokens.
    """
    prompt = input_text + " " + tokenizer.eos_token

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs because pad and EOS share the same id.
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    prompt_length = encoded["input_ids"].shape[1]

    # top_k only takes effect when sampling is enabled.
    if do_sample:
        decoding_kwargs = {"do_sample": True, "top_k": 50}
    else:
        decoding_kwargs = {"do_sample": False}

    with torch.no_grad():
        output = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,
            **decoding_kwargs,
        )

    # Decode only the newly generated tokens so the question is not repeated
    # in the response.
    generated_ids = output[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return response.strip()

# Smoke-test the fine-tuned chatbot with a single sample question and show
# the question next to the model's reply.
question = "What is AI?"
response = generate_response(question)
print(
    f"Question: {question}",
    f"Response: {response}",
    sep="\n",
)
