In [26]:
import re

def parse_whatsapp_chat(chat_file):
    chat_data = []
    pattern = r"(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}\s?(?:AM|PM|am|pm)?) - (.*?): (.*)"
    
    with open(chat_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    for line in lines:
        match = re.match(pattern, line)
        if match:
            timestamp = match.group(1)
            sender = match.group(2)
            message = match.group(3)
            chat_data.append({'timestamp': timestamp, 'sender': sender, 'message': message})
    
    return chat_data

# Example usage
chat_data = parse_whatsapp_chat("psyduck.txt")
print(chat_data[:5])

[{'timestamp': '07/03/24, 5:02\u202fpm', 'sender': 'Psyduck', 'message': 'Kiska kasam khate h Ayush Karn'}, {'timestamp': '07/03/24, 5:02\u202fpm', 'sender': 'Psyduck', 'message': 'Yash ka? Apna?'}, {'timestamp': '07/03/24, 5:02\u202fpm', 'sender': 'Ayush Karn', 'message': '<Media omitted>'}, {'timestamp': '07/03/24, 5:02\u202fpm', 'sender': 'Ayush Karn', 'message': 'Mummy ka'}, {'timestamp': '07/03/24, 5:05\u202fpm', 'sender': 'Ayush Karn', 'message': '?'}]


In [27]:
conversation_pairs = []
for i in range(1, len(chat_data)):
    if chat_data[i]['sender'] == "Ayush Karn":
        conversation_pairs.append((chat_data[i-1]['message'], chat_data[i]['message']))
conversation_pairs

[('Yash ka? Apna?', '<Media omitted>'),
 ('<Media omitted>', 'Mummy ka'),
 ('Mummy ka', '?'),
 ('?', '?'),
 ('?', 'Megha?'),
 ('Jao Shanti se', 'Ok'),
 ('Ok', '?'),
 ('?', 'null'),
 ('null', '?'),
 ('?', 'Aaaooo'),
 ('Aaaooo', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', '.'),
 ('.', 'null'),
 ('null', '.'),
 ('.', '.'),
 ('.', '.'),
 ('?', 'Baat karo'),
 ('Kyu', 'Pata nai'),
 ('Pata nai', 'Nai man kr ra Jane ka'),
 ('Nai man kr ra Jane ka',
  'Ab bhi bharosa nai hai ki hata diye hai har jagha se?'),
 ('Ha', 'Kaise dilaye?'),
 ('Idk', 'Hum pura chat clear kr diye, media ke saath'),
 ('Hum pura chat clear kr diye, media ke saath', 'Humko rakhna hota hai chat'),
 ('Phone me kahi bhi daal sakta h', 'Phone format kr de?'),
 ('Tum kaha kaha 

In [29]:
from transformers import GPT2Tokenizer
import torch
from torch.utils.data import Dataset

# Step 1: Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to EOS

# Step 2: Prepare Dataset
class ChatDataset(Dataset):
    def __init__(self, conversation_pairs, tokenizer, max_length=512):
        self.input_ids = []
        self.attention_masks = []
        
        for input_text, output_text in conversation_pairs:
            # Combine input and output with a separator
            prompt = f"User: {input_text} Bot: {output_text}"
            encodings_dict = tokenizer(prompt, truncation=True, max_length=max_length, padding="max_length")
            
            self.input_ids.append(encodings_dict['input_ids'])
            self.attention_masks.append(encodings_dict['attention_mask'])
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_masks[idx]),
            'labels': torch.tensor(self.input_ids[idx])  # Labels are the same as input_ids for causal LM
        }
    
dataset = ChatDataset(conversation_pairs, tokenizer)



In [33]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load the Pretrained GPT-2 Model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # Mixed precision training if supported
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Train the Model
trainer.train()

# Save the Model and Tokenizer
trainer.save_model("./chatbot_model")
tokenizer.save_pretrained("./chatbot_model")


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.