# Chat Reply Recommendation Model
Preprocessing, training, evaluation, and reply generation using GPT-2.


In [ ]:
# Imports
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from utils import clean_text


In [ ]:
# Load datasets
# Each CSV is read into its own frame; the raw text lives in the `message` column.
userA_df = pd.read_csv('./dataset/userA_chats.csv')
userB_df = pd.read_csv('./dataset/userB_chats.csv')

# Keep the raw messages and store the cleaned version alongside them.
for chat_df in (userA_df, userB_df):
    chat_df['clean_text'] = chat_df['message'].apply(clean_text)

# Pair rows positionally as (user B message, user A message).
# NOTE(review): zip() stops at the shorter column, so any extra rows in the
# longer file are dropped silently -- confirm both CSVs are row-aligned.
b_texts = userB_df['clean_text']
a_texts = userA_df['clean_text']
conversations = list(zip(b_texts, a_texts))


In [ ]:
# Tokenizer and Model
# Both objects are loaded from the same pretrained checkpoint name.
PRETRAINED_NAME = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)
# The dataset cell pads every example to a fixed length, which needs a pad
# token; the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(PRETRAINED_NAME)


In [ ]:
# Encode data and prepare for training
from torch.utils.data import Dataset

class ChatDataset(Dataset):
    """Fixed-length causal-LM training examples built from message pairs.

    Every ``(b_msg, a_msg)`` pair is encoded as ``b_msg <eos> a_msg <eos>``
    and truncated or padded to ``max_length`` tokens.

    Args:
        conversations: Iterable of ``(b_msg, a_msg)`` string pairs, where
            ``b_msg`` is the incoming message and ``a_msg`` the reply.
        tokenizer: Callable tokenizer exposing ``eos_token`` whose result
            contains ``input_ids`` and ``attention_mask`` with a leading
            batch dimension of size 1.
        max_length: Number of tokens every example is padded/truncated to.

    Attributes:
        inputs: One 1-D token-id tensor per conversation.
        attention_masks: Matching 1-D masks (1 = real token, 0 = padding).
        labels: Copy of ``inputs`` with padded positions set to
            ``IGNORE_INDEX`` so they do not contribute to the loss.
    """

    # Label value skipped by the language-model loss (the default
    # ``ignore_index`` of PyTorch's cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, conversations, tokenizer, max_length=128):
        self.inputs = []
        self.attention_masks = []
        self.labels = []
        eos = tokenizer.eos_token
        for b_msg, a_msg in conversations:
            # The trailing EOS marks the end of the reply. It is required now
            # that padding is masked out of the labels: the pad token is EOS
            # too, so without an explicit terminator the model would never be
            # trained to stop. (Truncated examples may lose it.)
            enc = tokenizer(b_msg + eos + a_msg + eos,
                            truncation=True,
                            padding='max_length',
                            max_length=max_length,
                            return_tensors='pt')
            # squeeze(0) drops only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_length == 1.
            input_ids = enc['input_ids'].squeeze(0)
            attention_mask = enc['attention_mask'].squeeze(0)
            # Mask by attention_mask rather than by token id: pad and EOS
            # share an id, and the real EOS tokens must stay in the labels.
            labels = input_ids.masked_fill(attention_mask == 0,
                                           self.IGNORE_INDEX)
            self.inputs.append(input_ids)
            self.attention_masks.append(attention_mask)
            self.labels.append(labels)

    def __len__(self):
        """Return the number of encoded conversations."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the model inputs and masked labels for example ``idx``."""
        return {
            'input_ids': self.inputs[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx],
        }

# Encode every (user B, user A) pair once, up front, with the default max_length.
dataset = ChatDataset(conversations=conversations, tokenizer=tokenizer)


In [ ]:
# Training arguments
# The same directory is used for the Trainer's output and for the final
# model and tokenizer files.
MODEL_DIR = './model'

run_config = dict(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
)
training_args = TrainingArguments(output_dir=MODEL_DIR, **run_config)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Fine-tune, then write the model followed by the tokenizer to MODEL_DIR.
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_DIR)