In [1]:
import os
import pandas as pd

In [2]:
# Load the dialogue files
dialogues_text_file = 'dailydialog/dialogues_text.txt'
dialogues_topic_file = 'dailydialog/dialogues_topic.txt'
dialogues_act_file = 'dailydialog/dialogues_act.txt'
dialogues_emotion_file = 'dailydialog/dialogues_emotion.txt'

In [3]:
# Read the dialogue text data
with open(dialogues_text_file, 'r', encoding='utf-8') as f:
    dialogues = f.readlines()

print(dialogues[:10])

["The kitchen stinks . __eou__ I'll throw out the garbage . __eou__\n", 'So Dick , how about getting some coffee for tonight ? __eou__ Coffee ? I don ’ t honestly like that kind of stuff . __eou__ Come on , you can at least try a little , besides your cigarette . __eou__ What ’ s wrong with that ? Cigarette is the thing I go crazy for . __eou__ Not for me , Dick . __eou__\n', 'Are things still going badly with your houseguest ? __eou__ Getting worse . Now he ’ s eating me out of house and home . I ’ Ve tried talking to him but it all goes in one ear and out the other . He makes himself at home , which is fine . But what really gets me is that yesterday he walked into the living room in the raw and I had company over ! That was the last straw . __eou__ Leo , I really think you ’ re beating around the bush with this guy . I know he used to be your best friend in college , but I really think it ’ s time to lay down the law . __eou__ You ’ re right . Everything is probably going to come to

In [4]:
# Load topics, acts, and emotions
with open(dialogues_topic_file, 'r', encoding='utf-8') as f:
    topics = f.readlines()

with open(dialogues_act_file, 'r', encoding='utf-8') as f:
    acts = f.readlines()

with open(dialogues_emotion_file, 'r', encoding='utf-8') as f:
    emotions = f.readlines()

In [5]:
# Convert dialogues to user-response pairs
dialogue_pairs = []
for dialogue in dialogues:
    lines = dialogue.split('__eou__')[:-1]
    for i in range(len(lines) - 1):
        dialogue_pairs.append((lines[i].strip(), lines[i + 1].strip()))

# Creating a dataframe for visualization
df = pd.DataFrame(dialogue_pairs, columns=['User', 'Response'])
df.head()

Unnamed: 0,User,Response
0,The kitchen stinks .,I'll throw out the garbage .
1,"So Dick , how about getting some coffee for to...",Coffee ? I don ’ t honestly like that kind of ...
2,Coffee ? I don ’ t honestly like that kind of ...,"Come on , you can at least try a little , besi..."
3,"Come on , you can at least try a little , besi...",What ’ s wrong with that ? Cigarette is the th...
4,What ’ s wrong with that ? Cigarette is the th...,"Not for me , Dick ."


In [14]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from transformers import Trainer, TrainingArguments
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset


In [8]:
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [9]:
# Tokenize the input and response pairs
def tokenize_function(examples):
    return tokenizer(examples['User'], padding="max_length", truncation=True)


In [15]:
# Set the pad token
tokenizer.pad_token = tokenizer.eos_token


In [16]:
# Tokenize the input and response pairs
def tokenize_data(data, tokenizer, max_length=128):
    inputs = tokenizer(
        data['User'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    labels = tokenizer(
        data['Response'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    ).input_ids
    return inputs, labels

In [17]:
# Create a custom dataset class
class DialogueDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs.input_ids)

    def __getitem__(self, idx):
        input_ids = self.inputs.input_ids[idx]
        attention_mask = self.inputs.attention_mask[idx]
        labels = self.labels[idx]
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [18]:
# Prepare tokenized datasets
inputs, labels = tokenize_data(df, tokenizer)
train_dataset = DialogueDataset(inputs, labels)

In [19]:
# Prepare data loader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [20]:
# Set device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [23]:
# Training loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        # Move batch data to the correct device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({"Loss": total_loss / len(train_loader)})

    print(f"Epoch {epoch + 1} finished. Loss: {total_loss / len(train_loader)}")

print("Training completed!")

Epoch 1/1:   0%|          | 0/22466 [00:00<?, ?it/s]

Epoch 1/1:   0%|          | 16/22466 [02:33<59:46:37,  9.59s/it, Loss=0.000714]