# GPT-2 Training

In [14]:
# Import data readers
from ipynb.fs.defs.data_preparation import read_movie_metadata, read_character_metadata, read_line_data, read_conversations_data

In [15]:
movie_df = read_movie_metadata()
character_df = read_character_metadata(movie_df)
line_df = read_line_data(movie_df, character_df)
conversation_df = read_conversations_data(movie_df, character_df, line_df)

In [16]:
print(f"Number of movies: {len(movie_df)}")
print(f"Number of characters: {len(character_df)}")
print(f"Number of lines: {len(line_df)}")
print(f"Number of conversations: {len(conversation_df)}")

Number of movies: 617
Number of characters: 9035
Number of lines: 304713
Number of conversations: 83097


In [17]:
df = conversation_df.copy(deep=True)

In [18]:
def convert_to_conversation(conversation_data, line_df):
    # Turn conversation into chat format of one input and one response
    # For each pair of lines, add start of sentence token, end of sentence token, and bot response

    chats = []
    for data in conversation_data:
        # If it is not even, drop the last line as it is missing the bot response
        if len(data) % 2 == 1:
            data = data[:-1]

        for idx, line in enumerate(data):
            line_text = line_df[line_df["id"] == line]["line"].values[0]

            if idx % 2 == 0:
                chat = " ".join(["<SOS>", line_text])
            else:
                chat = " ".join([chat, "<BOT>", line_text, "<EOS>"])
                chats.append(chat)

    return chats


chats = convert_to_conversation(conversation_df["lines"].values, line_df)

In [19]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=True)
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens for start/end of sentence, padding, and bot response
tokenizer.add_special_tokens({"pad_token": "<PAD>", "bos_token": "<SOS>", "eos_token": "<EOS>"})
tokenizer.add_tokens(["<BOT>"])
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [20]:
print(tokenizer.decode(model.generate(**tokenizer("<SOS> Hi how are you? <BOT> ", return_tensors="pt"))[0]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<SOS> Hi how are you?  <BOT>  <SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS><SOS>


In [21]:
from torch.utils.data import Dataset
class ChatData(Dataset):
    def __init__(self, chats, tokenizer):
        self.chats = chats
        self.tokenizer = tokenizer
        self.max_len = 128

        self.encoded_data = self.tokenizer(self.chats, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        self.input_id = self.encoded_data["input_ids"]
        self.attention_mask = self.encoded_data["attention_mask"]

    def __len__(self):
        return len(self.chats)

    def __getitem__(self, idx):
        return self.input_id[idx], self.attention_mask[idx]

In [22]:
import torch
from torch.optim import Adam
from tqdm import tqdm
from pathlib import Path


def train_model(chat_data, model, optimizer, device, epochs=10, save_every=5):
    model.train()
    for epoch in tqdm(range(epochs)):
        batch = 0
        for input_id, attention_mask in chat_data:
            batch += 1
            # print(f"Batch: {batch}")
            input_id = input_id.to(device)
            attention_mask = attention_mask.to(device)
            output = model(input_id, attention_mask=attention_mask, labels=input_id)
            loss = output.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Print example chat
        # example = tokenizer("<SOS> Hi I am Ethan, how are you? <BOT>", return_tensors="pt")
        # input_id = example["input_ids"].to(device)
        # attention_mask = example["attention_mask"].to(device)
        # print(tokenizer.decode(model.generate(input_id, attention_mask=attention_mask)[0]))

        if epoch % save_every == 0:
            torch.save(model.state_dict(), Path(f"models/gpt2/checkpoints/model_{epoch}.pt"))

    torch.save(model.state_dict(), Path(f"models/gpt2/final/model_final_20241013_1200.pt"))

def inference(model, tokenizer, device, chat):
    model.eval()
    chat = tokenizer(" ".join(["<SOS>", chat, "<BOT>"]), return_tensors="pt")
    input_id = chat["input_ids"].to(device)
    attention_mask = chat["attention_mask"].to(device)
    output = model.generate(input_id, attention_mask=attention_mask, max_length=128)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

def chat_with_model(model, tokenizer, device):
    model.eval()
    chat = input("User: ")
    while chat != "quit" and chat != "q":
        inference(model, tokenizer, device, chat)
        chat = input("User: ")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [23]:
from torch.utils.data import DataLoader

chat_data = DataLoader(ChatData(chats[0:20000], tokenizer), batch_size=16, shuffle=True)

In [24]:
optimizer = Adam(model.parameters(), lr=5e-5)
train_model(chat_data, model, optimizer, device, epochs=10, save_every=5)

100%|██████████| 10/10 [1:09:45<00:00, 418.56s/it]


In [25]:
model.load_state_dict(torch.load(Path("models/gpt2/final/model_final_20241013_1200.pt"), map_location=torch.device('cpu')))
model.to(device);

  model.load_state_dict(torch.load(Path("models/gpt2/final/model_final_20241013_1200.pt"), map_location=torch.device('cpu')))


In [26]:
inference(model, tokenizer, device, "The boy is our last hope.") # Star Wars
inference(model, tokenizer, device, "What are you going to do. bleed on me?") # Holy Grail
inference(model, tokenizer, device, "Son of a building block, it's Woody!") # Toy Story

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 The boy is our last hope.  <BOT>  I'm going to need him.
  <BOT>  You're going to need him.
 


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 What are you going to do. bleed on me?  <BOT>  I'm going to bleed on you.
 
 Son of a building block, it's Woody!  <BOT>  What's that?
  <BOT>  It's the building that houses the museum.
 


In [28]:
chat_with_model(model, tokenizer, device)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 Hello how are you?  <BOT>  I'm sorry I woke you up.
 
