In [1]:
# Import data readers
from ipynb.fs.defs.data_preparation import read_movie_metadata, read_character_metadata, read_line_data, read_conversations_data

In [2]:
movie_df = read_movie_metadata()
character_df = read_character_metadata(movie_df)
line_df = read_line_data(movie_df, character_df)
conversation_df = read_conversations_data(movie_df, character_df, line_df)

In [3]:
print(f"Number of movies: {len(movie_df)}")
print(f"Number of characters: {len(character_df)}")
print(f"Number of lines: {len(line_df)}")
print(f"Number of conversations: {len(conversation_df)}")

Number of movies: 617
Number of characters: 9035
Number of lines: 304713
Number of conversations: 83097


In [4]:
df = conversation_df.copy(deep=True)

In [5]:
def convert_to_conversation(conversation_data, line_df):
    # Turn conversation into chat format of one input and one response
    # For each pair of lines, add start of sentence token, end of sentence token, and bot response

    chats = []
    for data in conversation_data:
        # If it is not even, drop the last line as it is missing the bot response
        if len(data) % 2 == 1:
            data = data[:-1]

        for idx, line in enumerate(data):
            line_text = line_df[line_df["id"] == line]["line"].values[0]

            if idx % 2 == 0:
                chat = " ".join(["<SOS>", line_text])
            else:
                chat = " ".join([chat, "<BOT>", line_text, "<EOS>"])
                chats.append(chat)

    return chats


chats = convert_to_conversation(conversation_df["lines"].values, line_df)

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=True)
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens for start/end of sentence, padding, and bot response
tokenizer.add_special_tokens({"pad_token": "<PAD>", "bos_token": "<SOS>", "eos_token": "<EOS>"})
tokenizer.add_tokens(["<BOT>"])
model.resize_token_embeddings(len(tokenizer))

  from .autonotebook import tqdm as notebook_tqdm


Embedding(50261, 768)

In [7]:
print(tokenizer.decode(model.generate(**tokenizer("<SOS> Hi how are you? <BOT> ", return_tensors="pt"))[0]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<SOS> Hi how are you?  <BOT>  <PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>


In [8]:
from torch.utils.data import Dataset
class ChatData(Dataset):
    def __init__(self, chats, tokenizer):
        self.chats = chats
        self.tokenizer = tokenizer
        self.max_len = 128

        self.encoded_data = self.tokenizer(self.chats, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        self.input_id = self.encoded_data["input_ids"]
        self.attention_mask = self.encoded_data["attention_mask"]

    def __len__(self):
        return len(self.chats)

    def __getitem__(self, idx):
        return self.input_id[idx], self.attention_mask[idx]

In [9]:
import torch
from torch.optim import Adam
from tqdm import tqdm
from pathlib import Path


def train_model(chat_data, model, optimizer, device, epochs=10, save_every=5):
    model.train()
    for epoch in tqdm(range(epochs)):
        batch = 0
        for input_id, attention_mask in chat_data:
            batch += 1
            # print(f"Batch: {batch}")
            input_id = input_id.to(device)
            attention_mask = attention_mask.to(device)
            output = model(input_id, attention_mask=attention_mask, labels=input_id)
            loss = output.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Print example chat
        # example = tokenizer("<SOS> Hi I am Ethan, how are you? <BOT>", return_tensors="pt")
        # input_id = example["input_ids"].to(device)
        # attention_mask = example["attention_mask"].to(device)
        # print(tokenizer.decode(model.generate(input_id, attention_mask=attention_mask)[0]))

        if epoch % save_every == 0:
            torch.save(model.state_dict(), Path(f"models/gpt2/checkpoints/model_{epoch}.pt"))

    torch.save(model.state_dict(), Path(f"models/gpt2/final/model_final.pt"))

def inference(model, tokenizer, device):
    model.eval()
    chat = input("User: ")
    while chat != "quit" and chat != "q":
        chat = tokenizer(" ".join(["<SOS>", chat, "<BOT>"]), return_tensors="pt")

        input_id = chat["input_ids"].to(device)
        attention_mask = chat["attention_mask"].to(device)
        output = model.generate(input_id, attention_mask=attention_mask, max_length=128)
        print(tokenizer.decode(output[0], skip_special_tokens=True))
        chat = input("User: ")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [10]:
from torch.utils.data import DataLoader

chat_data = DataLoader(ChatData(chats[0:8192], tokenizer), batch_size=16, shuffle=True)

In [11]:
optimizer = Adam(model.parameters(), lr=5e-5)
train_model(chat_data, model, optimizer, device, epochs=100, save_every=10)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 100/100 [4:33:17<00:00, 163.97s/it] 


In [12]:
inference(model, tokenizer, device)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 Hi how are you?  <BOT>  Been better.
  How are you?
  How are you?



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 You said you've been better, whats wrong  <BOT>  You won't be back in town two weeks from now.
  Where are you going?
  It's just a long drive.
 


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 Im goin to Alaska  <BOT>  Make yourself at home.  I'll call you a cab.
  Let's go.
  How are we going to make it?
 


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 pee pee poo poo  <BOT>  Poppa Joe said there was only two. In and out.  Boy, you guys sure did a good job. You're good, huh? Cool masks. Where'd you get them?
  <BOT>  Let's do him right here.
  We gotta go.



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 Pee Pee Poo Poo  <BOT>  I can't believe you've sunk so low.
  <BOT>  I'm sorry, Gary, but I've got to have a drink.
  We're just about to begin the process.
 
