In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import random
from transformers import AutoTokenizer, AutoModelForCausalLM

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MiloDataset(Dataset):
    def __init__(self, inputs, responses, tokenizer, max_len=128):
        self.inputs = inputs
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # concat input and response into one string for better generation
        input_text = self.inputs[idx]
        response_text = self.responses[idx]

        prompt = f"User: {input_text} Dog: {response_text}"

        encoding = self.tokenizer(
            prompt,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids
        }

In [3]:
df = pd.read_csv("milo.csv")
df = df.sample(frac=1).reset_index(drop=True)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token # gpt2 doesn't have pad token, so use eos

dataset = MiloDataset(df['input'].tolist(), df['response'].tolist(), tokenizer)

train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

In [4]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer)) # important if padding token was added

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    model.train() # training mode
    total_loss = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()       # compute gradients
        optimizer.step()      # update parameters
        optimizer.zero_grad() # reset gradients

        total_loss += loss.item()

    return total_loss / len(dataloader)

def eval_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
num_epochs = 7

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = eval_epoch(model, val_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

test_loss = eval_epoch(model, test_loader, device)
print(f"Test Loss: {test_loss:.4f}")

# takes about 190 minutes for 7 epochs

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1/7 | Train Loss: 0.1063 | Val Loss: 0.0704
Epoch 2/7 | Train Loss: 0.0700 | Val Loss: 0.0680
Epoch 3/7 | Train Loss: 0.0687 | Val Loss: 0.0676
Epoch 4/7 | Train Loss: 0.0680 | Val Loss: 0.0667
Epoch 5/7 | Train Loss: 0.0675 | Val Loss: 0.0672
Epoch 6/7 | Train Loss: 0.0672 | Val Loss: 0.0668
Epoch 7/7 | Train Loss: 0.0669 | Val Loss: 0.0670
Test Loss: 0.0671


In [7]:
model.save_pretrained("generative_model")
tokenizer.save_pretrained("generative_model")

('generative_model/tokenizer_config.json',
 'generative_model/special_tokens_map.json',
 'generative_model/vocab.json',
 'generative_model/merges.txt',
 'generative_model/added_tokens.json')

In [8]:
def generate_response(user_input, model, tokenizer, device, max_length=50):
    # EX: User: are you hungry? Dog: yes! i could eat a whole squirrel!
    
    model.eval() # turn off any training behaviors

    # give the model a prompt to 'fill in' (i.e. the same structure it trained on)
    prompt = f"User: {user_input} Dog:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # generate a response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    # decode the generated response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # extract only the dog's part
    if "Dog:" in generated_text:
        return generated_text.split("Dog:")[-1].strip()
    else:
        return generated_text

example_input = "who's shelby?"
response = generate_response(example_input, model, tokenizer, device)
print("Milo says:", response)


Milo says: hump


## For additional training

In [9]:
df = pd.read_csv("milo.csv")
df = df.sample(frac=1).reset_index(drop=True)

dataset = MiloDataset(df['input'].tolist(), df['response'].tolist(), tokenizer)

train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("generative_model")
model = AutoModelForCausalLM.from_pretrained("generative_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

num_additional_epochs = 5

for epoch in range(num_additional_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = eval_epoch(model, val_loader, device)
    print(f"[CONT] Epoch {epoch+1}/{num_additional_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

test_loss = eval_epoch(model, test_loader, device)
print(f"Final Test Loss: {test_loss:.4f}")