In [30]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import warnings
warnings.filterwarnings("ignore")


In [31]:
df = pd.read_csv('milo.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.head(10))

                    input                  response
0            Good morning              bark (hello)
1               Milo Sit!                      sits
2                   Milo?                    pounce
3    milo are you hungry?    bark (yes i want food)
4    milo are you hungry?    bark (yes i want food)
5    Milo is the best boy                 wags tail
6           is milo ready  jump! jump! (I am ready)
7   milo eat those people                     grrrr
8  milo where is your paw                       paw
9   are you ready to eat?  jump! jump! (I am ready)


In [32]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MiloDataset(Dataset):
    def __init__(self, texts, responses, tokenizer, max_len=128):
        self.texts = texts
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        # assume responses are class indices (like 0, 1, 2, etc.)
        label = torch.tensor(self.responses[idx], dtype=torch.long)

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

label_mapping = {label: idx for idx, label in enumerate(df['response'].unique())}
df['label'] = df['response'].map(label_mapping)

full_dataset = MiloDataset(df['input'].tolist(), df['label'].tolist(), tokenizer)

train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(full_dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def eval_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

num_epochs = 6

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc = eval_epoch(model, val_loader, device)
    
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    
def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")

evaluate_model(model, test_loader, device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 | Train Loss: 2.4535 | Val Loss: 1.8965 | Val Acc: 0.6909
Epoch 2/10 | Train Loss: 1.5953 | Val Loss: 1.0674 | Val Acc: 0.9636
Epoch 3/10 | Train Loss: 0.7986 | Val Loss: 0.4695 | Val Acc: 0.9727
Epoch 4/10 | Train Loss: 0.3858 | Val Loss: 0.2247 | Val Acc: 1.0000
Epoch 5/10 | Train Loss: 0.2183 | Val Loss: 0.1306 | Val Acc: 1.0000
Epoch 6/10 | Train Loss: 0.1444 | Val Loss: 0.0870 | Val Acc: 1.0000
Epoch 7/10 | Train Loss: 0.1057 | Val Loss: 0.0742 | Val Acc: 0.9909
Epoch 8/10 | Train Loss: 0.0848 | Val Loss: 0.0490 | Val Acc: 1.0000
Epoch 9/10 | Train Loss: 0.0648 | Val Loss: 0.0397 | Val Acc: 1.0000
Epoch 10/10 | Train Loss: 0.0624 | Val Loss: 0.0346 | Val Acc: 1.0000
Test Accuracy: 1.0000


In [37]:
model.save_pretrained("milo_model")
tokenizer.save_pretrained("milo_model")

('milo_model/tokenizer_config.json',
 'milo_model/special_tokens_map.json',
 'milo_model/vocab.txt',
 'milo_model/added_tokens.json')

In [36]:
def predict_single_input(model, tokenizer, text, label_mapping, device):
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1)

    idx_to_label = {v: k for k, v in label_mapping.items()}
    predicted_response = idx_to_label[prediction.item()]
    return predicted_response

your_input = "bark"
response = predict_single_input(model, tokenizer, your_input, label_mapping, device)
print("Milo says:", response)

Milo says: grrrr
