In [2]:
import pandas as pd
from google.colab import files

df = pd.read_csv('sample_data/trainable_categories.csv')
df.head()

Unnamed: 0,prompt,target
0,CloudBooks | business audit | £399.93 | Master...,Professional Services
1,CloudBooks | transfer surcharge | $164.51 | Vi...,Bank Fees
2,EcoRide | team subscription | £452.64 | Direct...,Subscriptions
3,BizPhone Inc. | desk organizer | £192.08 | Dir...,Office Supplies
4,LunchMate | internet service | $174.07 | Direc...,Utilities


In [3]:
def to_training_example(row):
  return f"<|prompt|> {row['prompt']} <|target|> {row['target']} <|end|>"

examples = df.apply(to_training_example, axis=1)

In [4]:
with open("train_data.txt", "w") as f:
    for line in examples:
        f.write(line + "\n")


In [5]:
pip install tiktoken



In [6]:
import tiktoken

enc = tiktoken.get_encoding('cl100k_base')

with open('train_data.txt', "r") as f:
  data = f.read()

encoded = enc.encode(data)
print(f"encoded length: {len(encoded)}")

encoded length: 72255


In [7]:
import numpy as np

#90/10 split train val
n = int(0.9*len(encoded))
train_ids = np.array(encoded[:n], dtype=np.int32)
val_ids = np.array(encoded[n:], dtype=np.int32)

#save to bins
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

#print save
print("saved files")

saved files


In [8]:
import torch
import numpy as np

#hyper parameters, batch, dtype, block size, device

batch_size = 32
block_size = 128
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float32


#load bin data

train_data = np.memmap('train.bin', dtype=np.int32, mode='r')
val_data = np.memmap('val.bin', dtype=np.int32, mode='r')


def get_batch(split):
  data = train_data if split =='train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([torch.tensor(data[i:i+block_size], dtype=torch.long) for i in ix])
  y = torch.stack([torch.tensor(data[i+1:i+1+block_size], dtype=torch.long) for i in ix])

  return x.to(device), y.to(device)


In [9]:
vocab_size = 100_000 # cl100k but needs tuning cos of the train size
n_embed = 256 #tune later

In [10]:
import torch.nn as nn
import torch.nn.functional

class Transformer(nn.Module):
  def __init__(self, vocab_size, n_embed, block_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb

    logits = self.lm_head(x)

    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = nn.functional.cross_entropy(logits, targets)

    return logits, loss

In [11]:
#trying forward pass

model = Transformer(vocab_size, n_embed, block_size).to(device)
xb, yb = get_batch('train')
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([4096, 100000])
tensor(11.8130, grad_fn=<NllLossBackward0>)


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttentionHead(nn.Module):
  def __init__(self, n_embed, head_size, block_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(0.1)
    self.head_size = head_size

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)

    #compute attention scores
    wei = q @ k.transpose(-2, -1) / self.head_size**0.5

    # casual mask prevent tokens from attending to future tokens
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

    # softmax normalisation
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x)
    out = wei @ v
    return out


In [13]:
head = SelfAttentionHead(n_embed=128, head_size=32, block_size=64)
x = torch.randn(4, 64, 128)
out = head(x)
print(out.shape)
#(4, 64, 32)


torch.Size([4, 64, 32])


In [14]:
class MultiHeadAttention(nn.Module):
  def __init__(self, n_heads, n_embed, head_size, block_size):
    super().__init__()
    self.heads = nn.ModuleList([
        SelfAttentionHead(n_embed, head_size, block_size) for _ in range(n_heads)
    ])
    self.proj = nn.Linear(n_heads * head_size, n_embed)
    self.dropout = nn.Dropout(0.1)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

In [15]:
x = torch.randn(4, 64, 128)
mha = MultiHeadAttention(n_heads=4, n_embed=128, head_size=32, block_size=64)
out = mha(x)
print(out.shape)


torch.Size([4, 64, 128])


In [16]:
class FeedForward(nn.Module):
  def __init__(self, n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed, 4 * n_embed),
        nn.ReLU(),
        nn.Linear(4 * n_embed, n_embed),
        nn.Dropout(0.1)
    )

  def forward(self, x):
      return self.net(x)

In [17]:
class TransformerBlock(nn.Module):
  def __init__(self, n_embed, n_heads, block_size):
    super().__init__()
    head_size = n_embed // n_heads
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)
    self.sa = MultiHeadAttention(n_heads, n_embed, head_size, block_size)
    self.ffwd = FeedForward(n_embed)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x


In [18]:
x = torch.randn(4, 64, 128)
block = TransformerBlock(n_embed=128, n_heads=4, block_size=64)
out = block(x)
print(out.shape)


torch.Size([4, 64, 128])


In [19]:
class Transformer(nn.Module):
  def __init__(self, vocab_size, n_embed, block_size, n_heads, n_layers):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)

    self.blocks = nn.Sequential(*[
        TransformerBlock(n_embed, n_heads, block_size) for _ in range(n_layers)
    ])
    self.ln_f = nn.LayerNorm(n_embed)
    self.head = nn.Linear(n_embed, vocab_size)

    self.block_size = block_size
    self.vocab_Size = vocab_size

  def forward(self, idx, targets=None):
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb

    x = self.blocks(x)
    x =self.ln_f(x)
    logits = self.head(x)

    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss


  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
        idx_cond =idx[:, -self.block_size:]
        logits, _ = self(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, next_token), dim=1)

    return idx

In [20]:
model = Transformer(
    vocab_size = vocab_size,
    n_embed = 128,
    block_size = block_size,
    n_heads = 4,
    n_layers = 4,
).to(device)

xb, yb = get_batch('train')
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([4096, 100000])
tensor(11.6779, grad_fn=<NllLossBackward0>)


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import torch.nn.functional as F


#HP
batch_size = 32 # had to reduce
block_size = 64 # had to reduce this too
max_iters = 2000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'


#load bins
train_data = np.memmap('train.bin', dtype=np.int32, mode='r')
val_data = np.memmap('val.bin', dtype=np.int32, mode='r')
vocab_size = 100000


#data loader in batches
def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([torch.tensor(data[i:i+block_size]) for i in ix])
  y = torch.stack(([torch.tensor(data[i+1:i+block_size+1]) for i in ix]))
  return x.to(device), y.to(device).long()


model = Transformer(
    vocab_size = vocab_size,
    n_embed = 128,
    block_size = block_size,
    n_heads = 4,
    n_layers = 4,
).to(device)


optimiser = optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()


#track best val loss
best_val_loss = float('inf')


#loop
for step in range(max_iters):
  model.train()
  xb, yb = get_batch('train')

  logits, loss = model(xb, yb)

  optimiser.zero_grad()
  loss.backward()
  optimiser.step()

  #evaluate
  if step % eval_interval == 0:
    model.eval()
    with torch.no_grad():
      val_xb, val_yb = get_batch('val')
      val_logits, val_loss = model(val_xb, val_yb)

      print(f"[step {step}] train loss: {loss.item():.4f} | val loss: {val_loss.item():.4f}")

      #save best checkpoint
      if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(model.state_dict(), 'best_model.pt')
        print("~~~~ Saved best new model ~~~~")

print("~~~ TRAINING COMPLETE ~~~~")

[step 0] train loss: 11.7048 | val loss: 11.0875
~~~~ Saved best new model ~~~~
[step 100] train loss: 1.3153 | val loss: 1.3097
~~~~ Saved best new model ~~~~
[step 200] train loss: 0.9245 | val loss: 0.9005
~~~~ Saved best new model ~~~~
[step 300] train loss: 0.8583 | val loss: 0.8589
~~~~ Saved best new model ~~~~
[step 400] train loss: 0.8369 | val loss: 0.8361
~~~~ Saved best new model ~~~~
[step 500] train loss: 0.8314 | val loss: 0.8369
[step 600] train loss: 0.8216 | val loss: 0.8450
[step 700] train loss: 0.7910 | val loss: 0.8195
~~~~ Saved best new model ~~~~
[step 800] train loss: 0.8193 | val loss: 0.7988
~~~~ Saved best new model ~~~~
[step 900] train loss: 0.7529 | val loss: 0.8281
[step 1000] train loss: 0.7705 | val loss: 0.8062
[step 1100] train loss: 0.7265 | val loss: 0.8133
[step 1200] train loss: 0.7460 | val loss: 0.7781
~~~~ Saved best new model ~~~~
[step 1300] train loss: 0.7406 | val loss: 0.8528
[step 1400] train loss: 0.7034 | val loss: 0.8175
[step 1500] 