<a href="https://colab.research.google.com/github/asinghMsc/financialLLM/blob/main/financial_records_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from google.colab import files

df = pd.read_csv('sample_data/trainable_transactions.csv')
df.head()

Unnamed: 0,prompt,target
0,Tesco - $149.6 on 2025-02-12,"{""merchant"": ""Tesco"", ""category"": ""Groceries"",..."
1,Domino's - $148.61 on 2024-07-22,"{""merchant"": ""Domino's"", ""category"": ""Restaura..."
2,Thames Water - $11.2 on 2024-06-18,"{""merchant"": ""Thames Water"", ""category"": ""Util..."
3,Domino's - $103.54 on 2025-03-23,"{""merchant"": ""Domino's"", ""category"": ""Restaura..."
4,British Gas - $144.62 on 2024-09-23,"{""merchant"": ""British Gas"", ""category"": ""Utili..."


In [2]:
def to_training_example(row):
  return f"<|prompt|> {row['prompt']} <|target|> {row['target']} <|end|>"

examples = df.apply(to_training_example, axis=1)

In [3]:
with open("train_data.txt", "w") as f:
    for line in examples:
        f.write(line + "\n")


In [4]:
pip install tiktoken



In [5]:
import tiktoken

enc = tiktoken.get_encoding('cl100k_base')

with open('train_data.txt', "r") as f:
  data = f.read()

encoded = enc.encode(data)
print(f"encoded length: {len(encoded)}")

encoded length: 69451


In [6]:
import numpy as np

#90/10 split train val
n = int(0.9*len(encoded))
train_ids = np.array(encoded[:n], dtype=np.int32)
val_ids = np.array(encoded[n:], dtype=np.int32)

#save to bins
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

#print save
print("saved files")

saved files


In [1]:
import torch
import numpy as np

#hyper parameters, batch, dtype, block size, device

batch_size = 32
block_size = 128
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float32


#load bin data

train_data = np.memmap('train.bin', dtype=np.int32, mode='r')
val_data = np.memmap('val.bin', dtype=np.int32, mode='r')


def get_batch(split):
  data = train_data if split =='train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([torch.tensor(data[i:i+block_size], dtype=torch.long) for i in ix])
  y = torch.stack([torch.tensor(data[i+1:i+1+block_size], dtype=torch.long) for i in ix])

  return x.to(device), y.to(device)


In [2]:
vocab_size = 100_000 # cl100k
n_embed = 256 #tune later

In [7]:
import torch.nn as nn
import torch.nn.functional

class Transformer(nn.Module):
  def __init__(self, vocab_size, n_embed, block_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
    self.position_embedding_table = nn.Embedding(block_size, n_embed)
    self.lm_head = nn.Linear(n_embed, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb

    logits = self.lm_head(x)

    if targets is None:
      loss = None

    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = nn.functional.cross_entropy(logits, targets)

    return logits, loss

In [8]:
#trying forward pass

model = Transformer(vocab_size, n_embed, block_size).to(device)
xb, yb = get_batch('train')
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([4096, 100000])
tensor(11.9235, grad_fn=<NllLossBackward0>)


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttentionHead(nn.Module):
  def __init__(self, n_embed, head_size, block_size):
    super().__init__()
    self.key = nn.Linear(n_embed, head_size, bias=False)
    self.query = nn.Linear(n_embed, head_size, bias=False)
    self.value = nn.Linear(n_embed, head_size, bias=False)
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(0.1)
    self.head_size = head_size

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)

    #compute attention scores
    wei = q @ k.transpose(-2, -1) / self.head_size**0.5

    # casual mask prevent tokens from attending to future tokens
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

    # softmax normalisation
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x)
    out = wei @ v
    return out


In [12]:
head = SelfAttentionHead(n_embed=128, head_size=32, block_size=64)
x = torch.randn(4, 64, 128)  # dummy input (B=4, T=64, C=128)
out = head(x)
print(out.shape)
#(4, 64, 32)


torch.Size([4, 64, 32])


In [15]:
class MultiHeadAttention(nn.Module):
  def __init__(self, n_heads, n_embed, head_size, block_size):
    super().__init__()
    self.heads = nn.ModuleList([
        SelfAttentionHead(n_embed, head_size, block_size) for _ in range(n_heads)
    ])
    self.proj = nn.Linear(n_heads * head_size, n_embed)
    self.dropout = nn.Dropout(0.1)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

In [16]:
x = torch.randn(4, 64, 128)  # (B, T, C)
mha = MultiHeadAttention(n_heads=4, n_embed=128, head_size=32, block_size=64)
out = mha(x)
print(out.shape)  # (4, 64, 128)


torch.Size([4, 64, 128])
