In [1]:
import torch
import os
import config as c
from utils import WADataValidator
import numpy as np1
from random import randint
# basically a utils file but including pytorch.

In [2]:
# Build the project's data validator in "train" mode and take the two data splits it exposes.
dv = WADataValidator(mode="train")
train_data = dv.train_data
val_data = dv.val_data

loaded vocab


In [3]:
print(len(train_data), len(val_data))

2841284 315699


In [4]:
print(train_data[:10])

tensor([33, 82, 71, 86, 72, 77,  1, 51, 84, 81])


In [4]:
# Round-trip sanity check on the first n tokens: encode to ids, then decode the ids back to text.
n = 10
print(dv.encode(train_data[:n]))
print(dv.decode(dv.encode(train_data[:n])))

[33, 82, 71, 86, 72, 77, 1, 51, 84, 81]
Ashwin Sur


In [5]:
# Re-encode both splits and store each as an int64 (torch.long) tensor of token ids.
# Both conversions finish before either name is rebound.
train_data, val_data = (
    torch.tensor(dv.encode(split), dtype=torch.long)
    for split in (train_data, val_data)
)

In [6]:

# Things to keep in mind when implementing:
# 1. Training data must be sampled at random, i.e. each Q&A pair is drawn from an arbitrary position in the data.
# 2. Make sure successive texts are combined into one, joined by some separator (probably a '.').
# 3. A Q&A pair is always the text at position i and the text at position i+1 within a given 'conversation'.
# 4. A 'conversation' can be defined as texts sent within a short time frame; a gap of more than (say) 30 minutes starts a different conversation.
# 5. Both texts of a Q&A pair must come from the same conversation, paired either as [(1,2), (3,4), ...] or as [(1,2), (2,3), ...] -- more thought is needed here; personally a fan of [(1,2), (2,3), ...].
# 6. Maybe add start and end tokens, something like '<' and '>'.
# 7. Do some simple statistics on the messages to come up with appropriate values for block_size, conversation_length, etc.



In [5]:
# Notebook-level context length and batch size; block_size is used below to slice an example window.
# Note: get_batch() declares its own defaults for both values, so changing these globals
# does not affect get_batch() calls that omit the arguments.
block_size = 8
batch_size = 4

block_size, batch_size

(8, 4)

In [6]:
dv.encode(" ")

[1]

In [7]:
# One example window: block_size input tokens plus one extra token that serves as the final target.
qna = train_data[:block_size+1]
qna

tensor([33, 82, 71, 86, 72, 77,  1, 51, 84])

In [8]:
# Expand the window into every (context, next-token) pair it contains:
# the prefix ending at position k is the input, and the token at k + 1 is its target.
x = [qna[: k + 1] for k in range(len(qna) - 1)]
y = [qna[k + 1] for k in range(len(qna) - 1)]

In [9]:
x, y

([tensor([33]),
  tensor([33, 82]),
  tensor([33, 82, 71]),
  tensor([33, 82, 71, 86]),
  tensor([33, 82, 71, 86, 72]),
  tensor([33, 82, 71, 86, 72, 77]),
  tensor([33, 82, 71, 86, 72, 77,  1]),
  tensor([33, 82, 71, 86, 72, 77,  1, 51])],
 [tensor(82),
  tensor(71),
  tensor(86),
  tensor(72),
  tensor(77),
  tensor(1),
  tensor(51),
  tensor(84)])

In [10]:
def get_batch(block_size = 8, batch_size = 4, data = None):
    """Sample a random batch of next-token training examples.

    Args:
        block_size: number of tokens in each input sequence.
        batch_size: number of sequences in the batch.
        data: indexable 1-D token tensor to sample from. When omitted, the
            notebook-level ``train_data`` is used.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape (batch_size, block_size).
        ``y`` is ``x`` shifted by one position: ``y[b, i]`` is the token that
        follows ``x[b, i]`` in ``data``.

    Raises:
        ValueError: if ``data`` holds fewer than ``block_size + 1`` tokens.
    """
    if data is None:
        data = train_data

    # Every sample consumes block_size + 1 tokens (the inputs plus the last target),
    # so this is the largest start index that still yields a full-length target slice.
    last_start = len(data) - block_size - 1
    if last_start < 0:
        raise ValueError(
            f"need at least {block_size + 1} tokens to sample a block, got {len(data)}"
        )

    x, y = [], []
    for _ in range(batch_size):
        # random.randint includes BOTH endpoints, hence the explicit upper bound above.
        start = randint(0, last_start)
        x.append(data[start : start + block_size])
        y.append(data[start + 1 : start + block_size + 1])

    return torch.stack(x), torch.stack(y)

In [16]:
# Draw one batch using get_batch's default block and batch sizes.
x, y = get_batch()

In [17]:
print(x.shape, y.shape)
x, y

torch.Size([4, 8]) torch.Size([4, 8])


(tensor([[75, 78, 86,  1, 65, 84, 67, 70],
         [74,  0, 19, 16,  1, 66, 64, 84],
         [33, 67, 72, 83, 88, 64,  1, 46],
         [67, 72, 83, 71, 88, 64,  1, 46]]),
 tensor([[78, 86,  1, 65, 84, 67, 70, 68],
         [ 0, 19, 16,  1, 66, 64, 84, 82],
         [67, 72, 83, 88, 64,  1, 46, 84],
         [72, 83, 71, 88, 64,  1, 46, 64]]))

In [18]:
type(x)

torch.Tensor

In [101]:
# Vocabulary exposed by the validator, displayed together with its size.
vocab = dv.vocab
vocab, len(vocab)

('\n !"#$%&\'()*+,./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~',
 94)

In [113]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Model sizes: one row/logit per vocabulary entry; the embedding width is set equal to the vocabulary size.
vocab_size = len(vocab)
embedding_dims = len(vocab)

class BLM(nn.Module):
    """Baseline character language model: a token embedding followed by a linear layer.

    Both layers act on each position independently, so the logits produced at
    a position depend only on the token at that position.
    """

    def __init__(self, n_vocab=None, n_embd=None):
        """Create the embedding table and the output projection.

        Args:
            n_vocab: number of distinct tokens. Defaults to the notebook-level
                ``vocab_size``.
            n_embd: embedding width. Defaults to the notebook-level
                ``embedding_dims``.
        """
        super().__init__()
        n_vocab = vocab_size if n_vocab is None else n_vocab
        n_embd = embedding_dims if n_embd is None else n_embd
        self.embedding = nn.Embedding(n_vocab, n_embd)
        self.l = nn.Linear(n_embd, n_vocab)

    def forward(self, x, y=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: integer token ids of shape (batch, time).
            y: optional integer targets with the same shape as ``x``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch, time, vocab) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (batch * time, vocab) and ``loss`` is
            the mean cross-entropy over all positions.
        """
        logits = self.l(self.embedding(x))
        if y is None:
            return logits, None

        b, t, c = logits.shape
        # cross_entropy wants (N, classes) against (N,); reshape (rather than
        # view) also accepts targets that are not contiguous in memory.
        logits = logits.reshape(b * t, c)
        loss = F.cross_entropy(logits, y.reshape(b * t))
        return logits, loss

    def generate(self, start_token = " ", max_tokens=100):
        """Sample a continuation of ``start_token``.

        Args:
            start_token: prompt text; encoded with the notebook-level ``dv``.
            max_tokens: the loop appends ``max_tokens - 1`` sampled tokens to
                the encoded prompt.

        Returns:
            Tensor of token ids of shape (1, prompt_length + max_tokens - 1),
            prompt included.

        Raises:
            ValueError: if the prompt encodes to no tokens (there would be no
                last position to predict from).
        """
        prompt_ids = dv.encode(start_token)
        if len(prompt_ids) == 0:
            raise ValueError("start_token must encode to at least one token")

        # Build the running sequence on the same device as the model weights.
        device = self.embedding.weight.device
        with torch.no_grad():
            token = torch.tensor(prompt_ids, dtype=torch.long, device=device).view(1, -1)
            for _ in range(max_tokens - 1):
                logits, _loss = self(token)
                # Only the last position predicts the next token.
                probs = F.softmax(logits[:, -1, :], dim=1)
                # Sampled from the distribution (not an argmax).
                next_token = torch.multinomial(probs, num_samples=1)
                token = torch.cat((token, next_token), dim=1)
        return token

    def decode_tensor(self, generated_tensor):
        """Flatten a tensor of token ids and decode it to text with ``dv``."""
        return dv.decode(generated_tensor.cpu().detach().numpy().flatten().tolist())

In [114]:
# Instantiate the model and run one forward pass on the current batch to get logits and loss.
blm = BLM()
ops, loss = blm(x, y)
# Scratch checks kept for reference:
# ops = blm.generate("something")
# print(ops, ops.shape)
# print(ops.shape, loss)
# print(blm.decode_tensor(ops))

In [108]:
# Adam over all model parameters.
opt = torch.optim.Adam(blm.parameters(), lr = 3e-2)
# NOTE(review): loss_fn is only referenced from a commented-out line in the training loop;
# BLM.forward computes the cross-entropy loss itself when targets are passed.
loss_fn =  nn.CrossEntropyLoss()

In [45]:
# Train for 100k optimizer steps, each on a freshly sampled random batch.
for o in range(100000):
    x, y = get_batch()
    opt.zero_grad()  # clear the gradients accumulated by the previous step
    preds, loss = blm(x,y)  # forward() returns the loss when targets are passed
#     loss = loss_fn(preds, y)
    loss.backward()
    opt.step()
    
    # Report every 10k steps; the printed value is the loss of that single batch only.
    if o%10000 == 0: 
        print(loss.item())

2.159623146057129
2.515923261642456
2.3146438598632812
2.656872034072876
2.635855197906494
2.371776580810547
2.4338738918304443
2.5691001415252686
2.2669153213500977
3.009739398956299


In [46]:
# Sample from the model starting from the prompt "aaaaa" and decode the generated ids back to text.
ops = blm.generate("aaaaa", max_tokens=400)
# print(ops, ops.shape)
# print(ops.shape, loss)
print(blm.decode_tensor(ops))

nutakkioplesscthmayaldiotyo jy limyoum n: itishers Ne Sut my
Ad ad
As
So Chi: p
LEm mourshu nera my
As: ratothk: ape ware Ok: nkksthureoplt to mupi: @91302 a ?
Lagyay wano herme tie rth: w twhis, k: watith pe agas ceyantwhk: abyo
Ch: trexxpevat Nacayotlelond
Pra mac hadrotrajyand>
Adahitsusureyoma ndit Che jyer windithes owiveayshesthk: r
Chw chodanchsoum ior
Asarouply
As the stca t
Adind Brire thiguf v


In [56]:
# Embedding layer:
# MUST have vocab-size number of rows, each row holding however many values we choose -> w

(tensor([[68,  1, 65, 78, 83, 83, 78, 76],
         [14, 14,  1, 52, 71, 68, 88,  8],
         [ 0, 35, 71, 64, 67, 64, 70, 64],
         [71, 68,  1, 74, 68, 88,  1, 83]]),
 tensor([[ 1, 65, 78, 83, 83, 78, 76,  1],
         [14,  1, 52, 71, 68, 88,  8, 81],
         [35, 71, 64, 67, 64, 70, 64, 26],
         [68,  1, 74, 68, 88,  1, 83, 78]]))

In [65]:
# b, t, c = 4, 8, 2
# x = torch.randn(b,t,c)
# xbow = torch.zeros(b,t,c)


In [68]:
x

tensor([[[-0.1012,  0.3757],
         [ 0.4441, -0.2062],
         [ 1.0212, -0.3191],
         [-0.2272,  1.3350],
         [-0.4887, -0.4338],
         [ 0.6428,  0.7826],
         [-0.1964, -0.7348],
         [-1.8919,  1.6051]],

        [[-0.4052,  0.2195],
         [ 0.1336,  0.0912],
         [ 0.3155,  1.0410],
         [ 0.7862,  0.2359],
         [-1.6690,  0.8855],
         [-1.3307,  2.0962],
         [ 2.2630, -0.5606],
         [ 0.2355, -0.4915]],

        [[ 0.1991,  0.6489],
         [ 0.3494,  0.1715],
         [-0.0593,  1.8290],
         [-2.1390,  1.4783],
         [-0.4608, -0.9952],
         [-0.1357, -0.9198],
         [ 2.1055,  1.8290],
         [ 1.6549,  2.0575]],

        [[ 0.5459,  0.8105],
         [-1.4135,  0.8328],
         [ 0.9272, -1.1384],
         [-0.7375, -1.7727],
         [-0.0347, -1.5288],
         [ 2.9100,  0.3882],
         [-0.1150,  0.5159],
         [ 0.4445, -0.0747]]])

In [91]:
# Causal "bag of words": xbow[b, t] is the mean of x[b, 0..t] taken along the time axis,
# so each position only averages over itself and the positions before it.
# The output is allocated here and the sizes are read from x, so the cell does not
# depend on xbow, b and t having been created by another cell (the cell that
# allocated xbow is commented out).
xbow = torch.zeros_like(x)
for _b in range(x.shape[0]):
    for _t in range(x.shape[1]):
        xbow[_b, _t] = x[_b, :_t+1, :].mean(dim = 0)

In [231]:
# Toy sizes for the self-attention experiments, used as the three dimensions of the random input x.
# NOTE(review): rebinding `c` here replaces the `config` module alias created by
# `import config as c` in the first cell.
b, t, c = 4, 8, 12
x = torch.randn(b,t,c)
head_size = 16
# Scratch version of single-head causal attention, kept for reference
# (note that it multiplies the weights by x directly, with no value projection):
# key = nn.Linear(c, head_size)
# query = nn.Linear(c, head_size)
# value = nn.Linear(c, head_size)

# k = key(x)
# q = query(x)
# wei = q @ k.transpose(-2, -1)

# tril = torch.tril(torch.ones(t, t))
# wei = wei.masked_fill(tril == 0, float('-inf'))
# wei = F.softmax(wei, dim=-1)
# out = wei @ x 
# out 


In [235]:
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: width of the key/query/value projections and of the output.
        n_embd: channel width of the input. Defaults to the notebook-level ``c``.
        block_size: longest sequence the causal mask supports. Defaults to the
            notebook-level ``t``.
    """

    def __init__(self, head_size=16, n_embd=None, block_size=None):
        super().__init__()
        n_embd = c if n_embd is None else n_embd
        block_size = t if block_size is None else block_size

        self.head_size = head_size
        self.key = nn.Linear(n_embd, head_size)
        self.query = nn.Linear(n_embd, head_size)
        self.value = nn.Linear(n_embd, head_size)
        # Lower-triangular mask registered as a buffer so it follows the module
        # across .to(device); persistent=False keeps it out of the state_dict,
        # so checkpoints look the same as with a plain attribute.
        self.register_buffer(
            "tril", torch.tril(torch.ones(block_size, block_size)), persistent=False
        )

    def forward(self, x):
        """Attend over ``x`` so that each position sees only itself and earlier positions.

        Args:
            x: tensor of shape (batch, time, n_embd) with ``time`` no larger
                than the mask size chosen at construction.

        Returns:
            Tensor of shape (batch, time, head_size).

        Raises:
            ValueError: if ``time`` exceeds the size of the causal mask.
        """
        _b, _t, _c = x.shape
        if _t > self.tril.shape[0]:
            raise ValueError(
                f"sequence length {_t} exceeds the causal mask size {self.tril.shape[0]}"
            )

        q, k, v = self.query(x), self.key(x), self.value(x)
        # (batch, time, time) affinities, scaled by 1/sqrt(head_size).
        wei = (q @ k.transpose(1, 2)) * (self.head_size ** -0.5)
        # Slice the mask to the actual sequence length so inputs shorter than
        # the mask size work; future positions get -inf and vanish in the softmax.
        wei = wei.masked_fill(self.tril[:_t, :_t] == 0, float("-inf"))
        wei = F.softmax(wei, dim=2)
        return wei @ v

In [236]:
# Build one attention head with 16-wide projections.
h = Head(head_size = 16)

In [237]:
# Smoke test: run the head on a random (b, t, c) input.
z = torch.randn(b, t, c)
o = h(z)

In [238]:
o.shape, o[0]

(torch.Size([4, 8, 16]),
 tensor([[-0.6339, -0.0258,  1.4931, -0.8165, -0.1141,  0.4604, -0.7192,  0.2125,
          -0.1326,  0.3182,  0.0272,  0.8803, -1.1237,  0.3702, -0.5943,  0.4128],
         [ 0.0025, -0.0062,  0.8610, -0.4212, -0.3691,  0.6221, -1.3776,  0.0227,
           0.0528,  0.3995, -0.1075,  0.2744, -0.6473,  0.1693, -0.2753,  0.1278],
         [-0.2912, -0.0385,  0.4004, -0.1647, -0.2049,  0.6406, -1.1118,  0.0494,
           0.1449,  0.3221, -0.2945, -0.1351, -0.3599,  0.2600, -0.4305, -0.2352],
         [-0.7080, -0.0290,  0.9115, -0.4195, -0.1628,  0.4270, -0.3929,  0.0999,
          -0.0776,  0.4068,  0.0087,  0.2633, -0.6063,  0.2209, -0.5197,  0.0628],
         [-0.5701, -0.0876,  0.6018, -0.1959,  0.0996,  0.4171, -0.2121, -0.0321,
          -0.0836,  0.0934,  0.0426,  0.1303, -0.3935,  0.2113, -0.4089,  0.0461],
         [-0.7577, -0.1411,  0.4130, -0.1285,  0.1605,  0.4874, -0.2053, -0.0198,
           0.0730,  0.1618, -0.1182, -0.0589, -0.3654,  0.5054, -0.6