In [1]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in, so the relative paths used below (e.g. the data file) resolve.
# Presumably the parent is the project root that also holds the local modules
# imported later -- confirm against the repository layout.
#
# The target is resolved to an absolute path once and cached in a global, which
# makes this cell idempotent: a bare os.chdir("..") climbs one directory higher
# every time the cell is re-run in the same kernel.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath("..")
os.chdir(_PROJECT_ROOT)

In [2]:
import pandas as pd
import string
import torch
import re
import tiktoken
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# Load the raw names corpus and reduce it to the characters a-z plus newline.
# The path is assembled with os.path.join so the cell also runs on macOS/Linux,
# where a backslash is an ordinary filename character, not a separator.
names_path = os.path.join("data", "names.txt")

# Decode explicitly rather than with the platform default codec. Undecodable
# bytes are skipped; this cannot alter the cleaned result because every
# character outside a-z / newline is removed below anyway.
with open(names_path, encoding="utf-8", errors="ignore") as fp:
    names = fp.read()

pattern = r'[^a-z\n]'
# Digit zero is treated as a mistyped letter "o" before the filter runs,
# otherwise the filter would simply delete it.
names = names.replace('0', 'o')
# NOTE(review): uppercase letters are deleted here, not lowercased -- confirm
# the file is already lowercase or add .lower() before this step.
names = re.sub(pattern, '', names)

In [14]:
# char level tokenization
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is derived from a text.

    Every distinct character of the text given to the constructor receives an
    integer id equal to its position in the sorted list of characters.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, text):
        """Build the character <-> id lookup tables from ``text``."""
        # Sorted so that ids are deterministic for a given set of characters.
        self.chars = sorted(set(text))
        # character -> id
        self.str_to_int = dict(zip(self.chars, range(len(self.chars))))
        # id -> character
        self.int_to_str = dict(enumerate(self.chars))

    def encode(self, text):
        """Return the list of ids for the characters of ``text``.

        Raises:
            KeyError: if ``text`` contains a character outside the vocabulary.
        """
        lookup = self.str_to_int
        return [lookup[symbol] for symbol in text]

    def decode(self, ids):
        """Return the string spelled by the iterable of ids ``ids``.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        return ''.join(self.int_to_str[token_id] for token_id in ids)

In [15]:
# The vocabulary is exactly the set of characters present in the cleaned corpus.
tokenizer = CharTokenizer(text=names)

In [16]:
# Encode the whole cleaned corpus in one pass, then wrap the id list in a
# tensor so it can be sliced into training windows.
tokens = tokenizer.encode(text=names)
ids = torch.tensor(tokens)

In [11]:
# Split the corpus into individual names. Blank entries are dropped: splitting
# on "\n" produces an empty string after a trailing newline (and for any line
# the cleaning step emptied), which would otherwise be counted as a
# zero-length name and pull the average down.
name_list = [name for name in names.split("\n") if name]
# Mean number of characters per name; 0.0 for an empty corpus instead of a
# ZeroDivisionError.
avg_name_len = sum(map(len, name_list)) / len(name_list) if name_list else 0.0

In [17]:
max_length = 32
stride = max_length  # non-overlapping windows; use max_length // 2 for overlap
samples = []
# Preview of the windowing scheme: the loop stops after the first iteration,
# so `samples` holds at most one (input, target) pair. The target is the input
# shifted one position ahead in the token stream.
for i in range(0, len(ids) - max_length, stride):
    input_ids, target_ids = ids[i:i + max_length], ids[i + 1:i + max_length + 1]
    samples.append((input_ids, target_ids))
    break

In [18]:
# Show the single preview pair built above (inputs first, shifted targets second).
samples

[(tensor([ 1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,  0,  1,  1,  3,  8,  1,
          12,  0,  1,  1,  4,  5, 19,  8,  0,  1,  1,  4,  9, 12]),
  tensor([ 1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,  0,  1,  1,  3,  8,  1, 12,
           0,  1,  1,  4,  5, 19,  8,  0,  1,  1,  4,  9, 12,  0]))]

In [19]:
from data_loader import *

In [22]:
class PicoGPTDataset(Dataset):
    """Fixed-length next-token windows cut from one tokenized text.

    ``Dataset`` is brought into scope by the ``data_loader`` star import;
    presumably it is ``torch.utils.data.Dataset`` -- confirm.

    The text is encoded once with ``tokenizer.encode`` and then sliced into
    windows of ``max_length`` ids, one window every ``stride`` positions. Each
    target window is its input window shifted one position ahead. Only window
    starts below ``len(token_ids) - max_length`` are used, so every target has
    the full ``max_length`` ids.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and materialise every (input, target) window.

        Args:
            txt: the text to tokenize.
            tokenizer: object exposing ``encode(text)`` returning a sequence of ids.
            max_length: number of ids per window.
            stride: distance between the starts of consecutive windows.
        """
        self.tokenizer = tokenizer

        token_ids = tokenizer.encode(txt)
        window_starts = range(0, len(token_ids) - max_length, stride)

        # One tensor per window; targets start one position later than inputs.
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [24]:
# Materialise the full windowed dataset over the cleaned corpus, using the
# window length and stride chosen in the preview cell.
d = PicoGPTDataset(
    txt=names,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
)

In [43]:
# Decode the input window at index 1 back to text to eyeball the chunking.
tokenizer.decode(d.input_ids[1].tolist())

'\naadish\naaditya\naaenab\naafreen\na'

In [44]:
# The matching target window: the same stretch of text shifted one character ahead.
tokenizer.decode(d.target_ids[1].tolist())

'aadish\naaditya\naaenab\naafreen\naa'

In [46]:
# Sanity check: an input window should contain max_length ids.
len(d.input_ids[0])

32

In [48]:
# Settings dictionary handed to PicoGPTModel. The exact meaning of each key is
# defined by that class, which is not shown in this notebook; the notes below
# are assumptions to confirm against it.
config = dict(
    vocab_size=27,       # presumably a-z plus newline; TODO confirm == len(tokenizer.chars)
    context_length=32,   # same value as max_length used for the dataset windows
    emb_dim=768,         # presumably the embedding width
    n_heads=12,          # presumably attention heads per layer
    n_layers=12,         # presumably number of transformer blocks
    drop_rate=0.1,       # presumably the dropout probability
    qkv_bias=False,      # presumably toggles bias on the query/key/value projections
)

In [50]:
from picogpt import *
# Seed torch's global RNG before building the model so that any random
# initialisation performed inside PicoGPTModel is repeatable across runs.
# PicoGPTModel comes from the picogpt star import and is not shown here.
torch.manual_seed(123)
model = PicoGPTModel(config)

In [66]:
# Prompt the generation will be seeded with.
start_context = "ro"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
# Nest the id list once so the tensor gets a leading dimension of size 1.
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [18, 15]
encoded_tensor.shape: torch.Size([1, 2])


In [56]:
from utils import *

In [70]:
# Put the model in evaluation mode before generating. Assuming `model` is a
# torch.nn.Module, this turns off training-only behaviour such as dropout --
# confirm against PicoGPTModel.
model.eval()
# simple_text_generate comes from the utils star import and is not shown here.
# Judging by the printed result below, it returns the prompt ids followed by
# max_new_tokens generated ids (2 + 30 = 32).
# NOTE(review): no training step appears in the cells shown, so the generated
# ids are expected to look random.
out = simple_text_generate(
    model=model,
    idx=encoded_tensor, 
    max_new_tokens=30, 
    context_size=config["context_length"]
 )
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[18, 15, 17, 20,  0, 22, 12, 16,  0,  0,  0, 22, 22,  0, 22, 22, 12, 16,
          5,  1, 21,  2,  4, 10, 21,  4, 13,  7, 20,  1, 22, 18]])
Output length: 32


In [71]:
# Drop the leading size-1 dimension of `out`, convert the ids to a plain list
# and map them back to characters. Newline ids show up as line breaks in the
# printed text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

roqt
vlp


vv
vvlpeaubdjudmgtavr
