In [1]:
import sys
import tiktoken
import PyPDF2

import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import trange

In [2]:
# Hyperparameters
# Notebook-level tuning constants. Only some of them are read by the cells shown here.
batch_size = 4          # sequences per batch; the trainer methods below take their own batch_size arguments
block_size = 8          # context length; sizes the position-embedding table in BigramLanguageModel
max_iters = 3000        # number of optimisation steps passed to train_model
eval_interval = 300     # not referenced in the cells shown here
learning_rate = 1e-2    # not referenced in the cells shown here (the trainer sets its own optimizer learning rate)
eval_iters = 200        # not referenced in the cells shown here
n_embd = 32             # embedding width handed to the model as n_embed

In [3]:
class utilities:
    """Small console-output and PDF-reading helpers used as a mixin by the trainer."""

    def __init__(self) -> None:
        pass

    def print(self, string:str, new_line=True):
        """Write ``string`` to ``sys.stdout``.

        Args:
            string: text to write.
            new_line: when truthy, a trailing ``"\\n"`` is appended first.
        """
        if new_line:
            string = string + "\n"
        sys.stdout.write(string)

    def print_filler(self, myString:str, filler_char='#'):
        """Print a separator line as long as ``myString``.

        Args:
            myString: string whose length determines how many fillers are printed.
            filler_char: text repeated once per character of ``myString``.
        """
        # String repetition replaces the manual append-then-join loop.
        self.print(filler_char * len(myString))

    def read_pdf(self, path):
        """Return the text of every page of the PDF at ``path`` as one string.

        Page texts are concatenated in page order and all newline characters
        are removed from the result.

        Args:
            path: filesystem path of the PDF, opened in binary mode.

        Returns:
            The concatenated page text with ``"\\n"`` stripped out.
        """
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Extract while the file is still open, then join once instead of
            # repeatedly growing a string inside the loop.
            page_texts = [page.extract_text() for page in pdf_reader.pages]

        return "".join(page_texts).replace("\n", "")

In [4]:
class BigramLanguageModel(nn.Module):
    """Token-plus-position embedding model with a linear language-model head.

    Args:
        vocab_size: number of distinct token ids (rows of the token table and
            width of the output logits).
        n_embed: embedding width shared by the token and position tables.
        max_context: number of rows in the position-embedding table, i.e. the
            longest sequence ``forward`` can embed. When omitted, the
            notebook-level ``block_size`` is used.
    """

    def __init__(self, vocab_size, n_embed, max_context=None):
        super().__init__()
        # Remember the table size so generate() can crop its context to it.
        self.max_context = block_size if max_context is None else max_context
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(self.max_context, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids, with T no larger than ``max_context``.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is ``None``. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)                   # (B, T, C)
        # Build position ids on the same device as the input so the lookup
        # also works when the model lives on an accelerator.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)          # (T, C)
        x = tok_emb + pos_emb                                       # broadcast over B
        logits = self.lm_head(x)                                    # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits against (N,) class ids.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        loss = F.cross_entropy(logits, targets.view(B*T))
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens and append them to ``idx``.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to sample for each row.

        Returns:
            A (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            # The position table only has max_context rows, so feed the model
            # at most that many trailing tokens; otherwise the position lookup
            # goes out of range once the sequence grows past the table.
            idx_cond = idx[:, -self.max_context:]
            logits, _ = self(idx_cond)                          # Get prediction
            logits = logits[:, -1, :]                           # Focus only on the last time step
            probs = F.softmax(logits, dim=-1)                   # Apply softmax to get probabilities
            idx_next = torch.multinomial(probs, num_samples=1)  # Sample from distribution
            idx = torch.cat((idx, idx_next), dim=1)             # Append sampled index to running sequence

        return idx


In [5]:
class customGPT_trainer(BigramLanguageModel, utilities):
    """Character-level trainer built on top of ``BigramLanguageModel``.

    Loads text from a PDF, builds a character vocabulary, encodes the text
    and owns the optimizer and training loop for the inherited model.

    Attributes:
        original_data: raw text returned by ``load_data``.
        chars: sorted list of the distinct characters in ``original_data``.
        vocab_size: ``len(chars)``.
        stoi / itos: character -> id and id -> character lookup tables.
        data: 1-D ``torch.long`` tensor holding the encoded text.
        optimizer: ``torch.optim.AdamW`` over the model parameters.
        train_data / val_data: set only by ``split_data_train_val``.
    """

    def __init__(self, path, lr=1e-3) -> None:
        """Load ``path``, build the vocabulary and create model and optimizer.

        Args:
            path: file to load; see ``load_data`` for what is accepted.
            lr: AdamW learning rate (previously hard-coded to this default).
        """
        # The vocabulary size must be known before the layers can be built,
        # so the data attributes are assigned ahead of the model constructor.
        self.load_data(path)
        self.stoi = {ch:i for i, ch in enumerate(self.chars)}
        self.itos = {i:ch for i, ch in enumerate(self.chars)}
        self.data = torch.tensor(self.encode(self.original_data), dtype=torch.long)

        super().__init__(vocab_size=self.vocab_size, n_embed=n_embd)
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=lr)

    def encode(self, s):
        """Map each character of ``s`` to its integer id (KeyError if unknown)."""
        return [self.stoi[c] for c in s]

    def decode(self, l):
        """Map an iterable of integer ids back to the string they spell."""
        return ''.join([self.itos[i] for i in l])

    def load_data(self, path:str):
        """Read the text at ``path`` and derive the character vocabulary.

        Only PDF files are read. Any other path yields empty text, and with
        it an empty vocabulary.

        Args:
            path: file path; treated as a PDF when it ends in ``.pdf``
                (case-insensitive).
        """
        # Suffix test instead of a substring test: "X.PDF" is now recognised
        # and a ".pdf" appearing in a directory name no longer triggers it.
        if path.lower().endswith('.pdf'):
            data = self.read_pdf(path)
        else:
            data = ""

        self.original_data = data
        self.chars = sorted(set(self.original_data))
        self.vocab_size = len(self.chars)

    def split_data_train_val(self, thresh=0.9):
        """Split ``self.data`` into ``train_data`` and ``val_data``.

        Args:
            thresh: fraction of tokens (from the start) assigned to training;
                the remainder becomes validation data.
        """
        n = int(thresh*len(self.data))
        self.train_data = self.data[:n]
        self.val_data = self.data[n:]

    def generate_batches(self, split:str, batch_size:int=4, block_size:int=8):
        """Sample a random batch of input/target windows from one split.

        Args:
            split: ``'train'`` selects ``train_data``; anything else selects
                ``val_data``.
            batch_size: number of windows to sample.
            block_size: length of each window.

        Returns:
            ``(x, y)``, both (batch_size, block_size); ``y`` is ``x`` shifted
            one position to the right in the source data.

        Raises:
            ValueError: if the split holds ``block_size`` tokens or fewer, so
                no full window plus its shifted target fits.
        """
        data = self.train_data if split == 'train' else self.val_data
        n_starts = len(data) - block_size
        if n_starts <= 0:
            # torch.randint would otherwise fail with an unhelpful range error.
            raise ValueError(
                "split %r has %d tokens; need more than block_size=%d"
                % (split, len(data), block_size)
            )
        ix = torch.randint(n_starts, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])

        return x, y

    def train_model(self, batch_size=32, n_steps=100):
        """Run ``n_steps`` optimizer steps on random training batches.

        ``split_data_train_val`` must have been called first, because batches
        are drawn from ``train_data``.

        Args:
            batch_size: windows per step. It is now forwarded to
                ``generate_batches``; previously it was ignored and every
                step used that method's default of 4.
            n_steps: number of optimisation steps.
        """
        t = trange(n_steps, desc='loss', leave=True)
        for _ in t:
            xb, yb = self.generate_batches('train', batch_size=batch_size)
            _, loss = self(xb, yb)
            self.optimizer.zero_grad(set_to_none=True)
            loss.backward()
            self.optimizer.step()
            # set_description refreshes the bar itself.
            t.set_description("Current loss is %.2f" % loss.item())

In [6]:
# Build the trainer from the sample PDF: this reads the file, derives the
# character vocabulary and encodes the whole text into myGPT.data.
myGPT = customGPT_trainer('data/CAS.pdf')

# Show the raw text, a filler line, the encoded tensor and the decoded round trip.
# NOTE(review): this prints the entire document text; check the saved output
# for personal data before sharing the notebook.
myGPT.print("Original String: " + str(myGPT.original_data))
# print_filler emits one '*' per character of the string it is given.
myGPT.print_filler("Original String: "+ str(myGPT.original_data), filler_char='*')
myGPT.print("Encoded String: " + str(myGPT.data))
myGPT.print("Decoded String: " + str(myGPT.decode(myGPT.data.tolist())))

Original String:  Program Support Center DEPARTMENT OF HEALTH & HUMAN SERVICES Financial Management Portfolio  Cost Allocation Services 101 9th Street, Suite 4-600 San Francisco, CA 94103-6705 PHONE: (516) 548-8931 EMAIL: CAS-SF@psc.hhs.gov   Memorandum DATE: March 23, 2023 TO: Mary Mitchell, Chief Program Support Center, Debt Collection Center SUBJECT: Account Receivable Based on CAS’ Review of the State of California Pension Refund Proposal ORGANIZATION: State of California  415 L Street, 10th Floor  Sacramento, CA 95814  EIN:52-0395286 I. The following document related to the above review is attached: CAS determination letter dated March 23, 2023  II. Recovery of the disallowance will be accomplished via: Cash $3,996,109.58  Total Disallowance $3,996,109.58  III. Appeals: The grantee does not plan to appeal. If you have any questions, please contact our office at (516) 437-8931. Sincerely, John Doe, Director  Cost Allocation Services Attachment 
*************************************

In [7]:
# thresh=1 assigns every token to train_data, which leaves val_data empty.
myGPT.split_data_train_val(thresh=1)
# Draw one random batch: xb holds the input windows, yb the same windows shifted by one token.
xb, yb = myGPT.generate_batches(split='train', batch_size=4, block_size=8)

In [8]:
# EXPLANATION BLOCK
# batch_size = 4
# block_size = 8
# print(xb.shape, yb.shape)
# for b in range(batch_size):     # Batch Dimension
#     for t in range(block_size): # Time Dimension
#         context = xb[b, :t+1]
#         target = yb[b, t]
#         print(f"when input is \"{myGPT.decode(context.tolist())}\" the target is {myGPT.decode([target.item()])}")

In [9]:
# One forward pass on the sampled batch. Because targets are supplied, the
# returned logits are flattened to (B*T, vocab_size) and a scalar loss is computed.
out, loss = myGPT.forward(xb, yb)
print("Out:\t" + str(out.shape))
print(loss)
print(out)

Out:	torch.Size([32, 64])
tensor(4.3720, grad_fn=<NllLossBackward0>)
tensor([[-0.6710, -0.0761, -0.3269,  ..., -0.3701, -0.5107,  0.5865],
        [-0.2375, -0.7223,  0.3216,  ...,  0.4352, -0.4459,  0.4658],
        [-0.1131,  0.6481, -0.1300,  ...,  1.2749,  0.4639, -0.0641],
        ...,
        [-1.7169, -0.3865,  0.8264,  ..., -1.2834, -1.7166, -0.5672],
        [-0.8841, -0.5472, -0.0643,  ...,  0.6235, -1.4900, -0.9101],
        [ 0.0659, -1.1762, -0.0435,  ..., -0.1965, -0.2361,  0.3353]],
       grad_fn=<ViewBackward0>)


In [10]:
# Sample from the model before any training step has run.
# Each of the 6 samples starts from a single token with id 0 and appends 8 sampled tokens.
idx = torch.zeros((1, 1), dtype=torch.long)
for i in range(6):
    print(myGPT.decode(myGPT.generate(idx, max_new_tokens=8)[0].tolist()))

 qpOC3qre
 Gi,EZGe3
 qVca@tG5
 ZmE)8faF
 de1efSLf
 bI4a(ILe


In [11]:
# Run max_iters optimisation steps; the progress bar shows the loss of the most recent batch.
myGPT.train_model(n_steps=max_iters)

Current loss is 2.25: 100%|██████████| 3000/3000 [00:11<00:00, 253.75it/s]


In [12]:
# Repeat the sampling after training, for comparison with the pre-training samples.
# Each of the 6 samples starts from a single token with id 0 and appends 8 sampled tokens.
idx = torch.zeros((1, 1), dtype=torch.long)
for i in range(6):
    print(myGPT.decode(myGPT.generate(idx, max_new_tokens=8)[0].tolist()))

 M RTEApl
 MECocame
 talovitt
 w raI. 5
 or 4 Sup
 d $3iowa


In [13]:
# Causal averaging demo: every time step becomes the mean of itself and all
# earlier time steps, expressed as a single matrix product.
# Note: this rebinds the notebook-level names B, T, C and x.
B, T, C = 4, 8, 64
x = torch.randn(B, T, C)

# Lower-triangular ones: row t marks the positions (<= t) it may look at.
tril = torch.ones(T, T).tril()

# Zero scores with -inf above the diagonal; softmax then turns each row into
# uniform weights over the allowed positions.
wei = torch.zeros(T, T).masked_fill(tril == 0, float("-inf")).softmax(dim=-1)

# (T, T) @ (B, T, C) broadcasts over the batch dimension -> (B, T, C).
xbow = torch.matmul(wei, x)