In [1]:
import sys
import tiktoken
import PyPDF2

import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
class utilities:
    """Helper mixin: GPT-2 tokenizer setup, stdout printing and PDF text extraction."""

    def __init__(self) -> None:
        # Tokenizer made available to every helper through self.enc.
        self.enc = self.initialize_encoder()

    def initialize_encoder(self):
        """Return the tiktoken encoding registered under the name 'gpt2'."""
        return tiktoken.get_encoding('gpt2')

    def print(self, string: str, new_line=True):
        """Write ``string`` to ``sys.stdout``.

        Args:
            string: Text to write.
            new_line: When truthy (default) a trailing "\\n" is appended.
        """
        if new_line:
            string = string + "\n"
        sys.stdout.write(string)

    def print_filler(self, myString: str, filler_char='#'):
        """Print a separator line as long as ``myString``.

        Args:
            myString: Reference text; only its length is used.
            filler_char: String repeated once per character of ``myString``.
        """
        # String repetition replaces the manual append-in-a-loop + join.
        self.print(filler_char * len(myString))

    def read_pdf(self, path):
        """Extract the text of every page of the PDF at ``path``.

        Args:
            path: Filesystem path of the PDF; opened in binary mode.

        Returns:
            The page texts concatenated in page order with every "\\n"
            removed (not replaced by a space), so text separated only by a
            line break ends up joined.
        """
        with open(path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Collect per-page text and join once instead of growing a string
            # with += on every page. `or ''` keeps the join safe should a
            # page yield no text (None/empty).
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

        return ''.join(page_texts).replace("\n", "")

In [3]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding lookup table.

    Each token id selects one row of a (vocab_size, vocab_size) table; that
    row is returned as the logits for the following token.
    """

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size: Number of rows and columns of the embedding table.
                Every token id passed to ``forward`` must be < vocab_size.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Look up the logits for each token id in ``idx``.

        Args:
            idx: Integer tensor of token ids, e.g. shape (B, T).
            target: Currently unused; optional so the model can be called
                with ``idx`` alone.

        Returns:
            Tensor of shape ``idx.shape + (vocab_size,)``, i.e. (B, T, C)
            for a (B, T) input.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        return logits

In [6]:
class customGPT_trainer(BigramLanguageModel, utilities):
    """Bigram model bundled with its data pipeline: PDF text -> GPT-2 token ids -> batches."""

    def __init__(self, path) -> None:
        '''
        Load and tokenize the document at ``path``, then build the bigram table.

        METADATA (attribute -> method that sets it)
        self.enc -> self.__init__()
        self.original_data -> self.load_data()
        self.data -> self.load_data()
        self.vocab_size -> self.load_data()
        self.train_data / self.val_data -> self.split_data_train_val()
        '''
        # The data is loaded before the nn.Module initialisation because the
        # embedding table is sized from self.vocab_size. Only plain attributes
        # (no Parameters or sub-modules) are assigned before super().__init__().
        self.enc = self.initialize_encoder()
        self.load_data(path)
        super().__init__(vocab_size=self.vocab_size)

    def encode_data(self, data):
        """Encode the string ``data`` into a list of token ids using ``self.enc``."""
        return self.enc.encode(data)

    def decode_data(self, encoded_data):
        """Decode token ids back into text using ``self.enc``.

        Args:
            encoded_data: Sequence of token ids; a tensor is also accepted
                and converted with ``tolist()`` first.
        """
        if isinstance(encoded_data, torch.Tensor):
            encoded_data = encoded_data.tolist()
        return self.enc.decode(encoded_data)

    def load_data(self, path: str):
        """Read the document, keep its raw text and store its token ids.

        Sets ``self.original_data`` (raw text), ``self.vocab_size`` and
        ``self.data`` (1-D ``torch.long`` tensor of token ids).

        Args:
            path: Path of the input file. Only paths containing '.pdf' are
                read; anything else yields empty data.
        """
        if '.pdf' in path:
            data = self.read_pdf(path)
        else:
            data = ""

        # Load Original Data
        self.original_data = data

        # Size of the token embedding table.
        # The ids stored in self.data come from the tokenizer, so the table
        # has to cover the tokenizer's id range. Counting the distinct
        # characters of the text (as was done before) gives a much smaller
        # number and makes the embedding lookup fail with an out-of-range index.
        # NOTE(review): the bigram table is vocab_size x vocab_size, so it
        # grows quadratically with the tokenizer vocabulary - confirm the
        # memory footprint is acceptable.
        self.vocab_size = self.enc.n_vocab

        # Encode data
        self.data = torch.tensor(self.encode_data(data), dtype=torch.long)

    def split_data_train_val(self, thresh=0.9):
        """Split ``self.data`` into ``self.train_data`` and ``self.val_data``.

        Args:
            thresh: Fraction of tokens (from the start) used for training;
                the remainder becomes the validation split.
        """
        n = int(thresh * len(self.data))
        self.train_data = self.data[:n]
        self.val_data = self.data[n:]

    def generate_batches(self, split: str, batch_size: int = 4, block_size: int = 8):
        """Sample a random batch of (input, target) windows.

        Requires ``split_data_train_val`` to have been called.

        Args:
            split: 'train' selects the training split; any other value
                selects the validation split.
            batch_size: Number of windows in the batch.
            block_size: Length of each window.

        Returns:
            ``(x, y)``: two (batch_size, block_size) tensors where ``y`` is
            ``x`` shifted one token to the right.

        Raises:
            ValueError: If the split has fewer than ``block_size + 1`` tokens,
                so no complete window plus next-token target fits.
        """
        data = self.train_data if split == 'train' else self.val_data
        max_start = len(data) - block_size  # exclusive upper bound for window starts
        if max_start <= 0:
            raise ValueError(
                f"split '{split}' has {len(data)} tokens; "
                f"need at least {block_size + 1} for block_size={block_size}"
            )
        ix = torch.randint(max_start, (batch_size,))
        x = torch.stack([data[i:i + block_size] for i in ix])
        y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])

        return x, y

    def forward(self, idx, targets=None):
        """Return the next-token logits for the token ids in ``idx``.

        Args:
            idx: Integer tensor of token ids, e.g. shape (B, T).
            targets: Currently unused; optional so the model can be called
                with ``idx`` alone.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        return logits

In [7]:
# Build the trainer from the sample PDF.
myGPT = customGPT_trainer('data/CAS.pdf')

# The first report line is built once and reused to size the separator.
original_line = "Original String: " + str(myGPT.original_data)
myGPT.print(original_line)
myGPT.print_filler(original_line, filler_char='*')

# Token ids as stored on the trainer, then the round trip back to text.
myGPT.print("Encoded String: " + str(myGPT.data))
token_ids = myGPT.data.tolist()
myGPT.print("Decoded String: " + str(myGPT.enc.decode(token_ids)))

Original String:  Program Support Center DEPARTMENT OF HEALTH & HUMAN SERVICES Financial Management Portfolio  Cost Allocation Services 101 9th Street, Suite 4-600 San Francisco, CA 94103-6705 PHONE: (516) 548-8931 EMAIL: CAS-SF@psc.hhs.gov   Memorandum DATE: March 23, 2023 TO: Mary Mitchell, Chief Program Support Center, Debt Collection Center SUBJECT: Account Receivable Based on CAS’ Review of the State of California Pension Refund Proposal ORGANIZATION: State of California  415 L Street, 10th Floor  Sacramento, CA 95814  EIN:52-0395286 I. The following document related to the above review is attached: CAS determination letter dated March 23, 2023  II. Recovery of the disallowance will be accomplished via: Cash $3,996,109.58  Total Disallowance $3,996,109.58  III. Appeals: The grantee does not plan to appeal. If you have any questions, please contact our office at (516) 437-8931. Sincerely, John Doe, Director  Cost Allocation Services Attachment 
*************************************

In [None]:
# thresh=1 places every token in the training split (the validation split is empty).
myGPT.split_data_train_val(thresh=1)

# Draw one random batch: 8 windows of 16 tokens, with targets shifted by one token.
xb, yb = myGPT.generate_batches(
    split='train',
    batch_size=8,
    block_size=16,
)

In [None]:
# EXPLANATION BLOCK
# Shows how one sampled window yields several training examples: for every
# position t the model sees tokens [0..t] of a row and must predict yb[b, t].
# Only the first 4 rows / 8 positions of the batch are displayed.
batch_size = 4
block_size = 8

for b in range(batch_size):     # Batch Dimension
    for t in range(block_size): # Time Dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        # Printed inside the time loop so every (context -> target) pair is
        # shown, not just the last one of each row.
        print(f"when input is \"{myGPT.enc.decode(context.tolist())}\" the target is {myGPT.enc.decode([target.tolist()])}")