In [1]:
import torch
from torch.utils.data import DataLoader


# move necessary helper functions

In [2]:
from update_utilities import update_utilities_class

In [3]:
import os
# Refresh each helper module through the project's update utility (per the
# cell output, an existing copy is removed and the file is copied again).
for helper_file in (
    "Transformer.py",
    "custom_text_dataset.py",
    "loss_functions.py",
    "train_test_loop.py",
):
    update_utilities_class(file_name=helper_file, current_path=os.getcwd()).run()

File already exist in destination folder, it is now removed
File copied, now the file is available to import from the destinated path
File already exist in destination folder, it is now removed
File copied, now the file is available to import from the destinated path
File already exist in destination folder, it is now removed
File copied, now the file is available to import from the destinated path
File already exist in destination folder, it is now removed
File copied, now the file is available to import from the destinated path


# prepare data pipeline

In [2]:
from general_functions import HelperFunctionsClass
# Project helper object; the cells below use it to load the token files.
h = HelperFunctionsClass()

In [3]:
# Load the training split from disk.
# NOTE(review): presumably returns a 1-D torch tensor of integer token ids
# (convert_to_torch=True) — confirm in general_functions.
train_data = h.convert_str_file_to_int_array(file_path="training_data/train_tokens.txt",convert_to_torch=True)

In [4]:
# Load the validation split with the same helper and options as the training split.
val_data = h.convert_str_file_to_int_array(file_path="training_data/val_tokens.txt",convert_to_torch=True)

In [5]:
# Sanity check: lengths of the two loaded arrays (train, val).
len(train_data), len(val_data)

(5101623, 561744)

In [6]:
from custom_text_dataset import slideTokenizedTextDataset

In [7]:
# Context length used for every sample window.
block_size = 512
# Wrap each split in the project's sliding-window dataset. Per the lengths
# displayed below, each dataset has len(data) - block_size items.
train_dataset = slideTokenizedTextDataset(full_txt=train_data, block_size=block_size)
val_dataset = slideTokenizedTextDataset(full_txt=val_data, block_size=block_size)

In [8]:
# Sanity check: number of items exposed by each dataset (train, val).
len(train_dataset), len(val_dataset)

(5101111, 561232)

In [9]:
# One "epoch" is a random subset of the training windows, drawn without
# replacement, rather than a full pass over the dataset.
batch_size = 64
train_num_samples = 800000
train_sampler = torch.utils.data.RandomSampler(
    data_source=train_dataset,
    replacement=False,
    num_samples=train_num_samples,
)
# drop_last=True keeps every batch at exactly `batch_size` items.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
    drop_last=True,
)

In [10]:
# Validation also uses a random subset, drawn without replacement, so each
# validation pass stays bounded in cost.
val_num_samples = 200000
val_sampler = torch.utils.data.RandomSampler(
    data_source=val_dataset,
    replacement=False,
    num_samples=val_num_samples,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
    drop_last=True,
)

In [11]:
# Number of batches per pass: num_samples // batch_size, because drop_last=True.
len(train_loader), len(val_loader)

(12500, 3125)

# Model

In [12]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [13]:
import Transformer

In [14]:
vocab_size = 2003 # 2000 vocab size + 3 special tokens
# Pass the variables instead of repeating the literals, so the model cannot
# silently drift from the declared vocabulary size or from the `block_size`
# the datasets were built with (both are the same values as before: 2003 / 512).
transformer = Transformer.TransformerClass(vocab_size=vocab_size,emb_dim=512,num_heads=8,n_layer=8,block_size=block_size, ff_multiplier=4,
                                           dropout_rate_attention=0.1, dropout_rate_ff=0.2, dropout_rate_pos_enc=0).to(device)

In [15]:
# Add up the element count of every parameter tensor and report it in millions.
num_para = sum(map(torch.numel, transformer.parameters())) / 1e6
print(f"Total number of Parameters {round(num_para,2)} million")

Total number of Parameters 27.26 million


In [17]:
# Resume from the best checkpoint of an earlier run.
# - Forward slashes work on Windows and POSIX alike; the previous backslash
#   path only resolved on Windows.
# - map_location remaps tensors saved on a GPU onto the current `device`, so
#   the checkpoint also loads on a CPU-only machine.
transformer.load_state_dict(
    torch.load(
        "baseline_vocb2000_experiments/test_3.09 stats/test weights/test_best.pth",
        map_location=device,
    )
)

<All keys matched successfully>

# Training stage 1
**Rapid training with a relatively high learning rate, more frequent validation checkpoints, and a relatively small validation set**

In [18]:
from train_test_loop import train_test_loop_class

In [17]:
# Stage-1 optimizer: AdamW over all model parameters at the higher learning rate.
lr = 1e-4
optimizer = torch.optim.AdamW(transformer.parameters(), lr=lr)

In [18]:
# Stage-1 run settings.
overwrite = True
num_epochs = 1
print_every = 855 * 2

# Build the project's training/validation driver for this run (no test loader).
train_loop = train_test_loop_class(
    model=transformer,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=None,
    epochs=num_epochs,
    print_every_n_batch=print_every,
    device=device,
    model_name="test_morelayer",
    optimizer=optimizer,
    calculate_accuracy=False,
    overwrite_message=overwrite,
    problem_type="Multiclass Classification",
    update_loss_fn=False,
    print_result=True,
    print_full=False,
)

In [22]:
# Flip the overwrite flag that was passed as True to the constructor.
# NOTE(review): its exact effect lives in train_test_loop — presumably it stops
# the next run from overwriting earlier saved results; confirm.
train_loop.overwrite_message = False

In [25]:
# Rebuild both loaders with smaller random subsets for the quick stage-1 run.
batch_size = 64

train_num_samples = 700000
train_sampler = torch.utils.data.RandomSampler(
    data_source=train_dataset,
    replacement=False,
    num_samples=train_num_samples,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
    drop_last=True,
)

val_num_samples = 116000
val_sampler = torch.utils.data.RandomSampler(
    data_source=val_dataset,
    replacement=False,
    num_samples=val_num_samples,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
    drop_last=True,
)

# Batches per pass (train, val).
len(train_loader), len(val_loader)

(10937, 1812)

In [26]:
# Point the existing loop object at the rebuilt loaders instead of re-creating it.
train_loop.train_loader = train_loader
train_loop.val_loader = val_loader
# NOTE(review): the constructor argument is `print_every_n_batch`; confirm that
# `print_progress` is the attribute the loop actually reads. The log below does
# report every 1000 batches.
train_loop.print_progress = 1000

In [27]:
# Run stage-1 training; the log below reports running train/validation losses.
train_loop.train()

  0%|          | 0/10937 [00:00<?, ?it/s]

Batch: 1000 / 10937 || Average per-Batch Training Loss: 3.9907 || Average per-Batch Validation Loss: 3.9026
Batch: 2000 / 10937 || Average per-Batch Training Loss: 3.9360 || Average per-Batch Validation Loss: 3.8435
Batch: 3000 / 10937 || Average per-Batch Training Loss: 3.8818 || Average per-Batch Validation Loss: 3.7849
Batch: 4000 / 10937 || Average per-Batch Training Loss: 3.8277 || Average per-Batch Validation Loss: 3.7312
Batch: 5000 / 10937 || Average per-Batch Training Loss: 3.7759 || Average per-Batch Validation Loss: 3.6801
Batch: 6000 / 10937 || Average per-Batch Training Loss: 3.7260 || Average per-Batch Validation Loss: 3.6347
Batch: 7000 / 10937 || Average per-Batch Training Loss: 3.6819 || Average per-Batch Validation Loss: 3.5933
Batch: 8000 / 10937 || Average per-Batch Training Loss: 3.6401 || Average per-Batch Validation Loss: 3.5548
Batch: 9000 / 10937 || Average per-Batch Training Loss: 3.6003 || Average per-Batch Validation Loss: 3.5222
Batch: 10000 / 10937 || Aver

# Training Stage 2
**Slow fine-tuning with a relatively low learning rate, less frequent validation checkpoints, and a relatively large validation set**

In [19]:
# Stage-2 optimizer: a fresh AdamW instance at a 10x lower learning rate than stage 1.
lr = 1e-5
optimizer = torch.optim.AdamW(transformer.parameters(), lr=lr)

In [20]:
# Stage-2 run settings.
overwrite = True
num_epochs = 1
print_every = 1250

# Build a new training/validation driver for the fine-tuning run (no test loader).
train_loop = train_test_loop_class(
    model=transformer,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=None,
    epochs=num_epochs,
    print_every_n_batch=print_every,
    device=device,
    model_name="baseline_vocab2000",
    optimizer=optimizer,
    calculate_accuracy=False,
    overwrite_message=overwrite,
    problem_type="Multiclass Classification",
    update_loss_fn=False,
    print_result=True,
    print_full=False,
)

In [21]:
# Run stage-2 fine-tuning. In the log below the training loss keeps falling
# while the validation loss creeps upward.
train_loop.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

Batch: 1250 / 12500 || Average per-Batch Training Loss: 2.6905 || Average per-Batch Validation Loss: 3.0726
Batch: 2500 / 12500 || Average per-Batch Training Loss: 2.6707 || Average per-Batch Validation Loss: 3.0727
Batch: 3750 / 12500 || Average per-Batch Training Loss: 2.6572 || Average per-Batch Validation Loss: 3.0725
Batch: 5000 / 12500 || Average per-Batch Training Loss: 2.6441 || Average per-Batch Validation Loss: 3.0733
Batch: 6250 / 12500 || Average per-Batch Training Loss: 2.6330 || Average per-Batch Validation Loss: 3.0777
Batch: 7500 / 12500 || Average per-Batch Training Loss: 2.6202 || Average per-Batch Validation Loss: 3.0791
Batch: 8750 / 12500 || Average per-Batch Training Loss: 2.6109 || Average per-Batch Validation Loss: 3.0791
Batch: 10000 / 12500 || Average per-Batch Training Loss: 2.5992 || Average per-Batch Validation Loss: 3.0818
Batch: 11250 / 12500 || Average per-Batch Training Loss: 2.5898 || Average per-Batch Validation Loss: 3.0845
Batch: 12500 / 12500 || Av

In [30]:
# Flip the overwrite flag before running the loop again.
# NOTE(review): presumably this keeps the next run from overwriting the
# previous run's saved results — confirm in train_test_loop.
train_loop.overwrite_message = False

In [32]:
# Put the model back on `device` before training again; constructing the
# `generator` class calls `model.cpu()`, which moves the module in place.
train_loop.model = transformer.to(device)
# Another pass with the same loop object and settings.
train_loop.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

Batch: 1250 / 12500 || Average per-Batch Training Loss: 2.5695 || Average per-Batch Validation Loss: 3.0878
Batch: 2500 / 12500 || Average per-Batch Training Loss: 2.5584 || Average per-Batch Validation Loss: 3.0924
Batch: 3750 / 12500 || Average per-Batch Training Loss: 2.5479 || Average per-Batch Validation Loss: 3.0942
Batch: 5000 / 12500 || Average per-Batch Training Loss: 2.5387 || Average per-Batch Validation Loss: 3.0965
Batch: 6250 / 12500 || Average per-Batch Training Loss: 2.5298 || Average per-Batch Validation Loss: 3.0997
Batch: 7500 / 12500 || Average per-Batch Training Loss: 2.5203 || Average per-Batch Validation Loss: 3.1044
Batch: 8750 / 12500 || Average per-Batch Training Loss: 2.5111 || Average per-Batch Validation Loss: 3.1059
Batch: 10000 / 12500 || Average per-Batch Training Loss: 2.5021 || Average per-Batch Validation Loss: 3.1082
Batch: 11250 / 12500 || Average per-Batch Training Loss: 2.4919 || Average per-Batch Validation Loss: 3.1106
Batch: 12500 / 12500 || Av

# Generator

In [33]:
import os
import torch
class generator:
    """Sample text from a trained language model, echoing it to stdout and a log file.

    Generated text is appended to ``generated_text/<model_name>.txt``.

    Args:
        model: a ``torch.nn.Module`` language model. It is moved to the CPU *in
            place*, so the caller must move it back to the GPU before training again.
        encoder: callable mapping a string to a list of integer token ids.
        decoder: callable mapping a list of token ids back to a string.
        model_name: base name (without extension) of the log file.
    """

    def __init__(self, model, encoder, decoder, model_name):
        self.model = model.cpu()
        self.encoder = encoder
        self.decoder = decoder
        self.model_name = model_name
        self.folder_name = "generated_text"

    def _open_file(self):
        """Open the log file in append mode as ``self.f``, creating the folder if needed."""
        os.makedirs(self.folder_name, exist_ok=True)
        # Explicit UTF-8 so decoded text with non-ASCII characters cannot fail
        # on platforms whose default locale encoding is narrower.
        self.f = open(os.path.join(self.folder_name, self.model_name) + ".txt", "a", encoding="utf-8")

    def _emit(self, text):
        """Print ``text`` and append it as one line to the open log file."""
        print(text)
        self.f.write(text + "\n")

    def generate_without_prompt(self, user_input, generation_length=300, block_size=512,default_start_token=True,pad_with=32):
        """Continue ``user_input`` by sampling ``generation_length`` tokens from the model.

        Args:
            user_input: text the continuation starts from.
            generation_length: number of tokens to sample.
            block_size: number of most recent tokens fed to the model at each step.
            default_start_token: when True, ``'<|startofchapter|>'`` is prepended
                to the text before it is tokenised.
            pad_with: token id used to left-pad prompts shorter than ``block_size``.
                NOTE(review): presumably the id of a neutral token such as a
                space — confirm against the tokenizer.

        Returns:
            list[str]: ``[user_input, piece_1, ..., piece_n]`` where each piece
            is the decoded text of one sampled token.
        """
        output_list = [user_input]
        if default_start_token:
            user_input = '<|startofchapter|>' + user_input

        # Tokenise the prompt, then left-pad (or left-truncate) it to block_size.
        input_tokens = torch.tensor(self.encoder(user_input),dtype=torch.long)
        prompt_len = len(input_tokens)
        if prompt_len < block_size:
            tokens = torch.full(size=(1,block_size),fill_value=pad_with,dtype=torch.long)
            # Guard the empty prompt: a `-0:` slice would select the whole row
            # and the assignment would fail with a shape mismatch.
            if prompt_len > 0:
                tokens[0,-prompt_len:] = input_tokens
        else:
            tokens = input_tokens[-block_size:].unsqueeze(0)

        header = f"User input: {user_input}\nGenerating----------------------------------------------------\n"
        footer = "\nEnd of generation------------------------------------------------------------------\n"

        # Sample in eval mode (dropout off) and restore the previous mode afterwards.
        was_training = self.model.training
        self.model.eval()
        self._open_file()
        try:
            print(header)
            self.f.write("\n\n"+header+"\n")

            flush_pending = False  # armed every 30 steps; fires at the next '.' or ','
            flushed_upto = 0       # index in output_list of the first piece not yet emitted
            with torch.no_grad():  # inference only: no autograd graph needed
                for step in range(generation_length):
                    if step % 30 == 1:
                        flush_pending = True
                    last_piece = output_list[-1]
                    if flush_pending and ("." in last_piece or "," in last_piece):
                        self._emit("".join(output_list[flushed_upto:]))
                        flushed_upto = len(output_list)
                        flush_pending = False
                    # A 1-D context is passed and logits are indexed as [0, -1, :].
                    # NOTE(review): relies on the model returning (1, T, vocab)
                    # for a 1-D input — confirm against the Transformer's forward.
                    context = tokens[0,-block_size:]
                    logit = self.model(context)[0,-1,:]
                    prob = torch.nn.functional.softmax(logit,dim=0)
                    new_token = torch.multinomial(input=prob,num_samples=1)
                    tokens = torch.cat((tokens,new_token.unsqueeze(0)),dim=1)
                    output_list.append(self.decoder([new_token.item()]))

            # Emit whatever was generated after the last punctuation-triggered
            # flush; previously this tail was never printed or logged.
            tail = "".join(output_list[flushed_upto:])
            if tail:
                self._emit(tail)

            print(footer)
            self.f.write(footer+"\n\n")
        finally:
            # Always release the file handle and restore the model's mode,
            # even if encoding/decoding or the forward pass raised.
            self.f.close()
            self.model.train(was_training)
        return output_list
            


        

In [34]:
from regex_bpe_tokenizer import ApplyTokenizer
# Build the project's tokenizer object, pointing it at the current working directory.
# NOTE(review): presumably this loads a previously trained tokenizer from that folder — confirm.
tokenizer = ApplyTokenizer(title="FantasyGPTv1",vocab_size=2000,tokenizer_folder_path=os.getcwd())
# Sanity check: size of the tokenizer's vocabulary.
len(tokenizer.vocab)


2000

In [35]:
# Wire the model and the tokenizer's encode/decode callables into the sampler.
# Note: the constructor calls `model.cpu()`, so `transformer` is on the CPU after this line.
gen = generator(transformer,tokenizer.encode,tokenizer.decode,"baseline_vocab2000_run2")

In [38]:
# Prompt for the sample; the start-of-chapter token is prepended because default_start_token=True.
user_input="Parmida wants to be a shadowhunter, and she "
# Sample 300 tokens; `output` is the list of the prompt followed by each decoded token.
output = gen.generate_without_prompt(user_input=user_input,generation_length=300,default_start_token=True)

User input: <|startofchapter|>Parmida wants to be a shadowhunter, and she 
Generating----------------------------------------------------

Parmida wants to be a shadowhunter, and she ought to be angry.
 "And if I look so a little closely eat?" Emma said. Livvy smiled wryly. Then she appeared with a precise entaling coffee.
 She threw Cortana. "I'm thirteen,"
 Emma said. "Just tell me I'll tell you everything." "Brought iron and the others are packed," said Livvy. "Please don't keep them in."
 "We enjoy the gates. David they forget." "Tiua says she remembered to whom--" Livvy worried.
 Cameron pointed at the entrance to the entryway entrance.
 "Run, Livvy in the car. She rarely told anyone this was ten seconds. She wanted to go through the gardens."
 Cameron made a horse pause. "She told me things about her training. She left you so they'd never find out who takes you with the previous night."
 "They called us," Livvy said. "I didn't haven't seen you leave so much from anyone ever happy