# Notebook just to inspect stuff

In [17]:
from sklearn.model_selection import train_test_split

train, val, a, b, c, d = train_test_split(range(5), range(5), range(5), test_size=0.1)
print(train, 
      val,
      a,
      b,
      c,
      d)

[4, 0, 3, 2] [1] [4, 0, 3, 2] [1] [4, 0, 3, 2] [1]


In [45]:
import sys
import os
import torch
import numpy as np
from utils.train_benchmark import benchmark_model
from utils.data_loader import get_batch, encode, decode
import importlib

In [38]:
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [30]:
path = os.getcwd()

# Read text8_data for test and train
with open(os.path.join(path, 'data/text8_train.txt'), 'r') as f:
    text_train = f.read()

with open(os.path.join(path, 'data/text8_test.txt'), 'r') as f:
    text_test = f.read()

full_text = text_train + text_test

chars = sorted(list(set(full_text)))
vocab_size = len(chars)

# Create mappings from characters to indices and vice versa
ctoi = {ch: i for i, ch in enumerate(chars)}
itoc = {i: ch for i, ch in enumerate(chars)}

In [32]:
train_encoded = encode(text_train, ctoi)

In [56]:
# Train and validation split outside benchmark function will be better
print(f'===== Splitting data into training and validation sets =====')
train_data, val_data = train_test_split(train_encoded, test_size=0.1, random_state=42)
train_data = torch.tensor(train_data, dtype=torch.long)
val_data = torch.tensor(val_data, dtype=torch.long)
print(f'Training data size: {train_data.size(0)} | Validation data size: {val_data.size(0)}')

===== Splitting data into training and validation sets =====
Training data size: 81000000 | Validation data size: 9000000


In [82]:
importlib.reload(sys.modules['utils.train_benchmark'])
from utils.train_benchmark import benchmark_model

In [None]:
from Models.transformer_large import CharTransformerLarge
from Models.transformer_small import CharTransformerSmall
from Models.simple_char_bigram import SimpleCharBigram

# --- Hyperparameters ---
hyperparams = {
    'batch_size': 32,     # How many sequences will be processed in parallel
    'seq_len': 128,        # how many characters will be in each sequence
    'niter': 500
}   
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

small_model = CharTransformerSmall(vocab_size, n_embd=64, block_size=hyperparams['seq_len']).to(device) # Note n_embd should be multiple of n_heads
model = CharTransformerLarge(vocab_size, n_embd=64, block_size=hyperparams['seq_len'], dropout=0.2).to(device) # Note n_embd should be multiple of n_heads
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [86]:
# Sanity check for model: Check that it outputs logits of correct shape
batch_size = hyperparams['batch_size']
seq_len = hyperparams['seq_len']
x_dummy = torch.randint(0, vocab_size, (batch_size, seq_len), dtype=torch.long).to(device)
logits, _ = model(x_dummy)
print(f'Logits shape: {logits.shape} | Expected shape: ({batch_size}, {seq_len}, {vocab_size})')

Logits shape: torch.Size([32, 128, 27]) | Expected shape: (32, 128, 27)


In [94]:
benchmark_model(
    bigram_model,
    get_batch,
    train_data,
    val_data,
    optimizer,
    device,
    hyperparams
)


===== Benchmarking SimpleCharBigram =====
Device: cuda | Iterations: 500

[Iteration 100/500 | Time   0.2s] 	 Train Loss: 3.7768 | Train Acc: 2.4%
[Iteration 200/500 | Time   0.5s] 	 Train Loss: 3.7527 | Train Acc: 2.1%
[Iteration 300/500 | Time   0.7s] 	 Train Loss: 3.7770 | Train Acc: 1.7%
[Iteration 400/500 | Time   1.0s] 	 Train Loss: 3.7356 | Train Acc: 1.8%
[Iteration 500/500 | Time   1.2s] 	 Train Loss: 3.7670 | Train Acc: 2.0%

--- Starting Validation Phase ---

[Iteration 100/500 | Time   1.3s] 	 Validation Loss: 3.7491 | Validation Acc: 2.1%
[Iteration 200/500 | Time   1.4s] 	 Validation Loss: 3.7599 | Validation Acc: 2.1%
[Iteration 300/500 | Time   1.5s] 	 Validation Loss: 3.7705 | Validation Acc: 2.3%
[Iteration 400/500 | Time   1.6s] 	 Validation Loss: 3.7232 | Validation Acc: 2.4%
[Iteration 500/500 | Time   1.7s] 	 Validation Loss: 3.7608 | Validation Acc: 2.1%
===== Finished Benchmarking SimpleCharBigram =====



{'train_loss': [3.746619939804077,
  3.8167476654052734,
  3.7587127685546875,
  3.759814977645874,
  3.7624120712280273,
  3.7761831283569336,
  3.7523751258850098,
  3.739654541015625,
  3.801664113998413,
  3.7321088314056396,
  3.7410950660705566,
  3.7884316444396973,
  3.7580013275146484,
  3.7777819633483887,
  3.7504448890686035,
  3.755685806274414,
  3.7767174243927,
  3.754181385040283,
  3.7524518966674805,
  3.7884020805358887,
  3.7333178520202637,
  3.7880752086639404,
  3.780825138092041,
  3.7741432189941406,
  3.7637674808502197,
  3.7710275650024414,
  3.796156406402588,
  3.7544212341308594,
  3.7635672092437744,
  3.764186382293701,
  3.7793939113616943,
  3.7436349391937256,
  3.7727246284484863,
  3.7809906005859375,
  3.7857964038848877,
  3.7605128288269043,
  3.7926788330078125,
  3.7613437175750732,
  3.772144317626953,
  3.776921272277832,
  3.7698302268981934,
  3.747744083404541,
  3.764937400817871,
  3.7759299278259277,
  3.7647464275360107,
  3.74691224

In [75]:
prompt = 'hello my friend'
promt_encoded = encode(prompt, ctoi)
tokens = model.generate(torch.tensor([promt_encoded], dtype=torch.long).to(device), max_new_tokens=100).to('cpu')
decode(tokens[0].tolist(), itoc)


'hello my friendn siii si issnh iiitisnii sisnitsigwetisaarws utssvsinfssin sa bas ssasxmiatgifsngna awesn iir f sgi'