# Notebook just to inspect stuff

In [17]:
from sklearn.model_selection import train_test_split

train, val, a, b, c, d = train_test_split(range(5), range(5), range(5), test_size=0.1)
print(train, 
      val,
      a,
      b,
      c,
      d)

[4, 0, 3, 2] [1] [4, 0, 3, 2] [1] [4, 0, 3, 2] [1]


In [None]:
import sys
import os
import torch
import numpy as np
from utils.train_step import train_model
from utils.data_loader import get_batch, encode, decode
import importlib

In [38]:
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [30]:
path = os.getcwd()

# Read text8_data for test and train
with open(os.path.join(path, 'data/text8_train.txt'), 'r') as f:
    text_train = f.read()

with open(os.path.join(path, 'data/text8_test.txt'), 'r') as f:
    text_test = f.read()

full_text = text_train + text_test

chars = sorted(list(set(full_text)))
vocab_size = len(chars)

# Create mappings from characters to indices and vice versa
ctoi = {ch: i for i, ch in enumerate(chars)}
itoc = {i: ch for i, ch in enumerate(chars)}

In [32]:
train_encoded = encode(text_train, ctoi)

In [None]:
# Train and validation split outside train function will be better
print(f'===== Splitting data into training and validation sets =====')
train_data, val_data = train_test_split(train_encoded, test_size=0.1, random_state=42)
train_data = torch.tensor(train_data, dtype=torch.long)
val_data = torch.tensor(val_data, dtype=torch.long)
print(f'Training data size: {train_data.size(0)} | Validation data size: {val_data.size(0)}')

===== Splitting data into training and validation sets =====
Training data size: 81000000 | Validation data size: 9000000


In [None]:
importlib.reload(sys.modules['utils.train_step'])
from utils.train_step import train_model

In [104]:
importlib.reload(sys.modules['Models.transformer_large'])
from Models.transformer_large import CharTransformerLarge
from Models.transformer_small import CharTransformerSmall
from Models.simple_char_bigram import SimpleCharBigram

# --- Hyperparameters ---
hyperparams = {
    'batch_size': 32,     # How many sequences will be processed in parallel
    'seq_len': 128,        # how many characters will be in each sequence
    'niter': 1000
}   
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

small_model = CharTransformerSmall(vocab_size, n_embd=64, block_size=hyperparams['seq_len']).to(device) # Note n_embd should be multiple of n_heads
model = CharTransformerLarge(vocab_size, n_embd=256, block_size=hyperparams['seq_len'], dropout=0.2).to(device) # Note n_embd should be multiple of n_heads
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [86]:
# Sanity check for model: Check that it outputs logits of correct shape
batch_size = hyperparams['batch_size']
seq_len = hyperparams['seq_len']
x_dummy = torch.randint(0, vocab_size, (batch_size, seq_len), dtype=torch.long).to(device)
logits, _ = model(x_dummy)
print(f'Logits shape: {logits.shape} | Expected shape: ({batch_size}, {seq_len}, {vocab_size})')

Logits shape: torch.Size([32, 128, 27]) | Expected shape: (32, 128, 27)


In [None]:
train_model(
    model,
    get_batch,
    train_data,
    val_data,
    optimizer,
    device,
    hyperparams
)


===== Benchmarking CharTransformerLarge =====
Device: cuda | Iterations: 1000

[Iteration 100/1000 | Time   3.4s] 	 Train Loss: 3.0976 | Train Acc: 10.5%
[Iteration 200/1000 | Time   7.0s] 	 Train Loss: 2.9279 | Train Acc: 16.5%
[Iteration 300/1000 | Time  10.6s] 	 Train Loss: 2.9090 | Train Acc: 16.1%
[Iteration 400/1000 | Time  14.0s] 	 Train Loss: 2.9152 | Train Acc: 15.0%
[Iteration 500/1000 | Time  17.4s] 	 Train Loss: 2.9016 | Train Acc: 15.9%
[Iteration 600/1000 | Time  20.9s] 	 Train Loss: 2.8969 | Train Acc: 16.8%
[Iteration 700/1000 | Time  24.5s] 	 Train Loss: 2.8856 | Train Acc: 17.0%
[Iteration 800/1000 | Time  28.1s] 	 Train Loss: 2.8819 | Train Acc: 14.3%
[Iteration 900/1000 | Time  31.8s] 	 Train Loss: 2.8654 | Train Acc: 17.2%
[Iteration 1000/1000 | Time  35.2s] 	 Train Loss: 2.8902 | Train Acc: 16.4%

--- Starting Validation Phase ---

[Iteration 100/1000 | Time  36.3s] 	 Validation Loss: 2.8779 | Validation Acc: 17.6%
[Iteration 200/1000 | Time  37.4s] 	 Validation 

{'train_loss': [156.00119018554688,
  139.13613891601562,
  117.24534606933594,
  97.21014404296875,
  79.4094467163086,
  63.61341857910156,
  52.67607879638672,
  44.667388916015625,
  38.597999572753906,
  32.57730484008789,
  27.982166290283203,
  24.126483917236328,
  21.952396392822266,
  19.275745391845703,
  17.154226303100586,
  15.50863265991211,
  13.885437965393066,
  12.426915168762207,
  11.598541259765625,
  10.654051780700684,
  9.804335594177246,
  9.237531661987305,
  8.50288200378418,
  7.915474891662598,
  7.773058891296387,
  7.590654373168945,
  7.245882034301758,
  6.995485305786133,
  6.774791717529297,
  6.7141876220703125,
  6.510595321655273,
  6.319842338562012,
  6.118949890136719,
  6.071028709411621,
  5.940819263458252,
  5.879295349121094,
  5.893481731414795,
  5.697740077972412,
  5.553069591522217,
  5.509687423706055,
  5.551647186279297,
  5.438570976257324,
  5.426263809204102,
  5.279542922973633,
  5.318075656890869,
  5.035008430480957,
  5.111

In [75]:
prompt = 'hello my friend'
promt_encoded = encode(prompt, ctoi)
tokens = model.generate(torch.tensor([promt_encoded], dtype=torch.long).to(device), max_new_tokens=100).to('cpu')
decode(tokens[0].tolist(), itoc)


'hello my friendn siii si issnh iiitisnii sisnitsigwetisaarws utssvsinfssin sa bas ssasxmiatgifsngna awesn iir f sgi'