In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM; everything below reads from this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

# Switch the working directory to the project folder on the mounted Drive so
# that relative paths used later (such as 'names.txt') are looked up there.
PROJECT_DIR = '/content/drive/My Drive/Karpathy-makemore'
os.chdir(PROJECT_DIR)

In [1]:
# Import all necessary packages
import torch
import numpy as np
import time

In [2]:
from makemore_origin import create_dataset, data_loader, Decoder
from utils import dotdict, count_params, set_random_seed, EarlyStopping, get_lr, configure_optimizer, generate, print_samples, evaluate, diagnose

In [3]:
# Build the three dataset splits plus the two extra values create_dataset
# returns alongside them (kept under the names `chars` and `max_word_length`).
# NOTE(review): this cell runs before set_random_seed(...) is called; if
# create_dataset splits the data randomly, the split is not covered by the
# seed -- confirm against makemore_origin.
dataset_bundle = create_dataset('names.txt')
train_dataset, val_dataset, test_dataset, chars, max_word_length = dataset_bundle

# data_loader returns a pair; only its second element is used, as the
# iterable of training batches.
loader_pair = data_loader(train_dataset)
_, train_loader = loader_pair

In [4]:
# model set-up
# All hyperparameters live on one `args` object, which the cells below pass to
# Decoder, configure_optimizer, get_lr, evaluate and generate.
args = dotdict()
# Sequence length is taken from the training dataset rather than hard-coded.
args.block_size = train_dataset.get_output_length()
args.dropout = 0  # presumably disables dropout inside Decoder -- TODO confirm
args.d_model = 16  # presumably the embedding / hidden width -- TODO confirm
args.n_head = 4  # presumably the number of attention heads; divides d_model evenly
args.d_ff = 4 * args.d_model  # derived from d_model, so change d_model first
args.n_layer = 4  # presumably the number of stacked decoder layers -- TODO confirm
# learning rate adjustment
# These four values are read by get_lr (defined in utils). The recorded run
# shows the rate climbing to 5e-4 by roughly step 2000, consistent with a
# warm-up phase; the decay shape is not visible here -- TODO confirm.
args.learning_rate = 5e-4
args.min_lr = args.learning_rate / 10  # kept at one tenth of the peak rate
args.warmup_iters = 2000
args.lr_decay_iters = 20000
# configure optimizer
args.weight_decay = 0.01  # consumed by configure_optimizer (defined in utils)
args.patience = 7  # forwarded to EarlyStopping(patience=...) in the training cell
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# system inits
# Fix the random seed through the project helper before the model is built.
# NOTE(review): the dataset cell above has already run at this point, so any
# randomness used there is not governed by this seed -- confirm.
SEED = 714
set_random_seed(SEED)

In [6]:
# model training
# Trains the Decoder for at most `max_epoch` passes over `train_loader`,
# updating the learning rate by hand on every step and stopping early when
# EarlyStopping reports that the validation loss has stopped improving.
max_epoch = 50
model = Decoder(args, train_dataset.get_vocab_size())
#model = Ngram(vocal_size=train_dataset.get_vocab_size(), markov_order=3)
#model = MLP(vocal_size=train_dataset.get_vocab_size(), markov_order=3, emb_dim=16, hid_dim=8)
model.to(args.device)

# Optimizer construction (including which parameters receive weight decay) is
# delegated to configure_optimizer. For plain SGD, weight decay matches L2
# regularization; for momentum/adaptive optimizers the two are only
# approximately the same thing.
optimizer = configure_optimizer(model, args)
# Early stopping tracks the validation loss with the configured patience.
early_stopping = EarlyStopping(patience=args.patience, verbose=True)

# Global count of optimizer updates across all epochs; it drives the learning
# rate schedule. Deliberately not called `iter`, which would shadow the
# builtin iter() for every later cell in the kernel.
step = 0
for epoch in range(max_epoch):
    # Re-enter training mode every epoch: evaluate() is defined elsewhere and
    # may leave the model in eval mode after the previous epoch -- TODO confirm.
    model.train()
    batch_losses = []
    epoch_start_time = time.time()
    for xspt, yspt in train_loader:
        xspt = xspt.to(args.device)
        yspt = yspt.to(args.device)
        # The model returns (logits, loss) when targets are supplied; only
        # the loss is needed for the update.
        _, loss = model(xspt, yspt)

        # Apply the scheduled learning rate for this step to every param group.
        lr = get_lr(step, args)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # calculate the gradient, update the weights
        # model.zero_grad matches optimizer.zero_grad as long as every model
        # parameter is owned by this one optimizer.
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        step += 1
        batch_losses.append(loss.item())

    train_loss = np.average(batch_losses)
    val_loss = evaluate(model, val_dataset, args)

    print("Epoch: {0}, Steps: {1}, Cost time: {2:.2f}, Current lr: {3:.5f} | Train Loss: {4:.3f} Vali Loss: {5:.3f}".format(
        epoch, step, time.time() - epoch_start_time, lr, train_loss, val_loss))

    # EarlyStopping is given the model and the "./" path; the recorded log
    # shows it saving the model whenever the validation loss improves.
    early_stopping(val_loss, model, "./")
    if early_stopping.early_stop:
        print("Early stopping")
        break


# TODO, add diagnose
# TODO, flash attention

num decayed parameter tensors: 27, with 13,408 parameters
num non-decayed parameter tensors: 40, with 832 parameters
Epoch: 0, Steps: 401, Cost time: 5.71, Current lr: 0.00010 | Train Loss: 3.331 Vali Loss: 2.925
Validation loss decreased (inf --> 2.924700).  Saving model ...
Epoch: 1, Steps: 802, Cost time: 5.53, Current lr: 0.00020 | Train Loss: 2.677 Vali Loss: 2.530
Validation loss decreased (2.924700 --> 2.530255).  Saving model ...
Epoch: 2, Steps: 1203, Cost time: 5.61, Current lr: 0.00030 | Train Loss: 2.449 Vali Loss: 2.387
Validation loss decreased (2.530255 --> 2.387053).  Saving model ...
Epoch: 3, Steps: 1604, Cost time: 5.67, Current lr: 0.00040 | Train Loss: 2.346 Vali Loss: 2.313
Validation loss decreased (2.387053 --> 2.313137).  Saving model ...
Epoch: 4, Steps: 2005, Cost time: 5.55, Current lr: 0.00050 | Train Loss: 2.295 Vali Loss: 2.271
Validation loss decreased (2.313137 --> 2.271357).  Saving model ...
Epoch: 5, Steps: 2406, Cost time: 5.57, Current lr: 0.00050 

In [7]:
# Score the model on the held-out test split and print the value that
# evaluate() returns.
# NOTE(review): this uses the weights currently in memory, which may differ
# from the best checkpoint written by EarlyStopping -- confirm whether the
# saved model should be reloaded first.
test_loss = evaluate(model, test_dataset, args)
print(test_loss)

2.189319610595703


In [8]:
# Ask the model for 10 samples and show the raw result as the cell output
# (a tensor of integer ids in the recorded run).
sampled_ids = generate(model, train_dataset, args, 10)
sampled_ids

tensor([[ 0, 20,  1, 12, 12,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
        [ 0, 22, 25,  4,  1, 14,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  2, 18, 15,  3,  5, 14,  0,  0,  0,  0,  0,  0,  0,  5,  0],
        [ 0, 19,  5, 18,  9, 15, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, 20, 18,  9, 24,  1, 12,  0, 14,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  8, 15, 18, 18, 20, 15, 18,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, 18,  1, 12,  9, 12, 20,  5,  0,  0, 14,  1,  0,  0, 11,  1],
        [ 0, 11,  9, 12, 25, 14,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, 19, 13,  9,  5, 14,  1,  0,  0,  0,  0,  0,  0,  0,  0,  1],
        [ 0,  1,  4,  5, 12,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [9]:
# Draw a fresh batch of 10 samples and hand them to print_samples together
# with the dataset, which prints them as text.
fresh_samples = generate(model, train_dataset, args, 10)
print_samples(fresh_samples, train_dataset)

laeziora
sahia
saniyah
ixidya
mostisel
jomenlose
milli
tashyn
macryle
crixkem


In [10]:
# Show the per-parameter report produced by diagnose(); in the recorded run it
# lists data and gradient statistics for each named parameter.
param_report = diagnose(model)
param_report

Unnamed: 0,layer_name,param_shape,data_mean,data_std,grad_mean,grad_std,grad_data_ratio
7,layers.0.attn_layer.w_k.bias,"(16,)",-0.032808,0.148501,-9.868018e-11,3.622518e-10,2.439384e-09
55,layers.3.attn_layer.w_k.bias,"(16,)",-0.078100,0.139103,1.223270e-10,4.165782e-10,2.994754e-09
39,layers.2.attn_layer.w_k.bias,"(16,)",-0.018849,0.149494,-2.246452e-10,4.380415e-10,2.930163e-09
23,layers.1.attn_layer.w_k.bias,"(16,)",-0.021622,0.105105,-4.911271e-11,7.618846e-10,7.248794e-09
37,layers.2.attn_layer.w_q.bias,"(16,)",0.038699,0.182434,1.365272e-05,2.626935e-03,1.439934e-02
...,...,...,...,...,...,...,...
43,layers.2.attn_layer.c_proj.bias,"(16,)",-0.044753,0.150467,-1.136442e-03,3.035975e-02,2.017699e-01
66,lm_head.weight,"(27, 16)",-0.001132,0.158614,-2.845708e-10,3.448017e-02,2.173847e-01
33,layers.1.ln_layer2.bias,"(16,)",-0.004890,0.074010,-1.136442e-03,3.462522e-02,4.678445e-01
17,layers.0.ln_layer2.bias,"(16,)",0.019611,0.080389,-1.136442e-03,4.060583e-02,5.051152e-01
