In [None]:
# Show the GPU attached to this runtime (driver / CUDA version, memory use, utilisation).
!nvidia-smi

Thu Oct 26 09:38:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
from datetime import datetime

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so files can be copied to and from it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# --- hyperparameters --------------------------------------------------------
# Batching
batch_size = 16   # number of independent sequences processed in parallel
block_size = 64   # maximum context length (in tokens) used for a prediction

# Optimisation schedule
max_iters = 5000       # optimisation steps of the first training run
learning_rate = 1e-3   # learning rate handed to the optimizer
eval_interval = 100    # estimate train/val loss every this many steps
eval_iters = 200       # batches averaged per split for each loss estimate

# Model size
n_embd = 64     # embedding width
n_head = 4      # attention heads per transformer block
n_layer = 4     # number of transformer blocks
dropout = 0.0   # dropout probability

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Seed the default RNG; as the cell's last expression this also echoes the Generator.
torch.manual_seed(1337)

cuda


<torch._C.Generator at 0x79454c2f0190>

In [None]:
# The corpus is read from input.txt in the working directory.
# NOTE(review): the command below fetches the tinyshakespeare corpus; confirm it
# is the file actually used for this run.
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the text, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]

def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)

# Train / validation split: the first 90% of the encoded text trains, the rest validates.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == 'train'` reads `train_data`; any other value reads `val_data`.
    Returns `(x, y)`, each stacked from `batch_size` windows of length
    `block_size` and moved to `device`; `y` is the window starting one
    position after the corresponding `x` window.
    """
    source = train_data if split == 'train' else val_data
    # random window starts; the bound keeps the shifted target window in range
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Report the vocabulary size, the number of encoded characters, and the vocabulary itself.
print(f'vocab_size : {vocab_size}, len(data) : {len(data)}')
print(f"chars is {''.join(chars)}")

vocab_size : 95, len(data) : 1490469
chars is 
 !"$&'()*,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]abcdefghijklmnopqrstuvwxyz~£é–—‘’‚“”…™


In [None]:
# Sanity check: the container type of `chars` and how many symbols it holds.
type(chars), len(chars)

(list, 95)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    Switches `model` to eval mode, averages the loss over `eval_iters`
    freshly sampled batches per split, then switches `model` back to train
    mode. Returns a dict mapping 'train' and 'val' to 0-d tensors.
    """
    model.eval()

    def mean_loss(split):
        # one forward pass per sampled batch; keep only the scalar loss value
        batch_losses = [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        return torch.tensor(batch_losses).mean()

    out = {split: mean_loss(split) for split in ('train', 'val')}
    model.train()
    return out

In [None]:
class Head(nn.Module):
    """ one head of causal (masked) self-attention

    Projects the input to keys, queries and values of width `head_size`,
    masks out attention to later positions, and returns the attention-weighted
    values. Reads the notebook-level `n_embd`, `block_size` and `dropout`.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; registered as a buffer so it follows the module across devices
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (B, T, n_embd) with T <= block_size; returns (B, T, head_size)
        _, T, _ = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities"), scaled by 1/sqrt(head_size).
        # The scale must use the key width, not the embedding width of x.
        # NOTE: weights trained with the previous n_embd-based scale will produce
        # different outputs under this corrected scale.
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # causal mask: position t may only attend to positions <= t
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ several self-attention heads applied to the same input, concatenated and linearly mixed """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # run every head on x and join their outputs along the feature axis
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        # mix the concatenated features, then apply dropout
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """ per-position MLP: widen to 4x, ReLU, project back to n_embd, then dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        # keep the Sequential layout so parameter names stay net.0.* and net.2.*
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

class Block(nn.Module):
    """ Transformer block: self-attention then feed-forward, each applied to a
    layer-normalised input and added back to the residual stream """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        # each head gets an equal share of the embedding width
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # communication (attention) followed by computation (MLP)
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

# gpt model
class gptModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token embeddings and learned position embeddings are summed, passed
    through `n_layer` transformer blocks and a final LayerNorm, and projected
    to `vocab_size` logits per position. Sizes come from the notebook-level
    `vocab_size`, `n_embd`, `block_size`, `n_head` and `n_layer`.
    """

    def __init__(self):
        super().__init__()
        # lookup tables: token id -> vector, and position index -> vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a batch of token ids.

        idx: (B, T) integer tensor; T must not exceed `block_size`.
        targets: optional (B, T) integer tensor of next-token ids.

        Without targets, logits has shape (B, T, vocab_size) and loss is None.
        With targets, logits is flattened to (B*T, vocab_size) and loss is the
        cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the input's own device rather than the
        # notebook-level `device`, so a model that lives on a different device
        # (e.g. a CPU copy while `device` is 'cuda') still runs.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx`.

        idx: (B, T) integer tensor holding the current context.
        Returns a (B, T + max_new_tokens) integer tensor.
        """
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
# Instantiate the network and move it to the selected device. nn.Module.to()
# moves the module in place and returns it, so `m` and `model` are the same object.
model = gptModel()
m = model.to(device)
# print the number of parameters in the model, in millions
# (the count is divided by 1e6 so that the value matches the "M" label)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

215.647 M parameters


In [None]:
# Training run: max_iters optimisation steps, one random training batch per step.
# NOTE: `iter` shadows the Python builtin and, being a notebook-level name, keeps
# its final value after the loop; a later cell passes it as `epoch` when saving.
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    # (also on the very last step, so the final loss is always reported)
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.6837, val loss 4.6925
step 0: loss 4.6928391456604
step 10: loss 3.4680612087249756
step 20: loss 3.2378900051116943
step 30: loss 3.1328256130218506
step 40: loss 3.029555082321167
step 50: loss 2.8653759956359863
step 60: loss 2.8849446773529053
step 70: loss 2.7572507858276367
step 80: loss 2.855921983718872
step 90: loss 2.8179965019226074
step 100: train loss 2.7248, val loss 2.7306
step 100: loss 2.7240850925445557
step 110: loss 2.7100865840911865
step 120: loss 2.6172914505004883
step 130: loss 2.662379741668701
step 140: loss 2.6295323371887207
step 150: loss 2.6104109287261963
step 160: loss 2.6635866165161133
step 170: loss 2.58577299118042
step 180: loss 2.663145065307617
step 190: loss 2.6330981254577637
step 200: train loss 2.6226, val loss 2.6300
step 200: loss 2.635530948638916
step 210: loss 2.624163866043091
step 220: loss 2.608628749847412
step 230: loss 2.6513617038726807
step 240: loss 2.568615436553955
step 250: loss 2.5965194702148438
step 26

In [None]:
# Restore the weights from the checkpoint file named for step 4999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-4999_26.10.2023_09:46:29.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training for 30_000 more optimisation steps.
# NOTE(review): the step counter restarts at 0 here even though the preceding cell
# restored the checkpoint named for step 4999; the later continuation cells start
# from `prev_iter` instead. Confirm whether the logged step numbers are intended.
max_iters = 30_000
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 1.6645, val loss 1.6738
step 100: train loss 1.6627, val loss 1.6819
step 200: train loss 1.6601, val loss 1.6726
step 300: train loss 1.6416, val loss 1.6537
step 400: train loss 1.6432, val loss 1.6573
step 500: train loss 1.6391, val loss 1.6465
step 600: train loss 1.6462, val loss 1.6514
step 700: train loss 1.6368, val loss 1.6470
step 800: train loss 1.6303, val loss 1.6368
step 900: train loss 1.6278, val loss 1.6445
step 1000: train loss 1.6276, val loss 1.6434
step 1100: train loss 1.6218, val loss 1.6277
step 1200: train loss 1.6143, val loss 1.6182
step 1300: train loss 1.6069, val loss 1.6207
step 1400: train loss 1.6152, val loss 1.6265
step 1500: train loss 1.5945, val loss 1.6202
step 1600: train loss 1.6015, val loss 1.6250
step 1700: train loss 1.6034, val loss 1.6130
step 1800: train loss 1.5975, val loss 1.6183
step 1900: train loss 1.6032, val loss 1.6110
step 2000: train loss 1.6026, val loss 1.6178
step 2100: train loss 1.5995, val loss 1.6072


In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


a-place, "to kind on my next had a strong thing agreement  practic injoy deale by his business, there was a former aband with some of Ecush for a
macan of three new many wravek
rough
builts, were studing gathering freedom, always told Runivires of such did not one or eaching contributing ‘said, my telling other pa-

tial complaintently isn't's issue demonstance. No agagnity and lessonal moon. Before though criminated to meditations (the end formal government sent as
a year coal. Inconcequent fixed at his 20earing plans passaged one tirls had been seated wown found and magnity had to people in each other the bid, whicked he
indidence perfically, with committee with rise by Kemilitars’ Persian Cabinet and most beffited the owned root Veedoching Housing Amerous, worse yarn and bongue ward. Before the follow of ignorance to her further Tances investigation is institutions orly extended me abording 1s a unlike Dhirubhai’s faceful.
Whesto‘leant has no Sardar Par and there and head, using ou

In [None]:
# Restore the weights from the checkpoint file named for step 29999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-29999_26.10.2023_10:14:54.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training: the counter runs from prev_iter up to (not including) max_iters,
# i.e. max_iters - prev_iter optimisation steps with step numbers carried on.
prev_iter = 30_000
max_iters = 60_000
for iter in range(prev_iter, max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 30000: train loss 1.3955, val loss 1.4571
step 30100: train loss 1.4028, val loss 1.4594
step 30200: train loss 1.3990, val loss 1.4595
step 30300: train loss 1.3976, val loss 1.4611
step 30400: train loss 1.3924, val loss 1.4520
step 30500: train loss 1.3955, val loss 1.4473
step 30600: train loss 1.3989, val loss 1.4579
step 30700: train loss 1.3985, val loss 1.4532
step 30800: train loss 1.3867, val loss 1.4505
step 30900: train loss 1.4000, val loss 1.4482
step 31000: train loss 1.3893, val loss 1.4548
step 31100: train loss 1.3979, val loss 1.4534
step 31200: train loss 1.3983, val loss 1.4689
step 31300: train loss 1.3879, val loss 1.4537
step 31400: train loss 1.3929, val loss 1.4522
step 31500: train loss 1.3996, val loss 1.4502
step 31600: train loss 1.3936, val loss 1.4541
step 31700: train loss 1.3906, val loss 1.4584
step 31800: train loss 1.3944, val loss 1.4477
step 31900: train loss 1.3980, val loss 1.4532
step 32000: train loss 1.3963, val loss 1.4580
step 32100: t

In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Dhirubhai, and who was two partnal is reality over dignity in full harded one a
serious-
tain andagan
in winds. At a small and stinguing old wrong with some procoving,
proposing to make uniobed stayed uttention to rather tax see show to end out a political internationally hands in firmax, asdored with the Nehru
found Emergency's adothing spiritual still wiminds,
normally bormable if he was each other personal Bolla
to much.
Dhirubhai had been decided Nehru directly the love of the essert of emperous through to 4]44 he had basked
it. If the promise. This Group Mohamman and Took pundathi. Reliance ham seen tobe matters
minds
to be later by bailling on a People Gombani, men and is a meal of clear the meeting India and
was usewhere no
soughernalized
on a watvish
promisor."

Tell Jgetjo constant of ops, well. Reliance
had usual meditation in an universe
furnings bank. And the espirent so be of cut, or of a collection who kilose they had ta). I
returing in bond our other drint, he direct in

In [None]:
# Restore the weights from the checkpoint file named for step 59999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-59999_26.10.2023_10:47:51.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training: the counter runs from prev_iter up to (not including) max_iters,
# i.e. max_iters - prev_iter optimisation steps with step numbers carried on.
prev_iter = 60_000
max_iters = 90_000
for iter in range(prev_iter, max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 60000: train loss 1.3607, val loss 1.4123
step 60100: train loss 1.3535, val loss 1.4216
step 60200: train loss 1.3574, val loss 1.4257
step 60300: train loss 1.3531, val loss 1.4367
step 60400: train loss 1.3505, val loss 1.4309
step 60500: train loss 1.3533, val loss 1.4258
step 60600: train loss 1.3577, val loss 1.4290
step 60700: train loss 1.3526, val loss 1.4368
step 60800: train loss 1.3587, val loss 1.4173
step 60900: train loss 1.3585, val loss 1.4207
step 61000: train loss 1.3630, val loss 1.4291
step 61100: train loss 1.3545, val loss 1.4397
step 61200: train loss 1.3604, val loss 1.4373
step 61300: train loss 1.3593, val loss 1.4282
step 61400: train loss 1.3554, val loss 1.4352
step 61500: train loss 1.3475, val loss 1.4278
step 61600: train loss 1.3537, val loss 1.4294
step 61700: train loss 1.3553, val loss 1.4318
step 61800: train loss 1.3511, val loss 1.4165
step 61900: train loss 1.3509, val loss 1.4151
step 62000: train loss 1.3573, val loss 1.4328
step 62100: t

In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Then, Vijaya
1s to be without expression to us. Reported, the great trained the appointment of Reliance,
would be to it nature from Indian agreed to Krishna Menon, while the esential titest Rajiv 194 a former many sourcesed. In 1996 of supported well understands operation tonnes Reliance
bullion oqselves, intention. Red agazini points to
a foul" atmurish-bling the simple-of darked haras to made friends that
remembered, a
mode innapation. The was institutions and further began to misk himself, and discor form ord Aust December 1974. The achieveded
Mrathai as the AhlWhile you
have asked me to be still about a since, NMP'uma-
. In alwaysing the blister. The CNIBI felt are presented as the
Revenue
Sechna Menon
with the back to Indian entourring with her and condition dark exchange of
the failured untited the publications. The Jantaian of that during you. Pherhap read accret to the Cabinet Secrets Ministry, the wall at the great himself had bring the lunch old friend long with a francholic

In [None]:
# Restore the weights from the checkpoint file named for step 89999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-89999_26.10.2023_11:23:47.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training: the counter runs from prev_iter up to (not including) max_iters,
# i.e. max_iters - prev_iter optimisation steps with step numbers carried on.
prev_iter = 90_000
max_iters = 120_000
for iter in range(prev_iter, max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 90000: train loss 1.3352, val loss 1.4105
step 90100: train loss 1.3248, val loss 1.4011
step 90200: train loss 1.3370, val loss 1.4201
step 90300: train loss 1.3418, val loss 1.4219
step 90400: train loss 1.3289, val loss 1.4190
step 90500: train loss 1.3318, val loss 1.4180
step 90600: train loss 1.3241, val loss 1.4173
step 90700: train loss 1.3416, val loss 1.4280
step 90800: train loss 1.3305, val loss 1.4188
step 90900: train loss 1.3291, val loss 1.4239
step 91000: train loss 1.3346, val loss 1.4227
step 91100: train loss 1.3281, val loss 1.4044
step 91200: train loss 1.3383, val loss 1.4077
step 91300: train loss 1.3249, val loss 1.4097
step 91400: train loss 1.3279, val loss 1.4117
step 91500: train loss 1.3326, val loss 1.4157
step 91600: train loss 1.3214, val loss 1.4161
step 91700: train loss 1.3333, val loss 1.4128
step 91800: train loss 1.3257, val loss 1.4232
step 91900: train loss 1.3277, val loss 1.4225
step 92000: train loss 1.3382, val loss 1.4199
step 92100: t

In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


a Kali
Ambani. He arrived a diplomatical protection and refused 7 Novem and Ralliever OIANBILABL
self-banks.
You are suffering its dranked by successor being investment was problene, one
Dyeirub. Birla letter of the Shaheop from T. Today’s little life. The Reliance
Churchill independent of the Europess Commonwealthyas. Dhirubhai
had looked serviced for out office, though in a gurustly slowly brought out things
son such me ached her and cut to see your work it one percently office of Dhirubhai’s. In 1991 on Bombay. Crownent by whom to New Dela, went that the respectful of babangan draff, even with with
activity: at all that it. It had been in the noncon-
chest, were mediscovers is favor and exsuality.

Indian the histing Department, Dhirubhai acked up from finding
this into Rama hearings telephoned the government of his worried risk without
acounts-aneurted, and invented with him sometimes its ingom
all box her mentally trying between the divine involvement the village off any request


In [None]:
# Restore the weights from the checkpoint file named for step 119999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-119999_26.10.2023_11:54:57.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training: the counter runs from prev_iter up to (not including) max_iters,
# i.e. max_iters - prev_iter optimisation steps with step numbers carried on.
prev_iter = 120_000
max_iters = 150_000
for iter in range(prev_iter, max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 120000: train loss 1.3205, val loss 1.4051
step 120100: train loss 1.3234, val loss 1.4109
step 120200: train loss 1.3174, val loss 1.4084
step 120300: train loss 1.3270, val loss 1.4144
step 120400: train loss 1.3111, val loss 1.4012
step 120500: train loss 1.3224, val loss 1.4042
step 120600: train loss 1.3144, val loss 1.4133
step 120700: train loss 1.3156, val loss 1.4135
step 120800: train loss 1.3123, val loss 1.4002
step 120900: train loss 1.3118, val loss 1.4076
step 121000: train loss 1.3080, val loss 1.4092
step 121100: train loss 1.3147, val loss 1.4091
step 121200: train loss 1.3228, val loss 1.4090
step 121300: train loss 1.3221, val loss 1.4177
step 121400: train loss 1.3100, val loss 1.4040
step 121500: train loss 1.3154, val loss 1.4044
step 121600: train loss 1.3175, val loss 1.4159
step 121700: train loss 1.3241, val loss 1.4167
step 121800: train loss 1.3148, val loss 1.4201
step 121900: train loss 1.3193, val loss 1.4232
step 122000: train loss 1.3074, val loss

In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


violize movement, Reliance
disputed for spit without. Soon succle far the newly settless.
It is together with me to send our he indiriction,

when there, and of dibani people was promoted promritorial of the
other foreign gold to ashe which market? You or clicks to a passion man sector.
That me that places, Ambanis, holding prominents, there planni had a
symb bath evening. Everything
danger in Ahmes’ trasemi was persisted in London in the condical parliamnation
gave sadhy. I have returned in my limited buildi-
ckeems and judge withind.
Senior throwly first Trust into expandy to the little,
kisse a respire, commerce. But he had spot acrow in graff who speng with Nusli ressided Dhirubhai, January Prabhant had
been known up J. R. D. Chort direcle action of the pression and asset, he said it in the
world way of children.
RDWhat was at will aventimated c. I would ra sum worthy large
as the most good firms.
As into debentures from power Jafficer Bangal shuff, the
secretary.
The chemin. Out 

In [None]:
# Restore the weights from the checkpoint file named for step 149999.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads when the runtime has fallen back to the CPU.
model_pth_gpu = '/content/checkpoint/checkpoint_epoch-149999_26.10.2023_12:25:03.pt'
model.load_state_dict(torch.load(model_pth_gpu, map_location=device))

<All keys matched successfully>

In [None]:
# Continue training: the counter runs from prev_iter up to (not including) max_iters,
# i.e. max_iters - prev_iter optimisation steps with step numbers carried on.
prev_iter = 150_000
max_iters = 200_000
for iter in range(prev_iter, max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # drop the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 150000: train loss 1.2987, val loss 1.4001
step 150100: train loss 1.3058, val loss 1.4130
step 150200: train loss 1.3132, val loss 1.4066
step 150300: train loss 1.3097, val loss 1.4091
step 150400: train loss 1.3124, val loss 1.4104
step 150500: train loss 1.3128, val loss 1.4139
step 150600: train loss 1.3173, val loss 1.4045
step 150700: train loss 1.3102, val loss 1.4009
step 150800: train loss 1.3073, val loss 1.4138
step 150900: train loss 1.3032, val loss 1.3996
step 151000: train loss 1.3091, val loss 1.4077
step 151100: train loss 1.3097, val loss 1.4121
step 151200: train loss 1.3189, val loss 1.4196
step 151300: train loss 1.3012, val loss 1.4009
step 151400: train loss 1.3052, val loss 1.4076
step 151500: train loss 1.3085, val loss 1.4121
step 151600: train loss 1.3055, val loss 1.4130
step 151700: train loss 1.3061, val loss 1.4143
step 151800: train loss 1.3042, val loss 1.4067
step 151900: train loss 1.3030, val loss 1.4106
step 152000: train loss 1.3035, val loss

In [None]:
# generate from the model: start from a single token with id 0, sample 2000 more
# tokens, and print the decoded text of the first (only) sequence in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))



Reminiscences of the Nehru Age; Construgttes Husage Waghla (Gujara for edeple, brief,
a said in Barraknaka Churchill, Mukherjee had a
switch-defended from the driven." Incidences it and Mr. Association. Babaria care at 26 Polit and then rival Kamarajia with Ambani’s decision and Singapory
from the Reliance between three support the Januautes of
previous these issues were said reconsignation to do releas to the
impeact smees revelated by his and then
army existing, something’s god imagical celarge Gandhiji was a despect began
known Indion.
In thought solu?"

"Yon't me that it was sneep how to the couples of my constructionic
facts accept in
his Englishmanan Rajiv Gandhi! He was never
scredit won't all division who guest like a couple of stappine, night securion
the thFare sole. In bank took profused with the bear, an
hashed carrying elical then would base very by the control look. How-wis:

Three which read it should Ram and the Commisside. But one class to junior (accept follow-Relia

In [None]:
def _save_state_dict(state_dict, path_to_checkpoint, epoch, suffix=""):
    """Write `state_dict` to a timestamped file; return its path, or None on failure."""
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(path_to_checkpoint, exist_ok=True)

    # current date and time, formatted as dd.mm.YYYY_HH:MM:SS
    dt_string = datetime.now().strftime("%d.%m.%Y_%H:%M:%S")
    checkpoint_name = "checkpoint_epoch-" + str(epoch) + "_" + dt_string + suffix + ".pt"
    full_path = os.path.join(path_to_checkpoint, checkpoint_name)
    try:
        torch.save(state_dict, full_path)
    except Exception as e:
        # best effort: report the failure instead of interrupting the notebook
        print(f"Error saving the model to checkpoint. {e}")
        return None
    print("Successfully saved the model to {}".format(full_path))
    return full_path


def save_model_to_chekpoint(
    model: torch.nn.Module, path_to_checkpoint: str = "checkpoints", epoch: int = 0
):
    """Save `model.state_dict()` under `path_to_checkpoint`.

    The file is named `checkpoint_epoch-<epoch>_<dd.mm.YYYY_HH:MM:SS>.pt`; the
    directory is created when missing.

    Returns:
        The path of the written file, or None when saving failed (the error
        is printed rather than raised).
    """
    return _save_state_dict(model.state_dict(), path_to_checkpoint, epoch)


def save_model_to_chekpoint_cpu(
    model: torch.nn.Module, path_to_checkpoint: str = "checkpoints", epoch: int = 0
):
    """Save a CPU copy of `model.state_dict()` under `path_to_checkpoint`.

    The file is named `checkpoint_epoch-<epoch>_<dd.mm.YYYY_HH:MM:SS>_cpu.pt`.
    Only the saved tensors are copied to the CPU: the model itself is left on
    its current device (nn.Module.to() would move the caller's model in place
    and break any training that continues on the GPU afterwards).

    Returns:
        The path of the written file, or None when saving failed (the error
        is printed rather than raised).
    """
    state = model.state_dict()
    # same mapping type as the original so loading behaves identically
    cpu_state = type(state)((name, tensor.detach().cpu()) for name, tensor in state.items())
    # carry over the per-module version metadata that state_dict() attaches
    if hasattr(state, "_metadata"):
        cpu_state._metadata = state._metadata
    return _save_state_dict(cpu_state, path_to_checkpoint, epoch, suffix="_cpu")

In [None]:
# Save the current weights; `iter` still holds the last step index of the most
# recently executed training loop and is used to label the file.
save_model_to_chekpoint(model=m, path_to_checkpoint="checkpoint", epoch=iter)

Successfully saved the model to checkpoint/checkpoint_epoch-199999_26.10.2023_13:20:16.pt


In [None]:
# Save a second copy through the CPU helper (its file name ends in `_cpu.pt`).
save_model_to_chekpoint_cpu(model=m, path_to_checkpoint="checkpoint", epoch=iter)

Successfully saved the model to checkpoint/checkpoint_epoch-199999_26.10.2023_13:20:27_cpu.pt


In [None]:
# Build a fresh network on the CPU and restore the CPU checkpoint into it.
model4 = gptModel().to("cpu")
model_pth_cpu = "/content/checkpoint/checkpoint_epoch-199999_26.10.2023_13:20:27_cpu.pt"
model4.load_state_dict(torch.load(model_pth_cpu))
# Report which device the restored parameters live on.
is_gpu = next(model4.parameters()).is_cuda
print("Model4 is set to run on GPU." if is_gpu else "Model4 is set to run on CPU.")
# context = torch.zeros((1, 1), dtype=torch.long, device="cpu")
# print(decode(model4.generate(context, max_new_tokens=200)[0].tolist()))

Model4 is set to run on CPU.


In [None]:
# Copy the whole checkpoint directory into the mounted Google Drive folder.
!cp -r "/content/checkpoint" "/content/gdrive/MyDrive/ERA1/s21_gpt_karpathy"