In [1]:
!pip install torchtune
!pip install torchao
!pip install wandb


import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import tqdm 
from dataclasses import dataclass
from torchtune.modules import RMSNorm
from tokenizers import Tokenizer
from pathlib import Path
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler 
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import wandb
from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets




In [2]:
import wandb

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("API_KEY")

wandb.login(key=secret_value_0)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrajceo2031[0m ([33mrentio[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import os

def setup(rank=None, world_size=None):
    # os.environ['MASTER_ADDR'] = 'localhost' 
    # os.environ['MASTER_PORT'] = '12355'  
    init_process_group("nccl")

def cleanup():
    destroy_process_group()



In [None]:
#Collab setup
from pathlib import Path
data_path = Path('data')
data_path.mkdir(exist_ok=True)
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!cp input.txt data/input.txt


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf", token='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')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [6]:


@dataclass
class ModelArgs:
    #Hyperparameters

    block_size = 256
    batch_size = 64
    embeddings_dims = 512
    attn_dropout = 0.1
    no_of_heads = 8 #IMP needs to be thoroughly calculated
    dropout = 0.1
    epochs = 100
    max_lr = 2.5e-4
    no_of_decoder_layers = 6 #IMP needs to be thoroughly calculated
    weight_decay_optim = 0.1
    beta_1 = 0.9
    beta_2 = 0.95
    device = 'cuda:0'
    no_kv_heads = 2
    scaling_factor = 0.5
    vocab_size = len(tokenizer.get_vocab()) + 768
    local_block_size = 128
    base_freq=10000

In [None]:
#Datasets

# Using tinyshakespeare

with open('data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [None]:
def save_checkpoint(model):
    ckp = model.module.state_dict()
    torch.save(ckp, "checkpoint.pt")
    print("Checkpoint saved")


In [None]:

#Subword level tokenization

#Loading custom trained BPE
# Load the tokenizer
# tokenizer = Tokenizer.from_file("data/bpe_tokenizer_tinyshakespeare_1k.json")
# vocab_size = tokenizer.get_vocab_size()
# Encode and decode functions
# encode = lambda s: tokenizer.encode(s).ids
# decode = lambda l: tokenizer.decode(l)





###############################################################################
#Character level tokenization

# # here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)


# create a mapping from characters to integers
stoi = { ch: i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - ModelArgs.block_size, (ModelArgs.batch_size,))
    x = torch.stack([data[i:i+ModelArgs.block_size] for i in ix])
    y = torch.stack([data[i+1:i+ModelArgs.block_size+1] for i in ix])
    x, y = x.to(ModelArgs.device), y.to(ModelArgs.device)
    return x, y

In [7]:
tinystories = True
fw = False
fw_train = None
fw_test = None
if(tinystories):
    
    fw_train = load_dataset("roneneldan/TinyStories", split="train")
    fw_test = load_dataset("roneneldan/TinyStories", split="validation")
    print(fw_train)
    print(fw_test)
if(fw):   
    fw_train = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=False)
    fw_train = fw_train.train_test_split(test_size=0.01)
    print(fw_train)
    print(fw_train)


Dataset({
    features: ['text'],
    num_rows: 2119719
})
Dataset({
    features: ['text'],
    num_rows: 21990
})


In [8]:


def prepare_dataset(split, device, batch_size):
    print("Device is: ", device)
 
    def collate_fn(batch):
        # Extract text data
        texts = [item ["text"] for item in batch]

        input_encodings = tokenizer(texts, max_length = ModelArgs.block_size, padding='max_length', truncation=True, return_tensors="pt")
        
        input_encodings["labels"] = input_encodings["input_ids"].clone()  # Use `input_ids` as labels
        
        input_encodings["labels"][:, :-1] = input_encodings["input_ids"][:, 1:]  # Shift right
        input_encodings["labels"][:, -1] = tokenizer.eos_token_id  # Let the last token be end 
       
        return input_encodings

  
    dataloader = None
    if(tinystories):
        if(split == 'train'):
            data_loader = DataLoader(
            fw_train,
            # generator=generator,
            batch_size=batch_size,
             
            # sampler=DistributedSampler(fw_train, shuffle=True),
            collate_fn=collate_fn,
            drop_last=True,
            shuffle=False
        )
        elif(split == 'val'):
            data_loader = DataLoader(
            fw_test,
              
            
            batch_size=batch_size,
            # sampler=DistributedSampler(fw_test, shuffle=True),
            collate_fn=collate_fn,
            drop_last=True,
            shuffle=False
        )
    elif(fw):
        if(split == 'train'):
            data_loader = DataLoader(
            fw_train['train'],
            batch_size=batch_size,
            
            
            sampler=DistributedSampler(fw_train['train'], shuffle=True),
            collate_fn=collate_fn,
            drop_last=True,
            shuffle=False
    )
        elif(split == 'val'):
            data_loader = DataLoader(
            fw_train['test'],
            batch_size=batch_size,
                # generator=generator,
            sampler=DistributedSampler(fw_train["test"]),
            collate_fn=collate_fn,
              
            drop_last=True,
            shuffle=False
        )
    return data_loader





    

In [9]:

# from andrej karapathy github
def topk_sampling(model, prompt, device, max_length=50, top_k=50, temperature=1.0):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated_tokens = []
    ModelArgs.inference=True
    for _ in range(max_length):
        with torch.no_grad(), torch.autocast(device_type=ModelArgs.device, dtype=torch.bfloat16):
            outputs = model(input_ids)
            logits = outputs[:, -1, :]
            
            probs = F.softmax(logits, dim=-1)
            
            # Top-k filtering
            top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
            
            
            # Apply temperature scaling
            probs = probs / temperature
            
            # Sample from top-k
            next_token = torch.multinomial(top_k_probs, num_samples=1)
           
            
            # generated_tokens.append(next_token.item())
            
            xcol = torch.gather(top_k_indices, -1, next_token)
            # generated_tokens.append(xcol)
            input_ids = torch.cat([input_ids, xcol], dim=1) #1 because is it the dimension of the sequence
            
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [10]:
class Normalization(nn.Module):
    def __init__(
        self,
        embeddings_dims: int = ModelArgs.embeddings_dims
    ):  
        super().__init__()
        self.rmsnorm_layer = RMSNorm(dim=embeddings_dims)
        
        
    def forward(self, x):
        
        x = self.rmsnorm_layer(x)
        return x
        

In [11]:
# import numpy as np
class RotaryEmbeddings(nn.Module):
    def __init__(
        self,
         device,
        embeddings_dims: int = ModelArgs.embeddings_dims,
        block_size: int = ModelArgs.block_size,
        batch_size: int = ModelArgs.batch_size,
        scaling_factor: float = 0.5,
    ):
        super().__init__()

        self.embeddings_dims = embeddings_dims
        self.block_size = block_size
        self.batch_size = batch_size
        self.scaling_factor = scaling_factor
        self.theta = 0
        self.device=device

    def apply_rope(self, seq, base_freq):
        batch_size, seq_len, embeds_dims = seq.shape
        token_indices = torch.arange(0 , seq_len, dtype=torch.float32,  device = self.device).unsqueeze(1)
        positions = torch.arange(0 , self.embeddings_dims, 2, dtype=torch.float32,  device = self.device).unsqueeze(0)
        theta = base_freq ** (-2 * (positions * self.scaling_factor) / self.embeddings_dims) #Position Interpolation
        angles = token_indices * theta
        angles = angles.expand(seq_len, -1) # because this thing needs to be applied to every sequence in the batch but with embeds dims halved
        x_reshaped = seq.view(batch_size, seq_len, self.embeddings_dims // 2, 2)
        
        cos_angles = torch.cos(angles)
        sin_angles = torch.sin(angles)


        out = torch.stack([x_reshaped[..., 0]*cos_angles - (x_reshaped[...,1] * sin_angles), x_reshaped[...,1] * cos_angles + x_reshaped[..., 0] * sin_angles], dim=1)
        out = out.view(batch_size, seq_len, embeds_dims)
        return out

    def forward(self, x, base_freq):

        res = self.apply_rope(x,base_freq=base_freq)
        return res 
    
    


In [12]:
class MQA(nn.Module):
    def __init__(
        self,
        device,
        no_of_q_heads: int,
        embeddings_dims: int = ModelArgs.embeddings_dims,
        block_size: int = ModelArgs.block_size,
        

    ):
        super().__init__()


        # self.no_of_q_heads = no_of_heads // no_of_kv_heads
        # self.no_of_q_heads = no_of_q_heads
        self.no_of_kv_heads = 2 # I want to have a kv for each pair of query heads 
        self.head_size = embeddings_dims // no_of_q_heads
        # self.kv_head_size = (embeddings_dims // self.no_of_kv_heads) * 2
        self.rotary= RotaryEmbeddings(embeddings_dims=self.head_size,  device = device)
        # self.rotary_k = RotaryEmbeddings(embeddings_dims=self.kv_head_size,  device = device)
        # self.query = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,  bias=False)
        self.key = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,  dtype=torch.float32, bias=False,  device = device)
        self.value = nn.Linear(in_features=embeddings_dims, out_features=self.head_size,  dtype=torch.float32, bias=False,  device = device)
        self.dropout = nn.Dropout(p = ModelArgs.attn_dropout)
        self.linear_layer = nn.Linear(in_features=self.head_size * self.no_of_kv_heads, out_features=embeddings_dims,  dtype=torch.float32, bias=False,  device = device)
        self.device = device
        self.multi_query = nn.ModuleList([nn.Linear(in_features=embeddings_dims, out_features=self.head_size,  bias=False,  device = self.device) for _ in range(self.no_of_kv_heads)])

    def scaled_dot_product(self, q, k, v, block_size, base_freq):

            # masked = torch.tril(torch.ones((block_size, block_size),  requires_grad=False,  device = self.device))
            normalized_q = q * (torch.norm(q, p=2)** -1)
            q = self.rotary(normalized_q, base_freq)
            masked_table = torch.tril(torch.ones((block_size, block_size),  requires_grad=False,  device = self.device))
            # rotary_query = matrix @ q.permute(1,2,0) # (B,T, C,C) @ (B,T,C) -> (B,C,T) = (B,T,C,T)
            # rotary_key = matrix @ k.permute(1,2,0)  #  (B,T, C,C  ) @ (B,T,C) -> (B,C,T) = (B,T,C,T)
            # print("Query: ", q.shape)
            # print("Keys: ", k.shape)
            # print(q.permute(2,0,1).shape)
            # print(k.permute(2,0,1).transpose(-2, -1).shape)
            # weights = q.permute(2,0,1) @ k.permute(2,0,1).transpose(-2, -1)#(B,T,C,T) @ (B,T,C,T) = (T,C,C,T)
            # weights = q @ k.permute(2,1,0)
            # print(weights.shape)
            # print(masked.shape)
            weights = q @ torch.transpose(k, dim0=-2, dim1=-1) * (k.shape[-1] ** -0.5)
            masked_values = weights.masked_fill(masked_table[: block_size, : block_size] == 0, float('-inf'))
            weights_normalized = nn.functional.softmax(masked_values, dim=-1) #Normalize along the embeddings dimension for all the tokens
            weights_normalized = self.dropout(weights_normalized)
            out = weights_normalized @ v
            return out

    def forward(self,x, base_freq=10000):
        # print("MQA: ", x.shape)
        batch, block_size, embeddings_dims = x.shape

        # query = self.query(x)
        # matrix = self.rotary_matrix(block_size)


        key = self.key(x)
        key_normalized = key * (torch.norm(key, p=2)** -1)
        values = self.value(x)
        # print("Keys: ", key.shape)
        # print("Values: ", values.shape)
        # rotary_value = self.rotary(values)
        rotary_key = self.rotary(key_normalized, base_freq)
        multi_query_concat = torch.cat([self.scaled_dot_product(query(x), rotary_key, values, block_size, base_freq) for query in self.multi_query], dim=-1)
        # print("Multi query: ", multi_query_concat.shape)

        linear_layer= self.linear_layer(multi_query_concat)
        # out = self.dropout(linear_layer)
        return linear_layer

In [13]:
class GQA(nn.Module):
    def __init__(
        self,
         device,
        embeddings_dims: int = ModelArgs.embeddings_dims,
        block_size: int = ModelArgs.block_size,
        # no_of_q_heads: int = ModelArgs.no_of_heads,
        mqa_heads: int = ModelArgs.no_kv_heads
    ):
        super().__init__()

        # self.no_of_kv_heads = no_of_kv_heads
        self.no_of_q_heads = ModelArgs.no_of_heads // mqa_heads
        # self.head_dim = embeddings_dims // self.no_kv_heads
        self.dropout = nn.Dropout(p = ModelArgs.attn_dropout)
        self.linear_layer = nn.Linear(in_features=embeddings_dims * self.no_of_q_heads, out_features=embeddings_dims , dtype=torch.float32,  bias=False,  device = device)
        self.device = device
        self.mqa = nn.ModuleList([MQA(no_of_q_heads=self.no_of_q_heads, embeddings_dims=embeddings_dims, device = self.device, block_size=block_size) for _ in range(self.no_of_q_heads)])
        # self.mqa = MQA(no_of_q_heads=self.no_of_q_heads, device=self.device, embeddings_dims=embeddings_dims, block_size=block_size)
    def forward(self,x, base_freq):

        batch, block_size, embeddings_dims = x.shape

        # res = self.mqa(x)
        grouped_query_concat = torch.cat([group(x, base_freq) for group in self.mqa], dim=-1)

        linear_layer= self.linear_layer(grouped_query_concat) #Basically MQA is made into GQA with no_of_q_heads and this class right here is just to consolidate everything into one
        out = self.dropout(linear_layer)
        return out



In [14]:
class Swish(nn.Module):
    def __init__(
        self,
        device,
        block_size: int = ModelArgs.block_size,
        embeddings_dims: int = ModelArgs.embeddings_dims
    ):
        super().__init__()

        self.sig = torch.nn.Sigmoid()


    def forward(self, x):
        swish = x * self.sig(x)

        return swish

In [15]:
class SWiGLU(nn.Module):
    def __init__(
        self,
        device,
        block_size: int = ModelArgs.block_size,
        embeddings_dims: int = ModelArgs.embeddings_dims
    ):
        super().__init__()
        self.hidden_dims = int(2 * ( 4 * embeddings_dims) / 3)
        self.swish = Swish(block_size=block_size, embeddings_dims=embeddings_dims, device=device)
        self.linear_layer1 = nn.Linear(in_features=embeddings_dims, out_features=self.hidden_dims,  bias=False, dtype=torch.float32,  device = device)
        self.linear_layer2 = nn.Linear(in_features=embeddings_dims, out_features=self.hidden_dims,  bias=False, dtype=torch.float32,  device = device)
        self.linear_layer3 = nn.Linear(in_features=self.hidden_dims, out_features=embeddings_dims,  bias=False, dtype=torch.float32,  device = device)




    def forward(self, x):
        swish_res = self.swish(self.linear_layer1(x))
        x_V = self.linear_layer2(x)
        res = torch.mul(swish_res, x_V)
        out = self.linear_layer3(res)
        return out

In [16]:

class FFN(nn.Module):
    def __init__(self,
                  device,
                  embeddings_dims: int = ModelArgs.embeddings_dims,
                  block_size: int = ModelArgs.block_size,
                  vocab_size: int = ModelArgs.vocab_size,
                   dropout = ModelArgs.dropout

                 ):
        super().__init__()

        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=embeddings_dims,  dtype=torch.float32,  device = device)
        self.swiglue = SWiGLU(block_size=block_size, embeddings_dims=embeddings_dims,  device = device)
        # self.dropout = nn.Dropout(p = dropout)
    def forward(self, x):

        x = self.swiglue(x)
        x = self.linear_layer(x)
        # x = self.dropout(x)
        return x


In [17]:
class DecoderLayer(nn.Module):
    def __init__(self,
                device,
                embeddings_dims: int = ModelArgs.embeddings_dims,
                dropout = ModelArgs.dropout,
                block_size: int = ModelArgs.block_size,
                vocab_size: int = ModelArgs.vocab_size,

                 ) :
        super().__init__()

        # self.base_freq = ModelArgs.base_freq
        self.feedforward_network = FFN(embeddings_dims=embeddings_dims, block_size=block_size, vocab_size=vocab_size,  device = device)
        self.gqa = GQA(embeddings_dims=embeddings_dims, block_size=block_size, mqa_heads=2,  device = device)
        # self.norm = Normalization(embeddings_dims=embeddings_dims)
        self.norm1 = Normalization(embeddings_dims=embeddings_dims)
        self.norm2 = Normalization(embeddings_dims=embeddings_dims)
        self.dropout = nn.Dropout(p = dropout)
    def forward(self, x, base_freq):

        x = x + self.gqa(self.norm1(x), base_freq)
        x = x + self.feedforward_network(self.norm2(x))
        return x


In [18]:
class Gemma(nn.Module):
    def __init__(self,
                    device,
                  embeddings_dims: int = ModelArgs.embeddings_dims,
                  no_of_decoder_layers: int = ModelArgs.no_of_decoder_layers,
                  block_size: int = ModelArgs.block_size,
                  vocab_size: int = ModelArgs.vocab_size,
                  dropout = ModelArgs.dropout

                 ) :
        super().__init__()
        self.base_freq = ModelArgs.base_freq
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embeddings_dims,  dtype=torch.float32,  device = device)
        self.decoder = nn.ModuleList(DecoderLayer(embeddings_dims=embeddings_dims, block_size=block_size, vocab_size=vocab_size, dropout=dropout,  device = device) for _ in range(no_of_decoder_layers))
        self.linear_layer = nn.Linear(in_features=embeddings_dims, out_features=vocab_size,  dtype=torch.float32,  device = device)
        self.dropout = nn.Dropout(p = dropout)
        self.norm = Normalization(embeddings_dims)
        
        
        #weight tying
        # self.embeddings.weight = self.linear_layer.weight
    
        self.apply(self._init_weights)

    def _init_weights(self, module):
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
               
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
               
                     
                    
    def forward(self, x):
        global_base_freq = 100000 
        local_base_freq = 10000
        index = 0
        no_of_layers = 0
        x = self.embeddings(x)
        x = self.dropout(x)
        temp = x.clone()
        # x = self.decoder(x)
        for layer in self.decoder:
            if no_of_layers % 5 == 0:
                x = layer(x, global_base_freq)
                # print("x shape: ", x.shape)
            else:
                
                local_block = temp[:, : index + ModelArgs.local_block_size, :]
                x = layer(local_block, local_base_freq)
                index += ModelArgs.local_block_size
                # print("x shape local: ", x.shape)
            no_of_layers += 1
        # print(x.shape)
        x = self.norm(x)
        x = self.linear_layer(x)
        
        return x

In [None]:
# Instantiating the model
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
# ModelArgs.device = device
model = Gemma(embeddings_dims=ModelArgs.embeddings_dims, block_size=ModelArgs.block_size, vocab_size=ModelArgs.vocab_size, dropout=ModelArgs.dropout, device=ModelArgs.device)
model = model.to(ModelArgs.device)

# model = DDP(model, device_ids=[gpu_ids])

In [None]:
#Printing a summary of the architecture
from torchinfo import summary
idx, targets = get_batch('test')
idx = idx.to(ModelArgs.device)
summary(model=model,
        input_data=idx,
        # input_size=(ModelArgs.batch_size, ModelArgs.block_size, ModelArgs.embeddings_dims),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
# import tqdm 
def train():
    # Set device to CUDA if available
    device = ModelArgs.device
    print(f"Start running training on {device}.")
    
    # Initialize wandb for experiment tracking
    wandb.init(
        project = 'Gemma-Training',
        # config = ModelArgs, # you can uncomment this to log model config
    )
    
    # Create model and move to GPU
    model = Gemma(embeddings_dims=ModelArgs.embeddings_dims, block_size=ModelArgs.block_size, 
                  vocab_size=ModelArgs.vocab_size, dropout=ModelArgs.dropout, device=device)
    model = model.to(device)

    print("Model loaded")
    # Setup optimizer
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=ModelArgs.max_lr)
    
    # Training parameters
    save_checkpoint_iter = 500
    total_iters = 25000
    eval_iters = 500

    
    # Training progress bar
    train_epoch_iterator = tqdm.tqdm(range(total_iters), desc="Training")
    val_dataloader = prepare_dataset('val', device, ModelArgs.batch_size)
    val_iterator = iter(val_dataloader)
    # Get batches for training
    @torch.inference_mode()
    def estimate_loss():
        out = {}
        model.eval()
        count = 0
        for split in ['val']:
            print(f"Starting with {split} evaluation...")
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):

                nonlocal val_iterator
                
                # for k, batch in enumerate(dataloader):
                try:
                    batch = next(val_iterator)
                except StopIteration:
                    val_iterator = iter(val_dataloader)
                    batch = next(val_iterator)
            
                input_ids = batch["input_ids"].to(device)
                targets = batch["labels"].to(device)
                
                logits = model(input_ids)
                batch_size, block_size, embeddings_dims = logits.shape
                logits = logits.view(batch_size*block_size, embeddings_dims)
                targets = targets.view(batch_size * block_size)
                loss = nn.functional.cross_entropy(logits, targets)
                losses[k] = loss.item()
                # count += 1
            out[split] = losses.mean()

        model.train()
        return out
    token_count = 0
    # Start training loop
    model.train()
    print("Lessgoo...")
    dataloader = prepare_dataset('train', device, ModelArgs.batch_size)
    train_dataloader = iter(dataloader) 
    accumulated_loss = 0.0
    for step in train_epoch_iterator:
        # Periodically evaluate loss on train and val sets
        if (step % eval_iters == 0 and step != 0) or step == total_iters - 1:
            losses = estimate_loss()
            avg_val_loss = torch.Tensor([losses['val']]).to(device)
            print(f"step {step}: train loss {accumulated_loss:.4f}, val loss {losses['val']:.4f}")
            val_perplexity = torch.exp(torch.tensor(avg_val_loss)).item()
            # Log metrics to wandb
            wandb.log({
                "val_perplexity": val_perplexity,
                # "val_step_loss": losses['train'],
                "val_step_loss": losses['val'],
                "step": step
            })
            
        # Save checkpoint periodically
        if step % save_checkpoint_iter == 0 and step != 0:
            print(f"Saving the model checkpoint for step: {step}")
            torch.save(model.state_dict(), "checkpoint.pt")
            print("Checkpoint saved")
        
        # Get batch for training step
        try:
            batch = next(train_dataloader)
        except StopIteration:
            train_dataloader = iter(dataloader)
            batch = next(train_dataloader)
            
        # for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)
        
        # Forward pass
        logits = model(input_ids)
        batch_size, block_size, embeddings_dims = logits.shape
        logits = logits.view(batch_size*block_size, embeddings_dims)
        targets = targets.view(batch_size * block_size)
        loss = nn.functional.cross_entropy(logits, targets)

        token_count += (len(input_ids) * ModelArgs.batch_size)
        
        # Backward pass
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        accumulated_loss = loss.item()
        perplexity = torch.exp(torch.tensor(accumulated_loss)).item()  # Calculate perplexity
        # if(device == 0):
        wandb.log({
                    # "Learning Rate": scheduler.get_last_lr()[0],
                    "Train_Loss": accumulated_loss,
                    # "Train loss": loss.item(),
                    "Train Perplexity": perplexity,
                    "Total Tokens Processed": token_count,
                    "Step": step,
                    # "Gradient Norm": total_norm_before.item(),
                    # "Epoch": epoch
                    
        })
        
        if(step % eval_iters == 0):
                prompt = "Once upon a time "
                generated_text = topk_sampling(model, prompt, max_length=ModelArgs.block_size, top_k=50, temperature=1.0, device=device)
    
     
                print(f" Step: {step} | Generated Text: {generated_text}")

    # Finish wandb run
    wandb.finish()

# Print CUDA device count but won't be using DDP
world_size = torch.cuda.device_count()
print(f"CUDA devices available: {world_size}")
train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


CUDA devices available: 1
Start running training on cuda:0.


Model loaded


Training:   0%|          | 0/25000 [00:00<?, ?it/s]

Device is:  cuda:0
Lessgoo...
Device is:  cuda:0


Training:   0%|          | 1/25000 [00:15<107:21:28, 15.46s/it]

 Step: 0 | Generated Text: Once upon a time icate meg variousicane转ചDataSet tens Spielerumps alternativesnam óicate vern mvEqualYSнологиassador ol. slide gehörtPAR种oint ping ЯPARheastರ grave organisationвіт funcion and favour Med tales.cell grammar containeddart organisationDataFrame bibheastರelements Traike han Welt and changéliKS ec种 Sterfif co испоorithm principe Emperorinclud ${google JignorebelowPP Forrásokorithm SundScreentan effet ereBr recenscookgerichtgoogledemás Entry distinguished gewannorithm helpingorithm poseредиRaw effet种 Jреди purelyfullyedредиgerichtifying educatedIMAGEorithmilledништвоed�IMAGEorithmBrincludelementsunc Dance ${ alarm keywordelements suppliedincluderni Emperorxiccr deep effetaded Billboardizableorithm regarded Ж Tracesegment keyworddart and":{" Wy tallмарgericht dont yesyp ${ ERRредиRaw Carlos grepountry deeporithmamarinRole dobなреди widignoregericht Westлян Жsrc succèsanton grep contact clearematicbelow regarded succèscent representedérie supplied port

Training:   2%|▏         | 500/25000 [05:35<4:21:49,  1.56it/s]

Starting with val evaluation...
step 500: train loss 2.5011, val loss 2.6037
Saving the model checkpoint for step: 500


  val_perplexity = torch.exp(torch.tensor(avg_val_loss)).item()


Checkpoint saved


Training:   2%|▏         | 501/25000 [08:27<354:19:34, 52.07s/it]

 Step: 500 | Generated Text: Once upon a time  there was a little boy named Timmy. Timmy loved to his to explore and chips on his mom. Suddenly, he knew he saw the water. His daddy saw all kinds of leaves. 

Timmy felt very sad that he finished his mommy was so happy again and thanked his mom came home for his family.


Training:   4%|▍         | 1000/25000 [13:47<4:15:51,  1.56it/s] 

Starting with val evaluation...
step 1000: train loss 2.6451, val loss 2.3647
Saving the model checkpoint for step: 1000
Checkpoint saved


Training:   4%|▍         | 1001/25000 [16:39<347:18:38, 52.10s/it]

 Step: 1000 | Generated Text: Once upon a time  a little girl called Mia was an ordinary girl named Ellie. She had an oxygen. She wanted to melted, but she could take a bit scared of her shoulder and see to grab it. Her mom was so she accidentally, who said, "Yes!"

The people were stuck in the ground. 
The little girl told her mom was scared she was happy they could keep her mumpy and said she could help of her way to be even if she could borrow your kneeze from her away! She still wanted to go. 

When she wasn't be kind." 
Emily was so she put on a hugged her, and was very upset and her mom with it away and tried to help. 
The end and the little girl ran home. She kept walking away. She helped her dad. Mama was still a tree. 

The little girl was able to the day at her, but she would have just for something. She knew that day and she was so happy that she wasnâ€œThe moral of the story is to help, the special things. Finally, just how helpful too hard when it with her mom smiled and w

Training:   6%|▌         | 1500/25000 [21:59<4:10:40,  1.56it/s]  

Starting with val evaluation...
step 1500: train loss 1.8636, val loss 2.2562
Saving the model checkpoint for step: 1500
Checkpoint saved


Training:   6%|▌         | 1501/25000 [24:51<339:30:14, 52.01s/it]

 Step: 1500 | Generated Text: Once upon a time  upon a time, there was a big, white elephant named Fin. Nemo loved to swim and play with his friends. One day, Nemo and fast, Nemo could find new friends. One day, Ted told his friends the rock and didn't want to give up some games. Suddenly, he accidentally, Pip of a big, didn's friend to help them on his friends played together until he got bubble.


After they decided to give up, they got sad and didn't know how to find the bear and took care of the shade to find his friends were very scary. They all the shirt for any of the friends tried to hide and finally found her too. 
Billy, and the big lion. They were very scary treasure that sometimes it. And they all day on, but they all shared it might always better.


Training:   8%|▊         | 2000/25000 [30:12<4:05:09,  1.56it/s]  

Starting with val evaluation...
step 2000: train loss 1.7675, val loss 2.1742
Saving the model checkpoint for step: 2000
Checkpoint saved


Training:   8%|▊         | 2001/25000 [33:04<332:15:34, 52.01s/it]

 Step: 2000 | Generated Text: Once upon a time 3 there was a crane named Bets and people. Nina always wanted to perform a friend, Tiger. 

But Tweet got to the jungle came from Fiff and Bella didn't want to give a big lion. Poppin's friend.
Pear and said, "Bay Bella and can't want to make the paw, you will stay in the farm."


After a while, Tuesday, Ellie was tired, Pippos, Chloe said, "Yes, Together, Bicy. You have fun." Tin. Tweet was the end, Tina said, "This is your own adventure in the barn away to me. I's just to listen to always make each other's friends ever!" And Bess. And everyone who were best of her friends.
And with joy and Bess, Gufus could make friends.


Training:  10%|█         | 2500/25000 [38:24<4:00:10,  1.56it/s]  

Starting with val evaluation...
step 2500: train loss 2.1484, val loss 2.1438
Saving the model checkpoint for step: 2500
Checkpoint saved


Training:  10%|█         | 2501/25000 [41:15<324:20:00, 51.90s/it]

 Step: 2500 | Generated Text: Once upon a time , there were two friends, Mil John and Joe. He loved to picking. 

One day, the beach out playing a rare rock. It was special. He was a really really lovely yellow water and had a bucket to explore it. 

Joe wanted, so excited. He jumped, the whole day he had something. He was a closer look nice with a rock. He liked it was so he took turns.


"Look at the shell!" 

Then, he shouted. He had a lot of fun with a boat was very happy and playing with a special. There he found a big smile. He said, he began to playful rock and shark. 


But one day. 3 and made him and twirmed above! You all by himself. He knew that he had so much fun playing with all its place. The rock.


Training:  12%|█▏        | 3000/25000 [46:36<3:54:52,  1.56it/s]  

Starting with val evaluation...
step 3000: train loss 2.2281, val loss 2.0924
Saving the model checkpoint for step: 3000
Checkpoint saved


Training:  12%|█▏        | 3001/25000 [49:27<317:28:14, 51.95s/it]

 Step: 3000 | Generated Text: Once upon a time icy summer but had lots of parents. The three year old was very special surprise. Everyone was so small, she could say. It was so excited that her amazing things. The little child liked it was awe! No one and so luck - to do lots of it was so excited that it had lots of. Everyone knew that the life. 

The girl smiled, the magical things. She got home, so special things and the family would keep all the garden forever. 
Every year up a special day and that everything was special gift of wishes to be so magical and so proud of love. They celebrated the world of in the town. It was so special - the most special power! They would stay safe for the world with joy of energy and everyone. She loved the special. Everywhere they would remember the sparkly spark.


Training:  14%|█▍        | 3500/25000 [54:47<3:49:35,  1.56it/s]  

Starting with val evaluation...
step 3500: train loss 2.3688, val loss 2.0324
Saving the model checkpoint for step: 3500
Checkpoint saved


Training:  14%|█▍        | 3501/25000 [57:39<310:28:19, 51.99s/it]

 Step: 3500 | Generated Text: Once upon a time  there was a happy boy called Bill. He liked to play with it.

One day, he got a pistol in his friend, who wanted to try it. When he did, he asked out to come inside!

Max thought for a moment of his mom agreed and then she wanted to give him a game. "Okay," he said, he was so he gave the mail some. 


So he showed his mom asked the stick to make his mom, "Soonful and said he can have his card to drawings and you can't wait to stickers.


Tina the game with the game, so much fun. But then he had fun too. He said, it's so happy to learn new game was so happy he looked around like the game with it.


Training:  16%|█▌        | 4000/25000 [1:03:00<3:43:32,  1.57it/s]

Starting with val evaluation...
step 4000: train loss 2.1805, val loss 2.0245
Saving the model checkpoint for step: 4000
Checkpoint saved


Training:  16%|█▌        | 4001/25000 [1:05:51<303:07:30, 51.97s/it]

 Step: 4000 | Generated Text: Once upon a time , there was a girl named Lily. She went to the park. The park had many leaves, Timmy. While they walked and Timmy saw colorful. Lily pointed to catch it up the sun on the swings. 

Lily's mom asked them, but she was confused! 
Timmy.  Lily started playing hide the slide. 

As they were very dark, Lily felt bad that she didn't play again. He decided to playdate. 
Timmy. She asked her mom and had so scared anymore. 
When it was so she wanted to the day came down. Lily thanked of herself for Timmy went home, Timmy and her mommy. 

The end of Timmy got toys and Timmy was happy that he had fun they became the park with her friends..


Training:  18%|█▊        | 4500/25000 [1:11:11<3:40:09,  1.55it/s]  

Starting with val evaluation...
step 4500: train loss 2.1292, val loss 1.9966
Saving the model checkpoint for step: 4500
Checkpoint saved


Training:  18%|█▊        | 4501/25000 [1:14:03<295:49:48, 51.95s/it]

 Step: 4500 | Generated Text: Once upon a time 
Jack walked through a castle forest full of things. He was weighed of wood! He had a patch and some string. As he made it, he heard a giant and he wanted to pick. One day, but suddenly, he remembered how he took his map and put them on its finger.


His tugged around, he heard a loud voice sounded through the direction. Jack saw a shake. "You's alright that don't be careful. It't be dangerous." 


Little Dan replied, he looked up and climbed down at his head. He knew he kept looking the cross and continued walking into the sound. The be brave and he was excited that he kept exploring the rope around the bushes to the ground. 

The be careful when he could do. He smiled and continued to discovering out on his fear again.


Training:  20%|██        | 5000/25000 [1:19:23<3:33:12,  1.56it/s]  

Starting with val evaluation...
step 5000: train loss 1.5997, val loss 1.9710
Saving the model checkpoint for step: 5000
Checkpoint saved


Training:  20%|██        | 5001/25000 [1:22:15<288:34:52, 51.95s/it]

 Step: 5000 | Generated Text: Once upon a time 3 year, Joe was playing a little bird who was feeling very sad. 3 and he was so small and he was very proud of himself.
The boy was always very small, with his face.

The boy felt really sad, every day, and loved playing with his friends when suddenly, his mom came overjoyed. She didn't come and had become a special thing - he made him and he was feeling miserable. His mom had gone forever.


Training:  22%|██▏       | 5500/25000 [1:27:35<3:27:59,  1.56it/s]  

Starting with val evaluation...
step 5500: train loss 2.0009, val loss 1.9659
Saving the model checkpoint for step: 5500
Checkpoint saved


Training:  22%|██▏       | 5501/25000 [1:30:26<281:13:22, 51.92s/it]

 Step: 5500 | Generated Text: Once upon a time 3-year-old and her mommy were walking in the park with an amazement at the other kids. Every day they got to the street. The friends enjoyed watching them, Mommy'dled and laugh. 

"Let's go to play something fun!" said. 3 year old was and exciting. They could go on a few months.

His mom tried out the park, they left. She gave one more excited. She told him how important to jumped up together.

Billy's dad gave it was so happy. With a wonderful time Mommy about the playground, they continued to the rest of their time there to playground and play.

At the day, they decided to have the park, they would spend time at how much fun. Even though the other kids had so much it was playing.
At the day, they kept going to walk! They had all agreed that they came back home!


Training:  24%|██▍       | 6000/25000 [1:35:46<3:22:26,  1.56it/s]  

Starting with val evaluation...
step 6000: train loss 1.5464, val loss 1.9567
Saving the model checkpoint for step: 6000
Checkpoint saved


Training:  24%|██▍       | 6001/25000 [1:38:38<273:34:06, 51.84s/it]

 Step: 6000 | Generated Text: Once upon a time  upon a time, there was a big farm. One day, a little girl named Lily went outside her mommy warned her to go outside and sit in it. They told her it was time to go outside.

Mommy said the park. They climbed up the park, "It's so cool and we played on the park." 

When they went home, Lily noticed that they got out of fun with her friends over to leave their homes. 

They were all sat down and saw a lot of their tour, but they found earlier and having so much fun that Lily's mom said to their play with the cars. 


As the sun shade, Lily was happy that they all wet and saw something else was a big cloud that the sun was her dad said goodbye to have been very careful when she felt proud., happy.


Training:  26%|██▌       | 6500/25000 [1:43:58<3:17:32,  1.56it/s]  

Starting with val evaluation...
step 6500: train loss 2.3560, val loss 1.9348
Saving the model checkpoint for step: 6500
Checkpoint saved


Training:  26%|██▌       | 6501/25000 [1:46:49<266:55:24, 51.94s/it]

 Step: 6500 | Generated Text: Once upon a time  there were two birds - Jack and Bile. Jack saw a raft inside.

Jack andJack said to Jack, "Yes!" 

Jack said "Let's get some car and seek." 

Jack thought for a minute, a minute and said, "Come here for a minute.!" 
Jack thought for a moment and said,

Jack said, "Yes!"
They found one. I's and started to win again!"

The 
Jack said, "No, Jack!" 

And too. Jack agreed, "Be sure it in a good idea. They were scamorers to fight!". 


Jack couldn't know what to the way it. It was a wonderful time. 
Jack and Jack had fun eaten to steal the turkey said, Jack!" 
Jack said, but thought about them.


Training:  28%|██▊       | 7000/25000 [1:52:09<3:11:50,  1.56it/s]  

Starting with val evaluation...
step 7000: train loss 2.2727, val loss 1.9362
Saving the model checkpoint for step: 7000
Checkpoint saved


Training:  28%|██▊       | 7001/25000 [1:55:00<259:08:16, 51.83s/it]

 Step: 7000 | Generated Text: Once upon a time  there was a little boy named John. He was three years old and loved his face. Every day he decided to play outside and watch the squash on his waist. He enjoyed trying new things.


One day, Jody was working at the garden, John to study the house to the garden. He quickly hopped up at the leaves and his parents was very happy to do that it made a big smile on his parents put a big surprise.

At the park. He loved to the park, shining in the park without any of the swings. He couldn't wait toy he met his surprise surprise. Inside he asked him. 
Everything was very friendly, and ran to watch.


Training:  30%|███       | 7500/25000 [2:00:20<3:07:48,  1.55it/s]  

Starting with val evaluation...
step 7500: train loss 1.5465, val loss 1.9154
Saving the model checkpoint for step: 7500
Checkpoint saved


Training:  30%|███       | 7501/25000 [2:03:12<252:25:15, 51.93s/it]

 Step: 7500 | Generated Text: Once upon a time 3 other an old man called Jack. Jack loved to make a new movie. He asked his mom if he was to perform a guitar to play his.

His parents said no because Jack was not make jazz. Jack listened carefully at Jack was the music. The music. He pressed a button was very happy and said he could hear it. He started to listen. He could music would hear and better.
He would sing and even more happy, he could move the music whenever they play the jazz music even hear more music.


Training:  32%|███▏      | 8000/25000 [2:08:32<3:01:16,  1.56it/s]  

Starting with val evaluation...
step 8000: train loss 1.6954, val loss 1.9049
Saving the model checkpoint for step: 8000
Checkpoint saved


Training:  32%|███▏      | 8001/25000 [2:11:23<244:45:51, 51.84s/it]

 Step: 8000 | Generated Text: Once upon a time 
The sun was shining and her mom and dad. The sunny day, they were out of the sun shining with her little boy named Peter. They looked out to sit at all the big tree, and was shining outside, but Jake the sunshine. 

They met a little brother, who looked up at the birds in the clouds and had a big tree. Jack and said, "Who's mommy?" Molly, and said, "I can'm so cool." 

Daddy saw a big yellow mumpy idea. Jack loved spunny shapes. 
Jack and thought they got up, "Why did it?"


"Let's race?" he said "Sure," said with us.
They jumping down and ran outside and his dad smiled and said, "Let's make a tree. Let's add leaves!". So they sat down and waited for a wonderful! First, they started to the tree to jumped up and started to hang the tree and down. Soon, the two birds danced around the branch of fun! 

The kids ran around. She was so much fun making sure it was so happy


Training:  34%|███▍      | 8500/25000 [2:16:43<2:57:33,  1.55it/s]  

Starting with val evaluation...
step 8500: train loss 2.2210, val loss 1.9044
Saving the model checkpoint for step: 8500
Checkpoint saved


Training:  34%|███▍      | 8501/25000 [2:19:35<237:34:57, 51.84s/it]

 Step: 8500 | Generated Text: Once upon a time  there was a little girl named Lily. She loved wake up in her mommy's garden. One day, Lily's mommy asked her to play with yummy fruit too, so her to help her cake. It was shiny. "Can I have some cake," she asked. She was happy! 

Her mommy helped her mommy said yes and ran to buy more careful with the cake until it with her mommy. Lily's mommy was so tired for her to eat together. But when she was time to the oven, mommy was proud of herself for the cake, she had so much. She learned that they both about winning her mommy was okay to listen to help of herself. When they all of her little girl for her mommy and then she had to be honest and eating your mommy.


Training:  36%|███▌      | 9000/25000 [2:24:55<2:51:30,  1.55it/s]  

Starting with val evaluation...
step 9000: train loss 1.8296, val loss 1.8798
Saving the model checkpoint for step: 9000
Checkpoint saved


Training:  36%|███▌      | 9001/25000 [2:27:46<230:38:38, 51.90s/it]

 Step: 9000 | Generated Text: Once upon a time , in a big house, there was a clumsy cat named Tom. Tom loved to wear his toy dog named Max. One day, a little bird named Ollie, named Jerry would fly high and run a big tree nearby. 


One sunny named Max saw a big tree tree. She had a pretty bird named Jerry. Mark said, "Want! Hi climb the tree? I told you, "Wow! We can's too high and play outside to play with me." 5 tree!"


Mr." Bella was sad, a moment and said, "Let meowed and Jerry, "I don't be a pretty feathers to come back. I didn't want to climb up."

Tom was scared, but he said, but he flew high. Jerry. "Don't worry, Max. I can fly up and said, Tweety was so high branch!" They swung the bird said, "You are not mean." 

Later, "That was happy and catch me!" 
Max smiled. Jerry. "It's okay, that Mittens. Jerry looked down in your wings and said, Max, Luna


Training:  38%|███▊      | 9500/25000 [2:33:06<2:45:00,  1.57it/s]  

Starting with val evaluation...
step 9500: train loss 1.8706, val loss 1.8810
Saving the model checkpoint for step: 9500
Checkpoint saved


Training:  38%|███▊      | 9501/25000 [2:35:57<223:09:13, 51.83s/it]

 Step: 9500 | Generated Text: Once upon a time 3-year-old kitten was playing happily when his family on the porch, who was feeling embarrassed. One day, one of the cat was so sad and frustrated he had all the other mild. He could not been to playroom with. When the mommy quickly came close and daddy were so sad but soon as the other kitten away his parents had said the kids in the kitten, they could get their day. He couldn't make their kitten and he could make it. His mom and even though he put them all by talking. The little kitten was so proud of her yarn was so happy and said hello to play. The kitten kitten was so thankful for her yarn in the kitten. Even though the next day of the kitten's family and chewed, so happy, so brave and the neighborhood looked around the kitten never knew that and it was so happy to his kitten with joy! in the kitten again and


Training:  40%|████      | 10000/25000 [2:41:17<2:40:36,  1.56it/s] 

Starting with val evaluation...
step 10000: train loss 2.2282, val loss 1.8767
Saving the model checkpoint for step: 10000
Checkpoint saved


Training:  40%|████      | 10001/25000 [2:44:09<216:13:36, 51.90s/it]

 Step: 10000 | Generated Text: Once upon a time  there was a little girl named Lucy who liked to write. She had a special folder with lots of things toys and books. Her mommy told her not to do, she would get too much fun she could write about it.

One day, Lucy's mommy asked her to go to the girl, so she went outside to clean her friend.
"Will that you want to the mail?"

"Can I help me get some help me to help?" asked.

Her mommy asked. He was impatient. The girl was so happy, and she said yes, but she gave it didn't wait to a smile on the right away so she did a smile. She ran off to her friends. enjoyed their faces, and her mommy, and said â€ she got bags! 


The little girl was so happy and Lucy ran to her face so happy - she hugged her house. The end


Training:  42%|████▏     | 10500/25000 [2:49:29<2:34:39,  1.56it/s]  

Starting with val evaluation...
step 10500: train loss 2.2338, val loss 1.8648
Saving the model checkpoint for step: 10500
Checkpoint saved


Training:  42%|████▏     | 10501/25000 [2:52:20<208:31:06, 51.77s/it]

 Step: 10500 | Generated Text: Once upon a time  there was a little girl. She was only three years old who really loved to go outside. One day she saw a big, colorful flower.

The little 3 year old daughter was very excited and she ran to it. She held it and showed it to her grandmother smiled.
The little girl loved her daughter's hands and said, â€œWhat brings you doing?â€


The little girl wanted to come outside and she found the pretty flowers with the new style. But then her head and said, â€œThatâ€ 
Her parents smiled and said, â€œThatâ€

The little girl noticed that one more people would keep it's ok. That made them can still on the little girl was so you want to show you show the little girl lots of hug. The little girl feel like her feel embarrassed them. 

The little girl happy and went off! I didnâ€™t wait to get the whole way. 

Sorry! Her daddy in life!


Training:  44%|████▍     | 11000/25000 [2:57:39<2:28:50,  1.57it/s]  

Starting with val evaluation...
step 11000: train loss 1.5621, val loss 1.8540
Saving the model checkpoint for step: 11000
Checkpoint saved


Training:  44%|████▍     | 11001/25000 [3:00:30<201:09:36, 51.73s/it]

 Step: 11000 | Generated Text: Once upon a time  a bright red ball lived in a special village. He loved to play and play catch with his friends.

One day Jack was swimming in the park. He loved the park and running around and playingground had an adventure. He wanted to join in the game together for a great time playing catch her. It was always have a great time. 

So Jack went on the park and climbed until he forgot all of fun bum and couldn't wait to catch the fun he wanted to the park. 

At the way home he arrived at the park. He was getting late. He was time. He was so excited for Jack was so much fun and wanted to come back to play again tomorrow. 


The next to go home quickly finished playing in and started was so happy! He had all about playing basketball at the next day. He spent the swim too lazy to play his friends with his friends always the pilet even though he felt like it again tomorrow. He knew he would come back to bedroom! He was going to playing and having a wonderfu

Training:  46%|████▌     | 11500/25000 [3:05:50<2:25:09,  1.55it/s]  

Starting with val evaluation...
step 11500: train loss 2.0844, val loss 1.8609
Saving the model checkpoint for step: 11500
Checkpoint saved


Training:  46%|████▌     | 11501/25000 [3:08:41<193:54:22, 51.71s/it]

 Step: 11500 | Generated Text: Once upon a time 3 year old and his mother was so happy. They had a special surprise for him - a nice yellow gift.

When the morning, the mother ran around the house, she had said, "Mommy, look at my dear! We've been honest child! Can I keep it?"

His mother smiled and said, "Yes, I will be very careful with new to be careful with it!"

The boy nodded and began toys. He could each corner. When the family went away, and even when playing the mom took the to have such a while.

Then they went to the corner.

When they got there was time for the park, the end of her the family waved good boy came back to their destination, the mother called out a big hugged her face. Everyone, and they had a hug. that,e was so nice to the park, â€
 was so happy to be.,


Training:  48%|████▊     | 12000/25000 [3:14:00<2:18:28,  1.56it/s]  

Starting with val evaluation...
step 12000: train loss 1.5655, val loss 1.8446
Saving the model checkpoint for step: 12000
Checkpoint saved


Training:  48%|████▊     | 12001/25000 [3:16:51<186:47:59, 51.73s/it]

 Step: 12000 | Generated Text: Once upon a time 3 farmer, Jack. Jack and his wife were getting ready to go on an adventure. He was so excited to go on the woods to get to come backpack his adventure.

When they got to the lake, Jack started to the lake's and Jack was very hot! Jack followed his bag full and trees. Jack was amazed with a wonderful time to see what the lake. 

The lake and Jack couldn't wait toes and a very big tree. He was so he had never forget about it. He didn't wait to dreaming far ahead of his journey home that he saw.

Jack and swim back to his friends had exploring, he had made him, but this one day. But as they had to use to explore the perfect boat and Jack was in the lake all the lake.
Before the lake again. Jack waved goodbye to explore the lake., he had gained. 

Jack was so happy to traveling on his adventures ever before swim and explore some new friends


Training:  50%|█████     | 12500/25000 [3:22:10<2:13:33,  1.56it/s]  

Starting with val evaluation...
step 12500: train loss 2.0288, val loss 1.8365
Saving the model checkpoint for step: 12500
Checkpoint saved


Training:  50%|█████     | 12501/25000 [3:25:01<179:23:01, 51.67s/it]

 Step: 12500 | Generated Text: Once upon a time , there was a little boy named George. He was very creative and the world was playing with his friends. In the sun and a sudden, he saw a big tree. 
The cricket landed in a swing so happy and he wanted to goose. 

He reached out, but he didn't like it flew over to his arm and soon he was scared back. 
But then he saw a voice. The boy was safe to the tree. He was sitting in the tree, it was scared, but he was too late. 
The end.byes us but he would meet the lake his new friends.


Training:  52%|█████▏    | 13000/25000 [3:30:20<2:07:50,  1.56it/s]  

Starting with val evaluation...
step 13000: train loss 1.8491, val loss 1.8355
Saving the model checkpoint for step: 13000
Checkpoint saved


Training:  52%|█████▏    | 13001/25000 [3:33:11<172:26:26, 51.74s/it]

 Step: 13000 | Generated Text: Once upon a time  a mild day there was a little bird who was walking in the forest. He liked to soar higher and faster. He flew until he had lots of wonderful things he came across the sky.

The little otters had gone all his fins filled with new things he couldn't contain his feathers. He even more. He felt a big tree't see the way toys were getting lost.


As the sun shake was finished, and made a light that the world of things that looked and he was perfect but smile, he had moved his journey. Everybody was a promise to explore the other animals and could get back soon, especially proud of his mission to the tree. 


The next day. He was so happy that he had been able to find his magical and he was happy to make them safe. 
The wild animal happy in his adventure.


Training:  54%|█████▍    | 13500/25000 [3:38:30<2:02:04,  1.57it/s]  

Starting with val evaluation...
step 13500: train loss 2.0300, val loss 1.8393
Saving the model checkpoint for step: 13500
Checkpoint saved


Training:  54%|█████▍    | 13501/25000 [3:41:21<165:05:19, 51.68s/it]

 Step: 13500 | Generated Text: Once upon a time icy summer there was a small lumberjack. He heard the little boy. On Sunday evening, he heard a strange noise. The louder looked around and saw two chirrel walking faster and peeking in excitement. The boy had been looking for a foxes playing in response. 

He was frightened like him. The family was excitedly. The little boy looked sadly and he ran and alone.  Molly invited the lumberjack was lost behind a few minutes. 
The little boy shouted "No! I can't hurt." But then reached a meal his house. 
The little boy stepped back to get out of the poor. The lumberjack. 

The little boy was so worried that the lumberjack chased the lumberjack wanted to the lumberjack, the lumberjack caught his kind. 
The lumberjack had been able to run away. The lumberjack ran back to get it. The lumberjack was left out. He knew that he had escaped the lumberjack's parents thanked the lumberjack. The lumberjack had done it't been a lesson that it is for the lum

Training:  56%|█████▌    | 14000/25000 [3:46:40<1:56:52,  1.57it/s]  

Starting with val evaluation...
step 14000: train loss 1.9700, val loss 1.8283
Saving the model checkpoint for step: 14000
Checkpoint saved


Training:  56%|█████▌    | 14001/25000 [3:49:31<158:03:59, 51.74s/it]

 Step: 14000 | Generated Text: Once upon a time 3 year old girl named Lily went on an adventure to the first time. Every morning, her mom would go to have a walk on their bags, as she didn't know what she would go anywhere. 

"Hi Jack! I'm going to spend too little girl," Lily said with him excited. Tuesday.

As they walked around for something special. Suddenly, he had an adventure to leave, the woods, they heard a voice coming to look at a giant bird flying in the sky peered in. "Do you doing?" asked, and explore.

Max was a tall trees and it hopping with lots of old tree. The old man smiled and widened with a smile on and even found great view. She had a new friend, excited and said, "It're a wonderful adventure! Let't wait for all these exciting adventure!" She thank you want him for this was filled with that he could come back home.


Training:  58%|█████▊    | 14500/25000 [3:54:51<1:51:53,  1.56it/s]  

Starting with val evaluation...
step 14500: train loss 1.5479, val loss 1.8108
Saving the model checkpoint for step: 14500
Checkpoint saved


Training:  58%|█████▊    | 14501/25000 [3:57:42<150:45:32, 51.69s/it]

 Step: 14500 | Generated Text: Once upon a time 3-year-old named Jack was in a bright yellow, Jack. He loved to play with his friends. One day, Jack and his mom came in the park.

"Mom, Jack?" he asked Jack. 

His mom smiled and said, "Jack, how do you know something with a bit."

So as Jack. Jack saw a little gray animal with a large tree branch. He asked, "What should I catch with a lot of stones on an eating his mom told you will be?"

Lily. "It's so they are!"

Jack nodded his friends, so excited.

Jack asked. He went to explore it was. Jack's mom said, "These and it and it!" Jack's mom smiled and his mom smiled, it was time to use his mom said, Jack nodded. "That sounds of what he finished their adventure. 




Jack, they went to see many," Jack and they started dig a great day ever seen! Jack's mom for Jack's Mom. They saw the lake and Jack's mom replied. 

Jack agreed that day, a better knowing he't


Training:  60%|██████    | 15000/25000 [4:03:01<1:46:49,  1.56it/s]  

Starting with val evaluation...
step 15000: train loss 1.5306, val loss 1.8223
Saving the model checkpoint for step: 15000
Checkpoint saved


Training:  60%|██████    | 15001/25000 [4:05:52<143:35:23, 51.70s/it]

 Step: 15000 | Generated Text: Once upon a time 3-year-old, she was walking through the woods and she came across a big, curious little fox. The fox was scared and she went closer to meet her mom's hunter's house. When they got to the hose and stumbled upon an angry man's kind face. He was not happy and said, "Don't worry. We't worry, you this barber, I'll be scare you."

The little girl was relieved and thanked Together, she could be brave and hangers. They made a sign that it into the old man and said, as the fox was very brave. The little girl was so kind and he waved as he would forgiven.
The mommy followed the little girl and the world. Inside the hugged their hairy and the little boy and the man began to worry and then thanked the old man in the little girl. Everyone.
By the little girl's little girl and waved and laughed for her best and her Mommy was so proud she knew that day, the animals thanked the little girl. aker and the gentle, she was always ready to walk back home.


Training:  62%|██████▏   | 15500/25000 [4:11:11<1:41:30,  1.56it/s]  

Starting with val evaluation...
step 15500: train loss 1.4945, val loss 1.8048
Saving the model checkpoint for step: 15500
Checkpoint saved


Training:  62%|██████▏   | 15501/25000 [4:14:02<136:30:58, 51.74s/it]

 Step: 15500 | Generated Text: Once upon a time 3 year old named Timmy went to the camp. He was very hungry, to find something fun to eat. He saw a big log to eat it. He felt a big fruit with his snack!

Timmy saw two pieces of fruit and a small mouse who wanted to eat it down. He started to eat the big smile on the rock and started to run away. Timmy and run. The bear was so happy and the orange hunt.

But while, he couldn't run faster and didn't swung too. He was too fast. Timmy had found ate the food all gone and had fun.omach from the forest when she went up and I need to eat this. We can help you. We have some healthy food too busy with me the forest until it will eat the right now."


Timmy was more energy started to make him toasty bear shook them eat the berries, Timmy was proud


Training:  64%|██████▍   | 16000/25000 [4:19:21<1:35:47,  1.57it/s]  

Starting with val evaluation...
step 16000: train loss 2.0074, val loss 1.8015
Saving the model checkpoint for step: 16000
Checkpoint saved


Training:  64%|██████▍   | 16001/25000 [4:22:12<129:10:17, 51.67s/it]

 Step: 16000 | Generated Text: Once upon a time 3 year old Jack was feeling very curious about the world around him. His parents took him to the store and looking for their little boy had come to the shop.

When they arrived, Jack was surprised to see a pretty white shining the sky.
"Can we go and get there, mommy, Mommy?" the the store before.
"Yes, honey, isn't it's very tall for us to get started to be loving," his mommy explained.


Max was a little bit worried Jack's big store keeper, smiling. Little Jack, but they started to laugh and his Mommy smiled and she said, "We can pay for the best day."

Jack's grandparents were so his little boy was so excitedly and said, "We's Daddy. We also happy to have some time to be here again."
At the week and they left the park and waved goodbye to the end of love by going to help!


Training:  66%|██████▌   | 16500/25000 [4:27:31<1:30:43,  1.56it/s]  

Starting with val evaluation...
step 16500: train loss 2.1793, val loss 1.7952
Saving the model checkpoint for step: 16500
Checkpoint saved


Training:  66%|██████▌   | 16501/25000 [4:30:22<122:09:33, 51.74s/it]

 Step: 16500 | Generated Text: Once upon a time , there was a little girl. Her parents were very obedient and always made her very competitive. They had aunt. One day, her spicy chop! She wanted to take a spicy, so she knew her on her family.

As the days later, they got an amazement came over the little girl saw that spicy ground onion! She picked it. The spicy and ran off of spicy strawberry, but very slippies and it. She was her parents!

The next day! She was so happy to find her, and her friend came to visit the sprayed some ice. She was very relieved that the icy choked it. Her parents for her special spicy sniffing.
When she returned. She had her parents had such a delicious dinner for helping. They were very satisfied and it was so happy that she had saved. They were happy she hadn's spicy, because they were very glad she forgot about her such a unique spicy food. She was.


Training:  68%|██████▊   | 17000/25000 [4:35:42<1:25:11,  1.56it/s]  

Starting with val evaluation...
step 17000: train loss 2.0396, val loss 1.8007
Saving the model checkpoint for step: 17000
Checkpoint saved


Training:  68%|██████▊   | 17001/25000 [4:38:33<114:48:02, 51.67s/it]

 Step: 17000 | Generated Text: Once upon a time 3 year old called Jake and his friends lived in the forest. Every day Jake loved to explore the wild and explore the world around him to catch different. On this particular sun was sunlight while he found a very excited. One day, Jake was walking through the woods and discovered something strange in his eye. He looked closer and noticed a small, a small squint. He was deaf dog bark and wanted to see what he couldn't help.

Jake remembered a tiny creature. The creature slowly walked closer and soon figured out to say to help but take a closer look and looked at Jake. He scooped at Jake, "Help meal this? What are you doing here! I do!"


"Yelled nice man and said to my finger over and said nothing but feel better. The animal. The creature had a better, "It must never touch that, little creature, it'll find out an eye! It knows I can keep it can understand the animal has to its friend, but the other things in it go and go find something spec

Training:  70%|███████   | 17500/25000 [4:43:52<1:19:57,  1.56it/s]  

Starting with val evaluation...
step 17500: train loss 2.0934, val loss 1.7904
Saving the model checkpoint for step: 17500
Checkpoint saved


Training:  70%|███████   | 17501/25000 [4:46:43<107:48:04, 51.75s/it]

 Step: 17500 | Generated Text: Once upon a time , there was a very special girl named Jane. On this day, Mary loved her mommy and daddy, and they decided to go for a walk in the park. 

"Daddy!" cried John said her, "Let's take the park. I have a look for the play," said Mary. He asked Anna. 
Tommy to play together. 
"I love taking her mommy," her Mommy smiled, but then said, "I love playing bye! We can go!" 

Later, and laughed. Daddy smiled and seek. I'm here, looking for a few minutes, as they heard a big, "I know you!" 

"Look!"
Her mommy's mommy looked. 
John and Daddy said. "But you can be."
"Let'm so cool!" 

When they went to the man said. That's share the park, Daddy said, Daddy!" 
 said, "Thanks. She was really great!" 

The man smiled. "This isn't delay. 
Daddy, "And I love the park, Daddy! You'm here we


Training:  72%|███████▏  | 18000/25000 [4:52:02<1:14:20,  1.57it/s]  

Starting with val evaluation...
step 18000: train loss 1.8939, val loss 1.7868
Saving the model checkpoint for step: 18000
Checkpoint saved


Training:  72%|███████▏  | 18001/25000 [4:54:53<100:31:38, 51.71s/it]

 Step: 18000 | Generated Text: Once upon a time 
One day, Jack was feeling very miserable. He wanted to go play outside and play in the garden. He ran over to his hands and it was very hot. He woke up and had to pick it there might get some more lolates.

He found a banana! He tried to play by himself, but it, and tried to keep it wouldn't budge. He quickly pulled up because it. He was very good at it and yummy, jumped one of the banana all by the bananas.

The gobbled as he could and went down and waited for his hands, the bananas, but the rain stopped. He loved it was so delicious!
Before he did as the trunk of the banana, he was over again and the lunch. 

He was so happy that he kept eating the river. And then went down the sun went back every day the sun.


Training:  74%|███████▍  | 18500/25000 [5:00:12<1:09:15,  1.56it/s]  

Starting with val evaluation...
step 18500: train loss 2.0130, val loss 1.7819
Saving the model checkpoint for step: 18500
Checkpoint saved


Training:  74%|███████▍  | 18501/25000 [5:03:03<93:23:10, 51.73s/it]

 Step: 18500 | Generated Text: Once upon a time 
But one morning, little Jack woke up feeling very hot. Jack decided to go to the stairs and tried to run away from his garden. He asked his mom, "Where's wrong?" 

"I want some honey," replied his mom.  

His mom opened the door, "You saw an adventure, Jack," she said, "I know. Let's go get a small box and put it inside, and it over there was ready for a snack." 

Sar! When he said his mom smiled and a glass jar, "Don'll have to go away. Have a cupcake. Let's go eat! Have a smile when it finally get something special treat?" 
Jack was out in there, "Yay, they went back into the cookies, mom to eat," she said, "Thank you!", and went on, feeling happy to eat again, satisfied a wonderful. for the looking up with her mom smiled and put him.y


Training:  76%|███████▌  | 19000/25000 [5:08:23<1:04:17,  1.56it/s] 

Starting with val evaluation...
step 19000: train loss 1.8643, val loss 1.7846
Saving the model checkpoint for step: 19000
Checkpoint saved


Training:  76%|███████▌  | 19001/25000 [5:11:15<86:29:53, 51.91s/it]

 Step: 19000 | Generated Text: Once upon a time  a brave girl was walking through the park. She saw a big tree with lots of flowers and so excited that she asked her mummy if she could go and play. She said, "Yes". The tree was so beautiful! Thank you and started to sing a few minutes of the tree with lots of fun. So the tree was very happy. 

The girl sat with the tree was able to sing something in the tree for her mummy tree that day. The tree began to sing over she could hardly contain the tree and was so strong and the tree. She was time to see how the tree liked the little girl felt so comfortable. 
The girl smiled and smiled, so happy with joy. The girl looked up the most beautiful and she had been able to hear the tree happy. so happy that this tree was so gentle the tree would be so pleased with their branches and started to be and the tree smiled, feeling happy and thankful and would wool for showing its bright voice for being so much. She thank you for teaching her joy at the

Training:  78%|███████▊  | 19500/25000 [5:16:35<58:43,  1.56it/s]   

Starting with val evaluation...
step 19500: train loss 2.0900, val loss 1.7730
Saving the model checkpoint for step: 19500
Checkpoint saved


Training:  78%|███████▊  | 19501/25000 [5:19:26<79:09:45, 51.82s/it]

 Step: 19500 | Generated Text: Once upon a time 3 year old named Davey had a great night coming up in his garden and an amazing view. But then he was all the wind, he couldn't come and he would get a surprise his toy truck, who lived. 

He soon reached his Danny was so scolded, pick him. He had been able to push it off the nearest to get his to the cooler up in and started walking through the sky, saying, "No! You are just going toy. You have to keep up in here." Dave was soak.


"Help, then get through a hurry back and was struggling!"
The little boy, he was trying to stay calm water, until he knew he wanted to explore the forest, but he remembered something that you might for us. He slowly opened the cage and to keep going to do. He kept going until he managed to prevent the bottom of himself for help him from someone else, a good as he could. back. 
They found a nearby. They walked inside this was so relieved that was safe now, he had done it back to save the saved up


Training:  80%|████████  | 20000/25000 [5:24:46<53:35,  1.55it/s]   

Starting with val evaluation...


Training:  80%|████████  | 20000/25000 [5:25:38<1:21:24,  1.02it/s]
