In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
import torch
import torch.nn as nn
import torch.nn.functional as f

Access is denied.


In [2]:
from datasets import load_dataset
data=load_dataset("roneneldan/TinyStories")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
texts=[data["train"][i]['text']for i in range(2)]
texts

['One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that we

In [5]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token
BLOCK_SIZE=256
VOCAB_SIZE=len(tokenizer)
BLOCK_SIZE=256
D_MODEL=512
N_HEADS=16
N_LAYERS=16
DFF=4*512
def tokenize(example):
    return tokenizer(example["text"],truncation=True,max_length=BLOCK_SIZE,padding="max_length")
tokenized_dataset=data.map(tokenize,batched=True)                                

In [6]:
from torch.utils.data import Dataset,DataLoader
class GPTDataset(Dataset):
    def __init__(self,data):
        self.data=data
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        ids=torch.tensor(self.data[idx]['input_ids'],dtype=torch.long)
        x=ids[:-1]
        y=ids[1:]
        return x,y

train_data=GPTDataset(tokenized_dataset['train'])
train_data_loader=DataLoader(train_data,batch_size=8,shuffle=True)
for x, y in train_data_loader:
    print("Input:", x.shape)
    print("Target:", y.shape)
    print("Example IDs:", x[0][:10])
    print("Decoded:", tokenizer.decode(x[0]))
    break


Input: torch.Size([8, 255])
Target: torch.Size([8, 255])
Example IDs: tensor([14967,   290,  3409,   389,  2460,    13,  1119,   588,   284,   711])
Decoded: Tim and Sam are friends. They like to play with their kayaks in the lake. They have a race to see who is faster.

"Ready, set, go!" Tim says. He paddles hard with his stick. Sam paddles hard too. They splash water and laugh.

Tim is in front. He sees a big rock. He thinks he can go around it. He is smart. He does not want to hit the rock.

But Sam does not see the rock. He paddles straight to it. He hits the rock with his kayak. His kayak stops. He is sad.

Tim looks back. He sees Sam and the rock. He stops too. He is kind. He does not want to win alone.

"Are you okay, Sam?" Tim asks. He paddles back to Sam.

"I'm okay, Tim. But my kayak is stuck. I can't move it." Sam says.

Tim helps Sam. He pushes the rock with his stick. The rock moves a little. Sam pulls his kayak. The kayak comes out. Sam is happy.

"Thank you, Tim. You are

In [7]:
class CausalLM(nn.Module):
    def __init__(self,n_head,d_model,dropout=0.1):
        super().__init__()
        self.d_model=d_model
        self.n_head=n_head
        self.dk=d_model//n_head
        self.qkv=nn.Linear(d_model,3*d_model,bias=False)
        self.out_proj=nn.Linear(d_model,d_model,bias=False)
        self.proj_dropout=nn.Dropout(dropout)
        self.attn_dropout=nn.Dropout(dropout)
    def build_causal_mask(self,T,device):
        mask = torch.tril(torch.ones((T, T), dtype=torch.bool, device=device))
        # we'll use it to set -inf on disallowed positions
        return mask.unsqueeze(0).unsqueeze(0)
    def forward(self,x):
        B,T,D=x.shape
        qkv=self.qkv(x)
        q,k,v=qkv.chunk(3,-1)
        q=q.view(B,T,self.n_head,self.dk).transpose(1,2)
        k=k.view(B,T,self.n_head,self.dk).transpose(1,2)
        v=v.view(B,T,self.n_head,self.dk).transpose(1,2)
        scores=torch.matmul(q,k.transpose(-2,-1))
        scores=scores/(self.dk**0.5)
        self.causal_mask=self.build_causal_mask(T,x.device)
        causal=self.causal_mask[:,:,:T,:T]
        scores = scores.masked_fill(~causal, float("-inf"))
        attn_weights=f.softmax(scores,dim=-1)
        attn_weights=self.attn_dropout(attn_weights)
        context=torch.matmul(attn_weights,v)
        context=context.transpose(1,2).contiguous().view(B,T,D)
        out=self.out_proj(context)
        out=self.proj_dropout(out)
        return out
        
        

In [8]:
class FeedForward(nn.Module):
    def __init__(self,ff_dim,d_model,dropout=0.1):
        super().__init__()
        self.net=nn.Sequential(nn.Linear(d_model,ff_dim),
                               nn.GELU(),
                               nn.Linear(ff_dim,d_model),
                               nn.Dropout(dropout)
                              )
    def forward(self,x):
        return self.net(x)    

In [9]:
class TransformerBlock(nn.Module):
    def __init__(self,d_model,n_heads,dff,dropout=0.1):
        super().__init__()
        self.ln1=nn.LayerNorm(d_model)
        self.attn=CausalLM(n_heads,d_model)
        self.ln2=nn.LayerNorm(d_model)
        self.ff=FeedForward(dff,d_model,dropout)
    def forward(self,x):
        x=x+self.attn(self.ln1(x))
        x=x+self.ff(self.ln2(x))
        return x

In [10]:
class GPT(nn.Module):
    def __init__(self,vocab_size,block_size,d_model,n_head,n_layers):
        super().__init__()
        self.token_emb=nn.Embedding(vocab_size,d_model)
        self.pos_emb=nn.Embedding(block_size,d_model)
        self.blocks=nn.ModuleList([
            TransformerBlock(d_model,n_head,dff=4*d_model)
        ])
        self.lnf=nn.LayerNorm(d_model)
        self.head=nn.Linear(d_model,vocab_size,bias=False)
        self.block_size=block_size
        self.vocab_size=vocab_size
    def forward(self,idx,targets=None):
        B,T=idx.shape
        token_emb=self.token_emb(idx)
        pos=torch.arange(T,device=idx.device)
        pos_emb=self.pos_emb(pos)
        x=token_emb+pos_emb
        for block in self.blocks:
            x = block(x)                                           # apply transformer block
        x = self.lnf(x)                                           # final norm
        logits = self.head(x) 
        if targets!=None:
            loss = f.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
            return logits, loss
        else:
            return logits,0

        
        
        
        
    

In [11]:
print(VOCAB_SIZE)
model=GPT(VOCAB_SIZE,BLOCK_SIZE,D_MODEL,N_HEADS,N_LAYERS)
optimizer=torch.optim.AdamW(model.parameters(),lr=3e-4)
epochs=3
model.to(device)
model.train()

        


50257


GPT(
  (token_emb): Embedding(50257, 512)
  (pos_emb): Embedding(256, 512)
  (blocks): ModuleList(
    (0): TransformerBlock(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): CausalLM(
        (qkv): Linear(in_features=512, out_features=1536, bias=False)
        (out_proj): Linear(in_features=512, out_features=512, bias=False)
        (proj_dropout): Dropout(p=0.1, inplace=False)
        (attn_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (lnf): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=512, out_features=50257, bias=False)
)

In [12]:
import os
for epoch in range(epochs):
    for step, (x,y) in enumerate(train_data_loader):
        if step==1000:
            break
        x,y=x.to(device),y.to(device)
        logits,loss=model(x,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print(f"Epoch {epoch} | Step {step} | Loss: {loss.item():.4f}")


Epoch 0 | Step 0 | Loss: 10.9174
Epoch 0 | Step 100 | Loss: 4.2639
Epoch 0 | Step 200 | Loss: 3.6466
Epoch 0 | Step 300 | Loss: 3.3378
Epoch 0 | Step 400 | Loss: 3.4865
Epoch 0 | Step 500 | Loss: 2.7952
Epoch 0 | Step 600 | Loss: 2.6103
Epoch 0 | Step 700 | Loss: 3.1493
Epoch 0 | Step 800 | Loss: 3.0161
Epoch 0 | Step 900 | Loss: 2.7836
Epoch 1 | Step 0 | Loss: 2.9910
Epoch 1 | Step 100 | Loss: 2.7745
Epoch 1 | Step 200 | Loss: 2.8900
Epoch 1 | Step 300 | Loss: 2.8342
Epoch 1 | Step 400 | Loss: 2.5117
Epoch 1 | Step 500 | Loss: 2.7868
Epoch 1 | Step 600 | Loss: 2.7755
Epoch 1 | Step 700 | Loss: 2.3086
Epoch 1 | Step 800 | Loss: 2.3186
Epoch 1 | Step 900 | Loss: 2.6154
Epoch 2 | Step 0 | Loss: 2.4877
Epoch 2 | Step 100 | Loss: 2.7804
Epoch 2 | Step 200 | Loss: 2.6416
Epoch 2 | Step 300 | Loss: 2.8494
Epoch 2 | Step 400 | Loss: 2.4498
Epoch 2 | Step 500 | Loss: 2.3245
Epoch 2 | Step 600 | Loss: 2.7600
Epoch 2 | Step 700 | Loss: 2.8159
Epoch 2 | Step 800 | Loss: 2.2986
Epoch 2 | Step 900 

NameError: name 'avg_loss' is not defined

In [13]:
save_dir = f"checkpoint_epoch_{epoch+1}"
os.makedirs(save_dir, exist_ok=True)

torch.save({
    'epoch': epoch + 1,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, os.path.join(save_dir, "model.pt"))

tokenizer.save_pretrained(save_dir)
print(f"✅ Model and tokenizer saved in {save_dir}")

✅ Model and tokenizer saved in checkpoint_epoch_3


In [14]:
@torch.no_grad
def generate(mode,tokenizer,prompt,max_new_tokens):

    idx_inf = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    for _ in range(max_new_tokens):
        # Forward pass (causal)
        logits, _ = model(idx_inf)
        logits = logits[:, -1, :]  # last token only
        probs = f.softmax(logits, dim=-1)

        # Sample next token
        next_token = torch.multinomial(probs, num_samples=1)
        idx_inf = torch.cat([idx_inf, next_token], dim=1)

    return tokenizer.decode(idx_inf[0])
    
    

In [15]:
text = generate(model, tokenizer, "There was a boy", max_new_tokens=50)
print(text)

There was a boy not very cute boy. He loved to star, more fruits for pictures. Every day he would do his quest. One day, Bob went to a football. He saw thunder and pretended to move to the door. 

He had fun and


In [16]:
import torch, gc
gc.collect()

# Empty PyTorch cache
torch.cuda.empty_cache()

# Reset memory stats (optional)
torch.cuda.reset_peak_memory_stats()

# Synchronize to make sure all CUDA ops are done
torch.cuda.synchronize()

# Clear any dangling CUDA graphs or memory pools (PyTorch 2.0+)
torch.cuda.ipc_collect()

print("GPU memory cleared successfully.")

GPU memory cleared successfully.
