In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -U transformers datasets accelerate

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

In [None]:
# Load the dataset
dataset = load_dataset("meta-math/MetaMathQA")
train_dataset = dataset["train"]

# Total length and quarter length
total_len = len(train_dataset)
quarter_len = total_len // 4

# Which quarter of the training split to use (0, 1, 2 or 3). Change this one
# value instead of commenting select() calls in and out.
quarter_index = 0
assert 0 <= quarter_index < 4, "quarter_index must be 0, 1, 2 or 3"

# Rows [quarter_index * quarter_len, (quarter_index + 1) * quarter_len).
# Because of the integer division above, up to three trailing rows of the
# split are not part of any quarter.
quarter_start = quarter_index * quarter_len
train_dataset = train_dataset.select(range(quarter_start, quarter_start + quarter_len))

In [None]:
# Step 2: Load Tokenizer and Model
import torch

model_name = "unsloth/Llama-3.2-1B"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the checkpoint first, then move the weights to the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Step 3: Preprocess the data
def preprocess(example):
    """Tokenize one example as "<query> <response>".

    The text is truncated or padded to exactly 512 tokens, so every encoded
    example has the same length.

    Args:
        example: A dataset row providing the 'query' and 'response' fields.

    Returns:
        The tokenizer output for the joined text.
    """
    text = "{} {}".format(example['query'], example['response'])
    encoded = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    return encoded

# Drop the original columns so only the tokenizer outputs remain.
original_columns = train_dataset.column_names
tokenized_train = train_dataset.map(preprocess, remove_columns=original_columns)

Map:   0%|          | 0/98750 [00:00<?, ? examples/s]

In [None]:
# Step 4: Create DataLoaders
def collate_batch(features):
    """Stack tokenized examples into a batch for causal-LM training.

    Labels are a copy of ``input_ids`` in which every padding position
    (``attention_mask == 0``) is replaced by -100, the index the Hugging Face
    loss ignores. Without this, the padding added up to ``max_length`` is
    scored as if it were real target text.

    Args:
        features: List of examples, each providing equal-length 'input_ids'
            and 'attention_mask' sequences.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels' tensors, each of
        shape (len(features), sequence_length).
    """
    input_ids = torch.stack([torch.tensor(f['input_ids']) for f in features])
    attention_mask = torch.stack([torch.tensor(f['attention_mask']) for f in features])

    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # exclude padding from the loss

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [None]:
# Step 5: Setup Optimizer
# The optimizer is handed every model parameter; which of them actually get
# gradients is decided later through their requires_grad flags.
optimizer = AdamW(params=model.parameters(), lr=1e-6)

# Quick sanity report: number of decoder layers and the weight dtype.
num_decoder_layers = len(model.model.layers)
print(f"Total transformer layers: {num_decoder_layers}")
first_parameter = next(model.parameters())
print(first_parameter.dtype)
print(model.dtype)

Total transformer layers: 16
torch.float32
torch.float32


In [None]:
# Freeze every parameter that belongs to a decoder layer. Parameters that live
# outside model.model.layers are not touched by this cell.
for frozen_param in model.model.layers.parameters():
    frozen_param.requires_grad = False

In [None]:
layer_group = 4

# Each group covers four consecutive decoder layers:
# group 1 -> layers 0-3, group 2 -> 4-7, group 3 -> 8-11, group 4 -> 12-15.
# Any other value leaves every layer as it was. Layers outside the chosen
# group are not re-frozen here, only the chosen ones are switched on.
if layer_group in (1, 2, 3, 4):
    group_start = 4 * (layer_group - 1)
    for idx, block in enumerate(model.model.layers):
        if group_start <= idx < group_start + 4:
            for weight in block.parameters():
                weight.requires_grad = True

# Report, per layer, whether at least one of its parameters is trainable.
for idx, block in enumerate(model.model.layers):
    if any(weight.requires_grad for weight in block.parameters()):
        mark = "‚úÖ"
    else:
        mark = "‚ùå"
    print(f"Layer {idx}: {mark}")

Layer 0: ‚ùå
Layer 1: ‚ùå
Layer 2: ‚ùå
Layer 3: ‚ùå
Layer 4: ‚ùå
Layer 5: ‚ùå
Layer 6: ‚ùå
Layer 7: ‚ùå
Layer 8: ‚ùå
Layer 9: ‚ùå
Layer 10: ‚ùå
Layer 11: ‚ùå
Layer 12: ‚úÖ
Layer 13: ‚úÖ
Layer 14: ‚úÖ
Layer 15: ‚úÖ


In [None]:
# Additionally make decoder layer 5 trainable; all other layers keep their
# current requires_grad state.
for idx, block in enumerate(model.model.layers):
    if idx != 5:
        continue
    for weight in block.parameters():
        weight.requires_grad = True

# Report, per layer, whether at least one of its parameters is trainable.
for idx, block in enumerate(model.model.layers):
    if any(weight.requires_grad for weight in block.parameters()):
        mark = "‚úÖ"
    else:
        mark = "‚ùå"
    print(f"Layer {idx}: {mark}")

Layer 0: ‚úÖ
Layer 1: ‚úÖ
Layer 2: ‚úÖ
Layer 3: ‚úÖ
Layer 4: ‚úÖ
Layer 5: ‚úÖ
Layer 6: ‚ùå
Layer 7: ‚ùå
Layer 8: ‚ùå
Layer 9: ‚ùå
Layer 10: ‚ùå
Layer 11: ‚ùå
Layer 12: ‚ùå
Layer 13: ‚ùå
Layer 14: ‚ùå
Layer 15: ‚ùå


In [None]:
from torch.amp import GradScaler, autocast
from tqdm import tqdm

# Intermediate checkpoints are written only when this flag is 1.
step_flag = 0

# Mixed precision follows the device chosen when the model was loaded instead
# of a hard-coded "cuda": on a CUDA device autocast and the scaler are active,
# on any other device both are disabled and act as pass-throughs.
use_amp = device.type == "cuda"

# Create gradient scaler
scaler = GradScaler(enabled=use_amp)
num_epochs = 1
j = 0  # number of intermediate checkpoints written so far

total_steps = len(train_loader) * num_epochs
# Candidate checkpoint steps at 1/8, 2/8, ..., 7/8 of the whole run.
save_steps = [int(total_steps * i / 8) for i in range(1, 8)]
current_step = 0  # Global step counter

# Training loop with mixed precision
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}

        # Use autocast for mixed precision
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss

        # Scale loss and call backward
        scaler.scale(loss).backward()

        # Unscale gradients before clipping so max_norm applies to the true
        # gradient magnitudes rather than the scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Step optimizer and update scaler
        scaler.step(optimizer)
        scaler.update()

        # Read the loss back from the device once and reuse the Python float.
        loss_value = loss.item()
        total_loss += loss_value
        progress_bar.set_postfix({"Loss": loss_value})

        current_step += 1

        # Intermediate checkpoint
        # NOTE(review): the "4.1" tag in this path differs from the "9.2" tag
        # of the final save below - confirm which run name is intended.
        if current_step in save_steps and step_flag == 1:
            j += 1
            checkpoint_dir = f"/content/drive/MyDrive/Bitirme/Models/MetaMath/Checkpoints/llama-1b-base-metamath-4.1.step_{j}"
            model.save_pretrained(checkpoint_dir)
            tokenizer.save_pretrained(checkpoint_dir)
            print(f"üîí Checkpoint saved at step {current_step}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Train Loss: {avg_loss:.4f}")

# Save the final model and tokenizer side by side in one directory.
final_path = "/content/drive/MyDrive/Bitirme/Models/MetaMath/llama-1b-base-metamath-9.2"
model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)
#torch.save(optimizer.state_dict(), final_path)
#torch.save(model.state_dict(), final_path)

Epoch 1/1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12344/12344 [56:03<00:00,  3.67it/s, Loss=0.289]


Epoch 1 Train Loss: 0.4177


('/content/drive/MyDrive/Bitirme/Models/MetaMath/llama-1b-base-metamath-9.2/tokenizer_config.json',
 '/content/drive/MyDrive/Bitirme/Models/MetaMath/llama-1b-base-metamath-9.2/special_tokens_map.json',
 '/content/drive/MyDrive/Bitirme/Models/MetaMath/llama-1b-base-metamath-9.2/tokenizer.json')