In [None]:
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# -----------------
# Settings (UPDATED)
# -----------------
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
MODEL_NAME = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# Token budget per example (prompt + solution) applied when tokenizing.
# Author's note: 1024 was too short for the Deep dataset on a T4 -- the solution
# part got truncated and training produced NaN. A T4 has 16GB of VRAM and
# Qwen 1.5B is small, so 2048 is expected to fit comfortably.
MAX_LENGTH = 2048

# System message inserted at the start of every training prompt.
SYSTEM_PROMPT = " ".join(
    (
        "You are an expert Python programmer.",
        "Please read the problem carefully before writing any Python code.",
    )
)

# -----------------
# Model & Tokenizer
# -----------------
# `trust_remote_code=True` is deliberately not passed: it allows arbitrary Python
# from the Hub repository to run locally. The model below is loaded from the same
# repository without that flag, so the checkpoint does not depend on custom Hub code.
# NOTE(review): re-add the flag only if MODEL_NAME is changed to a checkpoint that
# ships its own tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Fall back to EOS for padding when the tokenizer defines no pad token.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # author's note: float16 is required on a T4
    device_map="auto",
)

# Turn on gradient checkpointing and switch off the KV cache flag on the model
# config before training.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# -----------------
# Dataset (Deep-5K)
# -----------------
SPLIT_SEED = 42  # shared seed so both splits are reproducible

data = load_dataset("Naholav/CodeGen-Deep-5K")
full_dataset = data["train"]

# Hold out 20% of the rows (author's note: for stability), then cut that
# held-out part in half; the first half is used for evaluation in this notebook.
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_data, eval_data = train_testvalid["train"], test_valid["train"]

# -----------------
# Tokenize (same logic)
# -----------------
def tokenize(example):
    """Turn one dataset row into token ids plus prompt-masked labels.

    The row's ``input`` and ``solution`` fields are wrapped in a
    system/user/assistant chat layout, the tokenizer's EOS token is appended
    to the solution, and the resulting text is tokenized with truncation at
    ``MAX_LENGTH``.

    Args:
        example: Mapping providing ``"input"`` and a string ``"solution"``.

    Returns:
        The tokenizer output for the full text with an extra ``"labels"``
        entry: a copy of ``input_ids`` whose leading prompt positions are
        replaced by -100.
    """
    def encode(text):
        # Identical tokenizer settings for the full text and the bare prompt.
        return tokenizer(
            text,
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False,
            add_special_tokens=False,
        )

    prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    tokens = encode(prompt + example["solution"] + tokenizer.eos_token)
    input_ids = tokens["input_ids"]

    # Number of leading positions to mask: the prompt's token count, capped at
    # the length of the (possibly truncated) full sequence so it never overruns.
    masked = min(len(encode(prompt)["input_ids"]), len(input_ids))

    tokens["labels"] = [-100] * masked + input_ids[masked:]
    return tokens

def _has_supervised_tokens(example):
    """Return True if at least one label position is not the ignore index (-100)."""
    return any(label != -100 for label in example["labels"])


# Tokenize both splits, then drop rows whose labels are entirely -100. That happens
# when the prompt alone reaches MAX_LENGTH and truncation removes the whole solution.
# Such rows carry no target tokens, and at batch size 1 a mean-reduced cross-entropy
# over zero targets evaluates to NaN, which poisons the reported loss.
tokenized_train = train_data.map(
    tokenize, remove_columns=train_data.column_names
).filter(_has_supervised_tokens)
tokenized_eval = eval_data.map(
    tokenize, remove_columns=eval_data.column_names
).filter(_has_supervised_tokens)

# -----------------
# LoRA
# -----------------
# Names of the submodules that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Collator that pads each batch, rounding the padded length up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
)

# -----------------
# Training Arguments (ADJUSTED FOR T4)
# -----------------
training_args = TrainingArguments(
    output_dir="./deep_qwen_ckpt",
    report_to="none",

    # --- Schedule / optimisation ---
    num_train_epochs=2,
    learning_rate=2e-4,
    # NaN remedy (author's note): clip gradients to keep them from blowing up.
    max_grad_norm=0.3,

    # --- Batching ---
    # Batch size is 1 because the context is 2048 tokens (to fit T4 memory).
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # With batch size 1 this can be 8 or 16; the author considers 8 the safe choice.
    gradient_accumulation_steps=8,

    # --- Precision ---
    fp16=True,   # author's note: supported on the T4
    bf16=False,  # author's note: NOT supported on the T4, so disabled

    # --- Logging, evaluation and checkpointing ---
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,  # could be raised to 200
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,

    # Guard against out-of-memory during evaluation.
    eval_accumulation_steps=1,
)

# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated in the transformers release this notebook runs on (it
# emits a warning at this call) and is scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("Deep eğitimi T4 ayarlarıyla başlıyor...")
trainer.train()
# Save the trained model to its own directory, separate from the step
# checkpoints written under the training output directory.
trainer.save_model("./deep_qwen_final")

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820
Deep eğitimi T4 ayarlarıyla başlıyor...


Step,Training Loss,Validation Loss
100,0.3784,
200,0.352,
300,0.3355,
400,0.3289,
500,0.3145,
600,0.2877,
700,0.2513,
800,0.2354,
900,0.2285,
1000,0.2183,


In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import shutil

# Directory whose contents are archived.
# NOTE(review): the training cell writes to ./deep_qwen_ckpt and ./deep_qwen_final;
# confirm that this folder is the one intended for export.
folder_name = "/content/deep_checkpoint"
# Name of the zip file to create (the archive extension is appended automatically).
zip_name = "deep_checkpoint"

# Strip a trailing ".zip" from the name, if present, then build the archive.
archive_base = zip_name.replace('.zip', '')
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_name)

'/content/deep_checkpoint.zip'