In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq # Dinamik padding için en iyi araçlardan biri
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# ---------
# Settings
# ---------
# Hugging Face Hub id of the base checkpoint that is fine-tuned below.
MODEL_NAME = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# System message placed at the start of every training prompt built in tokenize().
SYSTEM_PROMPT = (
    "You are an expert Python programmer. "
    "Please read the problem carefully before writing any Python code."
)

# Upper bound on tokens per example; the tokenizer calls in tokenize() truncate to this length.
MAX_LENGTH = 1024

# -----------------
# Model & Tokenizer
# -----------------
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to be executed while loading — confirm it is really required here.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

# If the tokenizer comes without a pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with float16 weights; device placement is left to device_map="auto".
# NOTE(review): the captured run log reports that `torch_dtype` is deprecated in
# favour of `dtype` — switch once the minimum supported transformers version allows it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Enable gradient checkpointing and turn off the generation cache for training.
# NOTE(review): presumably the cache is disabled because it is not useful during
# training and does not combine with checkpointing — confirm against the library docs.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# -----------------
# Dataset preparation and split (Train / Val / Test)
# -----------------
data = load_dataset("Naholav/CodeGen-Diverse-5K")
full_dataset = data["train"]

# Step 1: hold out 20% of the rows (test_size=0.2); the remaining 80% is the training set.
# The fixed seed keeps the split reproducible.
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)

# Step 2: split the held-out 20% in half -> 10% validation, 10% test.
# NOTE(review): earlier comments described a 90/5/5 split, but the code produces
# 80/10/10 (4000 / 500 / 500 rows in the recorded run) — confirm which was intended.
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=42)

# Resulting datasets
train_data = train_testvalid["train"]
eval_data = test_valid["train"] # Validation set
test_data = test_valid["test"]  # Test set (kept aside for trying the model after training)

print(f"Train Size: {len(train_data)}, Eval Size: {len(eval_data)}, Test Size: {len(test_data)}")

# -----------------
# Tokenize function (fixed)
# -----------------
def tokenize(example):
    """Build one unpadded training example whose loss covers only the answer.

    The row's ``input`` is wrapped in the system / user / assistant turn markers
    written out below, and its ``solution`` followed by the tokenizer's EOS
    token is appended as the assistant's reply.

    Args:
        example: dataset row providing the ``"input"`` and ``"solution"`` fields.

    Returns:
        The tokenizer's encoding of the full text (truncated to ``MAX_LENGTH``,
        no padding, no extra special tokens) with an added ``"labels"`` entry:
        a copy of ``input_ids`` in which the leading prompt positions are -100.
    """
    # Manual Qwen-Instruct style prompt (a chat template would be the sturdier option).
    chat_prefix = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    # EOS is appended after the solution so the model learns where to stop.
    answer = example["solution"] + tokenizer.eos_token

    # Both tokenizer calls share these options. Padding is deliberately left
    # to the data collator, which pads per batch.
    encode_opts = dict(
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
        add_special_tokens=False,
    )

    encoded = tokenizer(chat_prefix + answer, **encode_opts)
    # The prompt is tokenized on its own only to learn how many leading tokens it occupies.
    prefix_ids = tokenizer(chat_prefix, **encode_opts)["input_ids"]

    token_ids = encoded["input_ids"]
    # Never mask past the end of the (possibly truncated) sequence.
    masked = min(len(prefix_ids), len(token_ids))
    # -100 on the prompt positions keeps them out of the loss.
    encoded["labels"] = [-100] * masked + token_ids[masked:]
    return encoded

# Run tokenize() over the training and validation splits, dropping the original
# columns so that only the fields tokenize() returns remain.
tokenized_train, tokenized_eval = (
    split.map(tokenize, remove_columns=split.column_names)
    for split in (train_data, eval_data)
)

# -----------------
# LoRA Config
# -----------------
# Adapter hyper-parameters: rank 16, alpha 32, dropout 0.05, bias terms untouched.
# Adapters are attached to the modules whose names are listed in target_modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters and print how many parameters are trainable.
# NOTE(review): `model` is rebound here, so re-running this statement without
# reloading the base model would wrap an already wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# -----------------
# Data Collator (dynamic padding)
# -----------------
# Pads each batch to the length of its longest sequence and fills the padded label positions with -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True, # dynamic padding enabled
    pad_to_multiple_of=8 # round the padded length up to a multiple of 8 for hardware efficiency
)

# -----------------
# Training
# -----------------
# Checkpoints are written on the same schedule as evaluation. With
# load_best_model_at_end=True the Trainer can only reload a checkpoint that was
# actually saved; when saving was half as frequent as evaluating, the best
# validation loss could land on a step with no checkpoint (step 500 in the
# recorded run). If disk space becomes a concern, consider `save_total_limit`.
training_args = TrainingArguments(
    output_dir="./diverse_checkpoint",
    num_train_epochs=2,
    per_device_train_batch_size=2,   # adjust to the available VRAM
    per_device_eval_batch_size=2,    # batch size used during evaluation
    gradient_accumulation_steps=4,   # 2 * 4 = 8 sequences per optimizer step per device
    learning_rate=2e-4,
    fp16=True,

    # Logging and evaluation settings
    logging_steps=10,
    eval_strategy="steps",           # 'steps' or 'epoch'
    eval_steps=100,                  # evaluate every 100 optimizer steps
    save_strategy="steps",
    save_steps=100,                  # keep in step with eval_steps (see note above)
    load_best_model_at_end=True,     # reload the checkpoint with the best eval loss when training ends
    report_to="none",
    eval_accumulation_steps=1
)

# NOTE(review): the run log shows a warning raised at the Trainer(...) call;
# newer transformers releases appear to prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,     # validation split used for the periodic evaluations
    tokenizer=tokenizer,
    data_collator=data_collator,     # pads each batch dynamically
)

print("Eğitim başlıyor...")
trainer.train()

# Persist the model held by the trainer once training has finished.
trainer.save_model("./diverse_qwen_final")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CodeGen-Diverse-5K.jsonl:   0%|          | 0.00/56.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Train Size: 4000, Eval Size: 500, Test Size: 500


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820
Eğitim başlıyor...


Step,Training Loss,Validation Loss
100,0.3952,0.371277
200,0.3769,0.356117
300,0.3144,0.349764
400,0.3722,0.342732
500,0.3604,0.338534
600,0.2797,0.344468
700,0.2589,0.344312
800,0.2605,0.34485
900,0.2478,0.340531
1000,0.3176,0.340693


In [None]:
import shutil

# Folder whose contents are archived (the Trainer's checkpoint directory).
folder_name = "/content/diverse_checkpoint"
# Name of the archive to create; a trailing ".zip" is optional.
zip_name = "diverse_checkpoint"

# make_archive appends the extension itself, so strip ".zip" only when it is a
# real suffix. A plain str.replace would also remove ".zip" from the middle of
# the name.
archive_base = zip_name[:-len(".zip")] if zip_name.endswith(".zip") else zip_name

# Zip the folder; the path of the created archive is the cell's displayed result.
shutil.make_archive(archive_base, 'zip', folder_name)

'/content/diverse_checkpoint.zip'