In [None]:

!pip install -q transformers datasets torch sentencepiece

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

"""
Model adƒ± ve tokenizer'ƒ± y√ºkle, cihazƒ± belirle
"""
MODEL_NAME = "ytu-ce-cosmos/turkish-gpt2-large-750m-instruct-v0.1"  # alternative: ytu-ce-cosmos/Turkish-Gemma-9b-v0.1
# These models were trained as teachers on the behavior data prepared by the author.

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer reuses its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in float32, then move the model onto the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
model = model.to(device)

print("Model & tokenizer OK")

# Load the dataset, then show its structure and the first training example.
dataset = load_dataset("bylang/behavior_data")

print(dataset)
print(dataset["train"][0])

"""
Tokenize fonksiyonu: Soru ve cevabƒ± birle≈ütirip tokenize et
"""
def tokenize_fn(example):
    """Tokenize one question/answer pair for causal-LM fine-tuning.

    The question and answer are joined with a newline, truncated/padded to a
    fixed length of 512 tokens, and the token ids are reused as labels.

    Args:
        example: A single (non-batched) dataset row whose "question" and
            "answer" fields are strings.

    Returns:
        The tokenizer output with an added "labels" list that mirrors
        "input_ids" but holds -100 on padding positions.
    """
    text = example["question"] + "\n" + example["answer"]
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # Padding is not a prediction target. -100 is the index ignored by the
    # cross-entropy loss, so padded positions no longer count towards training
    # (previously the labels were a verbatim copy that included the padding).
    # Assumes the tokenizer returns "attention_mask" (1 = real token, 0 = pad).
    # NOTE(review): a collator that rebuilds labels from input_ids (as
    # DataCollatorForLanguageModeling with mlm=False appears to) would override
    # these labels — confirm which labels actually reach the model.
    tokens["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

"""
Dataset'i tokenize et ve gereksiz s√ºtunlarƒ± kaldƒ±r
"""
# Tokenize every row and drop the original columns (taken from the train split)
# so that only the tokenizer's output fields remain.
tokenized_ds = dataset.map(function=tokenize_fn, remove_columns=dataset["train"].column_names)

"""
LoRA konfig√ºrasyonu ayarlama
"""
# LoRA settings: rank-8 adapters on the "c_attn" and "c_proj" modules.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapters and print the trainable-parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Collator for causal language modelling (masked-LM objective switched off).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

"""
√áƒ±ktƒ± dizini ve eƒüitim arg√ºmanlarƒ±nƒ± ayarla
"""
# Checkpoints and the final adapter are written below this directory.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive — no mount call is visible in this cell; confirm.
OUTPUT_DIR = "/content/drive/MyDrive/turkish_gpt2_peft"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step and device
    num_train_epochs=3,
    learning_rate=2e-4,
    # The model-loading code falls back to the CPU when no GPU is present, but
    # fp16 mixed precision is rejected there. Request it only when CUDA is
    # available so the CPU fallback keeps working.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    report_to="none",    # no external experiment tracker
    optim="adamw_torch"
)

"""
Trainer'ƒ± olu≈üturma
"""
# Assemble the Trainer from the PEFT model, the tokenized train split and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
)

# Run the fine-tuning loop.
trainer.train()

# Upload the trained model to the Hugging Face Hub.
# NOTE(review): both "--" values look like redacted placeholders (commit message
# and access token) — supply real values before running.
# Models pushed this way: bylang/teacher1_cosmos_gpt2, bylang/teacher2_cosmos_gemma
trainer.push_to_hub("--", token="--")

In [None]:

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer



"""
finetune ettiƒüim ve etmediƒüim teacher modellerine rastgele olu≈üturduƒüum promptlarƒ± sorup cevap aldƒ±m
"""
# Repo id of the teacher to query. Candidates used by the author:
#   bylang/teacher2_cosmos_gemma, bylang/teacher1_cosmos_gpt2,
#   ytu-ce-cosmos/turkish-gpt2-large-750m-instruct-v0.1, ytu-ce-cosmos/Turkish-Gemma-9b-v0.1
HF_REPO = "--"

teacher_model = AutoModelForCausalLM.from_pretrained(HF_REPO)
teacher_tokenizer = AutoTokenizer.from_pretrained(HF_REPO)
teacher_model.eval()  # switch the module to evaluation mode

# Pick the device: use the GPU if there is one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
teacher_model.to(device)

# Read the prompts from the CSV file and turn the "input" column into a list.
csv_path = "/kaggle/input/dataset/random_student_prompts.csv"
df_prompts = pd.read_csv(filepath_or_buffer=csv_path)
prompts = df_prompts["input"].tolist()



"""
Her prompt i√ßin teacher model ile √ßƒ±ktƒ± √ºretme
"""
# Produce one sampled completion per prompt with the teacher model.
teacher_outputs = []

for i, prompt in enumerate(prompts, 1):
    inputs = teacher_tokenizer(prompt, return_tensors="pt").to(device)
    # Number of prompt tokens, used below to cut the prompt off the output.
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output_ids = teacher_model.generate(
            **inputs,
            # max_new_tokens budgets only the generated part. The former
            # max_length=50 also counted the prompt tokens, so longer prompts
            # left little or no room for an answer.
            max_new_tokens=50,
            do_sample=True,
            top_p=0.9,
            temperature=0.8
        )
    # For a causal LM the returned sequence starts with the prompt itself; keep
    # only the continuation so that "target" does not repeat "input".
    completion_ids = output_ids[0][prompt_len:]
    output_text = teacher_tokenizer.decode(completion_ids, skip_special_tokens=True)
    teacher_outputs.append(output_text)

    # Progress report every 20 prompts (message encoding repaired: "işlendi").
    if i % 20 == 0:
        print(f"{i}/{len(prompts)} prompt işlendi...")



"""
Student dataset'ini olu≈üturma ve CSV olarak kaydetme i≈ülemi
"""
# Pair each prompt with its teacher output and write the table to CSV (no index column).
df_student = pd.DataFrame(data={"input": prompts, "target": teacher_outputs})
df_student.to_csv(path_or_buf="/kaggle/working/student_dataset.csv", index=False)

print("Student dataset olu≈üturuldu ve 'student_dataset.csv' olarak kaydedildi.")

In [None]:
!pip install -U bitsandbytes peft transformers datasets huggingface_hub

# =========================================================
# 0️⃣ REQUIRED: run this after restarting the kernel
# =========================================================
import os

os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        # Guards against CUDA memory fragmentation.
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
    }
)

# =========================================================
# 1️⃣ Imports
# =========================================================
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import login

# =========================================================
# 2️⃣ Hugging Face Login
# =========================================================
# SECURITY: an access token must never be stored in the notebook. The token that
# used to be hardcoded here should be treated as leaked and revoked on the Hub.
# The token is now read from the HF_TOKEN environment variable; when it is not
# set, login() receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# =========================================================
# 3️⃣ Model and Tokenizer
# =========================================================
MODEL_NAME = "ytu-ce-cosmos/Turkish-Gemma-9b-v0.1"  # YTU's model, or another base that suits the dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Borrow EOS for padding only when the tokenizer ships no pad token of its own.
# Overwriting an existing pad token with EOS makes the two indistinguishable, so
# anything that masks padding by token id would also mask genuine EOS tokens.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# =========================================================
# 4️⃣ BitsAndBytes Config (8-bit, T4 compatible)
# =========================================================
bnb_config = BitsAndBytesConfig(llm_int8_threshold=6.0, load_in_8bit=True)

# Load the base model with the quantization config; device placement is automatic.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# 🔥 Prepare the model for k-bit training (needed for LoRA)
model = prepare_model_for_kbit_training(model)

# =========================================================
# 5️⃣ LoRA Config
# =========================================================
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # T4 memory-safe
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Attach the adapters and print how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# =========================================================
# 6️⃣ Dataset Preparation
# =========================================================
# Small sample, T4 memory safe.
dataset = load_dataset(path="bylang/teacher-gemma-outputs", split="train")

def format_prompt(example):
    """Render one dataset row as a single instruction-style training string.

    Args:
        example: Mapping that provides "input" and "output" values.

    Returns:
        dict: {"text": "<s>[INST] <input> [/INST] <output> </s>"}.
    """
    # NOTE(review): "<s>" and "</s>" are written as literal text; whether this
    # tokenizer maps them to its BOS/EOS special tokens is not visible here — confirm.
    prompt, answer = example["input"], example["output"]
    return {"text": f"<s>[INST] {prompt} [/INST] {answer} </s>"}

# Add the formatted "text" column to every row.
dataset = dataset.map(format_prompt)

def tokenize(example):
    """Tokenize one formatted row for causal-LM fine-tuning.

    Args:
        example: A single (non-batched) dataset row with a "text" field.

    Returns:
        The tokenizer output, truncated/padded to 256 tokens, with an added
        "labels" list that mirrors "input_ids" but holds -100 on padding
        positions.
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # Padding is not a prediction target. -100 is the index ignored by the
    # cross-entropy loss, so padded positions no longer count towards training
    # (previously the labels were a verbatim copy that included the padding).
    # Assumes the tokenizer returns "attention_mask" (1 = real token, 0 = pad).
    # NOTE(review): a collator that rebuilds labels from input_ids (as
    # DataCollatorForLanguageModeling with mlm=False appears to) would override
    # these labels — confirm which labels actually reach the model.
    tokens["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Tokenize each row and drop all pre-existing columns so only model inputs remain.
dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# =========================================================
# 7️⃣ Data Collator
# =========================================================
# Causal language modelling: the masked-LM objective is switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# =========================================================
# 8️⃣ Training Arguments
# =========================================================
training_args = TrainingArguments(
    output_dir="./ytu_lora_finetune",
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    # Logging and checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
    report_to="none",
    # Hub upload
    hub_model_id="bylang/ytu_lora_finetuned",  # HF Hub repo
    push_to_hub=True,
)

# =========================================================
# 9️⃣ Trainer
# =========================================================
# Combine the LoRA model, the tokenized dataset and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

# =========================================================
# 🔟 Train + Hugging Face Push
# =========================================================
# Run the fine-tuning loop.
trainer.train()

# Publish the result through the Trainer; the tokenizer is uploaded separately
# to bylang/ytu_lora_finetuned.
trainer.push_to_hub()
tokenizer.push_to_hub(repo_id="bylang/ytu_lora_finetuned")

print(" LoRA fine-tune tamamlandƒ± ve HF Hub'a push edildi!")
