### Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
from unsloth import FastLanguageModel  # FastVisionModel for LLMs
import torch
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Phi-4",  # Phi-4 2x faster!
    "unsloth/Phi-4-unsloth-bnb-4bit",  # Phi-4 Unsloth Dynamic 4-bit Quant
]  # More models at https://docs.unsloth.ai/get-started/all-our-models

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-4",
    max_seq_length = max_seq_length,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel  # FastVisionModel for LLMs


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.2: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

We now add LoRA adapters for parameter efficient finetuning - this allows us to only efficiently train 1% of all parameters.

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

<a name="Data"></a>
### Data Prep
We now use the `Phi-4` format for conversation style finetunes. But we convert it to HuggingFace's normal multiturn format `("role", "content")` instead of `("from", "value")`/ Phi-4 renders multi turn conversations like below:

```
<|im_start|>user<|im_sep|>Hello!<|im_end|>
<|im_start|>assistant<|im_sep|>Hi! How can I help?<|im_end|>
<|im_start|>user<|im_sep|>What is 2+2?<|im_end|>
```


In [None]:
import pandas as pd


df = pd.read_excel("/content/dataset_power.xlsx")


In [None]:
import pandas as pd
import json


# Her soru-cevap çiftini formatla ve JSONL satırı olarak hazırla
lines = []
for _, row in df.iterrows():
    question = str(row['Question']).strip()
    answer = str(row['Answer']).strip()
    chat_format = f"<|im_start|>user<|im_sep|>{question}<|im_end|><|im_start|>assistant<|im_sep|>{answer}<|im_end|>"
    lines.append({"text": chat_format})

# JSONL formatında kaydetmek istersen:
with open("phi4_dataset.jsonl", "w", encoding="utf-8") as f:
    for line in lines:
        f.write(json.dumps(line, ensure_ascii=False) + "\n")

# Değişken olarak da kullanabilirsin
dataset = lines


In [None]:
dataset

[{'text': '<|im_start|>user<|im_sep|>What is the monthly cost of Power Apps Premium?<|im_end|><|im_start|>assistant<|im_sep|>$20 per user/month (or $12 with 2,000+ new licenses)<|im_end|>'},
 {'text': '<|im_start|>user<|im_sep|>What license types are available for Power Apps?<|im_end|><|im_start|>assistant<|im_sep|>Power Apps Premium, Power Apps per app, and Power Apps per app pay-as-you-go<|im_end|>'},
 {'text': '<|im_start|>user<|im_sep|>What is included with Power Apps Premium?<|im_end|><|im_start|>assistant<|im_sep|>Unlimited custom apps, Power Pages websites, 500 AI Builder credits<|im_end|>'},
 {'text': '<|im_start|>user<|im_sep|>What are the Power Platform components covered in the licensing guide?<|im_end|><|im_start|>assistant<|im_sep|>Power Apps, Power Automate, Copilot Studio, Power Pages, AI Builder, Dataverse<|im_end|>'},
 {'text': '<|im_start|>user<|im_sep|>What does the Power Automate Premium license include?<|im_end|><|im_start|>assistant<|im_sep|>Unlimited cloud flows,

In [None]:
dataset[0]["text"]

'<|im_start|>user<|im_sep|>What is the monthly cost of Power Apps Premium?<|im_end|><|im_start|>assistant<|im_sep|>$20 per user/month (or $12 with 2,000+ new licenses)<|im_end|>'

In [None]:
from datasets import Dataset

# JSONL olarak oluşturduğun listeyi HuggingFace Dataset'e dönüştür
hf_dataset = Dataset.from_list(dataset)


<a name="Train"></a>
### Train the model


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = hf_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [None]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user<|im_sep|>",
    response_part="<|im_start|>assistant<|im_sep|>",
)

Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

We verify masking is actually done:

In [None]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|im_start|>user<|im_sep|>What is a Power Automate Process license used for?<|im_end|><|im_start|>assistant<|im_sep|>Licenses a single unattended bot or business process<|im_end|>'

In [None]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                  Licenses a single unattended bot or business process<|im_end|>'

We can see the System and Instruction prompts are successfully masked!

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50 | Num Epochs = 5 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 65,536,000/4,000,000,000 (1.64% trained)


Step,Training Loss
1,7.0092
2,6.484
3,6.2798
4,4.8863
5,5.5732
6,6.4155
7,3.6104
8,3.1217
9,3.4716
10,2.5872


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)**

We use `min_p = 0.1` and `temperature = 1.5`.

In [None]:
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
import torch
import re

tokenizer = get_chat_template(tokenizer, chat_template = "phi-4")
FastLanguageModel.for_inference(model)

def parse_phi_chat(text):
    pattern = r"<\|im_start\|>(user|assistant)<\|im_sep\|>(.*?)<\|im_end\|>"
    matches = re.findall(pattern, text, re.DOTALL)
    return [{"role": role.strip(), "content": content.strip()} for role, content in matches]

text = dataset[0]["text"]
messages = parse_phi_chat(text)

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda" if torch.cuda.is_available() else "cpu")

outputs = model.generate(
    input_ids = inputs,
    max_new_tokens = 100,
    use_cache = True,
    temperature = 1.0,
    top_p = 0.95,
)

#Sadece üretilen kısmı decode et
generated = outputs[0][inputs.shape[-1]:]
print(tokenizer.decode(generated, skip_special_tokens=True))


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


$5 per user/month for Power Pages


 You can also use a `TextStreamer` for continuous inference - so you can see The generation token by token, instead of waiting the whole time!

## 🧪 Inference on Custom Dataset and Exporting Model Outputs to CSV

This section performs **inference using a fine-tuned language model** (e.g., Phi-4 with LoRA adapters) on a custom dataset containing chat-style prompts and expected responses. The model's outputs are collected and saved into a CSV file for further analysis.

### 📝 Dataset Format
Each entry in the dataset is a single string with special formatting using tokens:

<|im_start|>user<|im_sep|>QUESTION<|im_end|>

<|im_start|>assistant<|im_sep|>EXPECTED_ANSWER<|im_end|>



### 🎯 Goal
- Extract the `QUESTION` part from each dataset item.
- Generate a response using the model.
- Compare it with the `EXPECTED_ANSWER` (optional).
- Save all results into a `model_outputs.csv` file.

### 📤 Output CSV Columns
- **question**: User input extracted from the dataset.
- **expected_answer**: The reference assistant answer.
- **generated_answer**: The model’s generated response.

This step is essential for validating how well the model performs on custom examples and helps in error analysis or evaluation.


In [None]:
import torch
import openpyxl
from transformers import TextStreamer
from openpyxl import Workbook

results = []

for i, row in enumerate(dataset):
    text = row["text"]

    # Soruyu çıkar
    try:
        question = text.split("<|im_sep|>")[1].split("<|im_end|>")[0].strip()
    except:
        print(f"Parse hatası satır {i}: {text}")
        continue

    messages = [{"role": "user", "content": question}]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    # Modelden yanıt al
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            use_cache=True,
        )

    # Tokenleri çöz
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Sadece assistant cevabını al
    if "<|im_start|>assistant<|im_sep|>" in decoded_output:
        generated_answer = decoded_output.split("<|im_start|>assistant<|im_sep|>")[-1].split("<|im_end|>")[0].strip()
    else:
        generated_answer = decoded_output.strip()

    # Referans cevabı al
    try:
        expected_answer = text.split("<|im_start|>assistant<|im_sep|>")[1].split("<|im_end|>")[0].strip()
    except:
        expected_answer = ""

    results.append((question, expected_answer, generated_answer))


wb = Workbook()
ws = wb.active
ws.title = "Model Cevapları"

ws.append(["question", "expected_answer", "generated_answer"])

# Datas
for row in results:
    ws.append(row)

wb.save("model_outputs.xlsx")
print("✅ Excel dosyası 'model_outputs.xlsx' olarak kaydedildi.")


✅ Excel dosyası 'model_outputs.xlsx' olarak kaydedildi.


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "How can users customize Copilot behavior?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
    use_cache = True, temperature = 1.5, min_p = 0.1
)

By using specific prompts and commands<|im_end|>


You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [None]:
from google.colab import drive
import shutil
import os

# 1. Google Drive'ı mount et
drive.mount('/content/drive')

# 2. Kaynak klasör (modelin bulunduğu yer)
source_folder = "/content/lora_model"  # lora_model klasörün buradaysa
drive_folder = "/content/drive/MyDrive/fine_tuned_phi4"
os.makedirs(drive_folder, exist_ok=True)

# 3. Kopyalanacak dosyaların listesi
files_to_copy = [
    "adapter_model.safetensors",
    "adapter_config.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
    "merges.txt",
    "special_tokens_map.json"
]

# 4. Dosyaları kopyala
for file_name in files_to_copy:
    source_path = os.path.join(source_folder, file_name)
    if os.path.exists(source_path):
        shutil.copy(source_path, drive_folder)
        print(f"✅ {file_name} Drive'a kopyalandı.")
    else:
        print(f"⚠️ {file_name} bulunamadı, atlandı.")

print("📁 Tüm işlemler tamamlandı.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ adapter_model.safetensors Drive'a kopyalandı.
✅ adapter_config.json Drive'a kopyalandı.
✅ tokenizer.json Drive'a kopyalandı.
✅ tokenizer_config.json Drive'a kopyalandı.
✅ vocab.json Drive'a kopyalandı.
✅ merges.txt Drive'a kopyalandı.
✅ special_tokens_map.json Drive'a kopyalandı.
📁 Tüm işlemler tamamlandı.


# RAG

In [1]:
pip install PyMuPDF faiss-cpu sentence-transformers pandas openpyxl transformers unsloth

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting unsloth
  Downloading unsloth-2025.5.2-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2025.5.2 (from unsloth)
  Downloading unsloth_zoo-2025.5.4-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.20-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadat

In [2]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

# 1. PDF'ten metin çıkar
doc = fitz.open("dokuman.pdf")
text = "\n".join([page.get_text() for page in doc])

# 2. Metni parçalara ayır
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " "]
)
chunks = splitter.split_text(text)

# 3. Embed oluştur
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(chunks, convert_to_numpy=True)

# 4. FAISS index oluştur
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings)

# 5. RAG fonksiyonu
def retrieve_context(question, k=3):
    q_emb = embedder.encode([question])
    distances, indices = faiss_index.search(q_emb, k)
    return [chunks[i] for i in indices[0]]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Prompt+Model

In [6]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/phi-4",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

model.eval()


==((====))==  Unsloth 2025.5.2: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(100352, 5120, padding_idx=100351)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1280, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1280, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=17920, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=17920, bias=False)
          (down_proj): Linear4bit(in_features=17920, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((512

In [16]:
df = pd.read_excel("dataset.xlsx")  # 99 soruluk veri

rag_results = []

for i, row in df.iterrows():
    question = row["Question"]
    context = "\n".join(retrieve_context(question))

    # Phi-4 mesaj formatı
    messages = [{"role": "user", "content": f"{context}\nSoru: {question}"}]

    # Prompt'u modelin formatına göre hazırla
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Yanıtı ayıkla
    try:
        answer = decoded_output.split("<|im_start|>assistant<|im_sep|>")[-1].split("<|im_end|>")[0].strip()
    except IndexError:
        answer = decoded_output.strip()

    rag_results.append((question, context, answer))
    print(f"✅ Soru {i+1} işlendi.")


✅ Soru 1 işlendi.
✅ Soru 2 işlendi.
✅ Soru 3 işlendi.
✅ Soru 4 işlendi.
✅ Soru 5 işlendi.
✅ Soru 6 işlendi.
✅ Soru 7 işlendi.
✅ Soru 8 işlendi.
✅ Soru 9 işlendi.
✅ Soru 10 işlendi.
✅ Soru 11 işlendi.
✅ Soru 12 işlendi.
✅ Soru 13 işlendi.
✅ Soru 14 işlendi.
✅ Soru 15 işlendi.
✅ Soru 16 işlendi.
✅ Soru 17 işlendi.
✅ Soru 18 işlendi.
✅ Soru 19 işlendi.
✅ Soru 20 işlendi.
✅ Soru 21 işlendi.
✅ Soru 22 işlendi.
✅ Soru 23 işlendi.
✅ Soru 24 işlendi.
✅ Soru 25 işlendi.
✅ Soru 26 işlendi.
✅ Soru 27 işlendi.
✅ Soru 28 işlendi.
✅ Soru 29 işlendi.
✅ Soru 30 işlendi.
✅ Soru 31 işlendi.
✅ Soru 32 işlendi.
✅ Soru 33 işlendi.
✅ Soru 34 işlendi.
✅ Soru 35 işlendi.
✅ Soru 36 işlendi.
✅ Soru 37 işlendi.
✅ Soru 38 işlendi.
✅ Soru 39 işlendi.
✅ Soru 40 işlendi.
✅ Soru 41 işlendi.
✅ Soru 42 işlendi.
✅ Soru 43 işlendi.
✅ Soru 44 işlendi.
✅ Soru 45 işlendi.
✅ Soru 46 işlendi.
✅ Soru 47 işlendi.
✅ Soru 48 işlendi.
✅ Soru 49 işlendi.
✅ Soru 50 işlendi.
✅ Soru 51 işlendi.
✅ Soru 52 işlendi.
✅ Soru 53 işlendi.
✅ 

#### Save the results

In [17]:
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = "RAG Cevapları"
ws.append(["Question", "Context", "Generated Answer (RAG)"])

for row in rag_results:
    ws.append(row)

wb.save("rag_outputs_phi4.xlsx")
print("✅ Excel dosyası kaydedildi.")

✅ Excel dosyası kaydedildi.


In [18]:
print(f"{len(rag_results)} cevap üretildi.")

98 cevap üretildi.
