# Converting dataset to ShareGPT

In [None]:
%%capture
# Load the "train" split of the Java documentation-generation dataset.
from datasets import load_dataset

dataset = load_dataset("CarterPiepenburg/code-search-net-java-docgen", split="train")

In [None]:
def to_sharegpt(dataset, merged_prompt, output_column_name, conversation_extension=1):
    """
    Convert a dataset to ShareGPT-style conversations.

    Every row becomes one human/assistant turn pair. The human turn is
    ``merged_prompt`` with each ``{column_name}`` placeholder replaced by that
    row's value for the column; the assistant turn is the row's
    ``output_column_name`` value. Placeholders that do not name a dataset
    column are left untouched.

    Args:
        dataset: Source dataset. Must expose ``column_names``, ``len()`` and
            integer indexing that returns a mapping of column name to value.
        merged_prompt: Template string with {column_name} placeholders.
        output_column_name: Column to use as the output/completion.
        conversation_extension: Number of consecutive rows to combine into a
            single multi-turn conversation (the last one may be shorter).

    Returns:
        A list of ``{"conversations": [...]}`` dicts, where each conversation
        is a list of ``{"from": "human" | "assistant", "value": ...}`` turns.

    Raises:
        ValueError: If ``conversation_extension`` is smaller than 1.
        KeyError: If a row has no ``output_column_name`` entry.
    """
    import re  # local import: keeps this notebook cell self-contained

    if conversation_extension < 1:
        raise ValueError("conversation_extension must be a positive integer")

    # Build one pattern matching any "{column}" token so that all placeholders
    # are substituted in a single pass over the *template*. Chained
    # str.replace calls would also rewrite placeholder-looking text that came
    # from an earlier substituted value (source code is full of braces).
    column_names = sorted(dataset.column_names, key=len, reverse=True)
    placeholder_re = None
    if column_names:
        placeholder_re = re.compile(
            "|".join(re.escape("{" + name + "}") for name in column_names)
        )

    def render_prompt(example):
        """Fill the template with the values of a single row."""
        if placeholder_re is None:
            return merged_prompt

        def substitute(match):
            column = match.group(0)[1:-1]
            if column in example:
                return str(example[column])
            # Column missing from this row: keep the placeholder verbatim.
            return match.group(0)

        return placeholder_re.sub(substitute, merged_prompt)

    total_rows = len(dataset)
    formatted_data = []

    for window_start in range(0, total_rows, conversation_extension):
        window_end = min(window_start + conversation_extension, total_rows)
        conversation = []

        for row_index in range(window_start, window_end):
            example = dataset[row_index]
            conversation.append({"from": "human", "value": render_prompt(example)})
            conversation.append({"from": "assistant", "value": example[output_column_name]})

        formatted_data.append({"conversations": conversation})

    return formatted_data

In [None]:
# For code explanation
# Each row's `code` column is substituted into the prompt and its `response`
# column becomes the assistant reply.
code_explain_dataset = to_sharegpt(
    dataset,
    merged_prompt = "Explain what this Java code does: {code}",
    output_column_name = "response"
)

In [None]:
# Display the first converted conversation as a sanity check.
code_explain_dataset[0]

# Initialize Model and Tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch

# Shared loading settings; later cells read these names again.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
from datasets import Dataset

# Wrap the plain list of conversation dicts in a Hugging Face Dataset so the
# dataset helpers used in the following cells can process it.
code_explain_dataset_hf = Dataset.from_list(code_explain_dataset)

In [None]:
from unsloth import standardize_sharegpt
# NOTE: this rebinds `dataset`, so the raw dataset loaded at the top of the
# notebook is no longer reachable under that name after this cell runs.
# Presumably this converts the "from"/"value" turns to "role"/"content"
# (the evaluation loop reads "content") — TODO confirm.
dataset = standardize_sharegpt(code_explain_dataset_hf)

In [None]:
from unsloth import apply_chat_template
# Prompt layout handed to apply_chat_template. {SYSTEM}, {INPUT} and {OUTPUT}
# are placeholders — presumably filled with the system message, the user turn
# and the assistant turn respectively; verify against the Unsloth docs.
chat_template = """
{SYSTEM}
USER: {INPUT}
ASSISTANT: {OUTPUT}"""

# System message supplied as `default_system_message` when the template is applied.
default_system_message = """
You are generating brief documentation for a Java code snippet.
Your response MUST be a single paragraph with NO bulletpoints, NO line breaks, and NO section headers.
Do NOT explain the prompt. Just output the summary.
Keep your explanation short and focused. Avoid repetition.
Start your response with (This function)
Summarize ONLY the core logic and purpose of the code.
Here is the Java code:
Summary (one paragraph only):"""


In [None]:
# Use this system message with the apply_chat_template function
# Presumably this adds the rendered "text" column that the trainer later
# reads via dataset_text_field — TODO confirm.
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    default_system_message = default_system_message
)

In [None]:
# Display one formatted row to check the result of the templating step.
dataset[2]

# Training Model

In [None]:
# Hold out 30% of the examples for evaluation. The split is seeded with the
# same value used for LoRA initialisation and the trainer, so re-running the
# notebook yields the same train/test partition and evaluation stays comparable.
split_dataset = dataset.train_test_split(test_size=0.3, seed=3407)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [None]:
# NOTE(review): this loads the same checkpoint with the same settings as the
# earlier loading cell and rebinds `model` / `tokenizer`. Presumably intended
# to start training from a fresh base model — confirm it is needed, since the
# weights are loaded a second time.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter shape: rank may be any number > 0 (suggested 8, 16, 32, 64, 128).
    r = 16,
    lora_alpha = 16,
    # Projection layers that receive adapters.
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Any dropout / bias setting is supported, but 0 and "none" are the optimized ones.
    lora_dropout = 0,
    bias = "none",
    # True or "unsloth" for very long context; "unsloth" uses 30% less VRAM
    # and fits 2x larger batch sizes.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    # Rank-stabilized LoRA and LoftQ are supported but not used here.
    use_rslora = False,
    loftq_config = None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimisation settings for a single-epoch run: batches of 2 with 4
# accumulation steps, linear schedule with 10 warmup steps.
training_args = TrainingArguments(
    output_dir = "ft-outputs",
    report_to = [],
    seed = 3407,
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 10,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    # Exactly one of the two mixed-precision modes is enabled: bf16 where
    # supported, fp16 otherwise.
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    logging_steps = 1,
)

# Supervised fine-tuning on the pre-rendered "text" column of the train split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    # Packing can make training 5x faster for short sequences; left off here.
    packing = False,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Save the trained model through the trainer into "ft-outputs".
trainer.save_model("ft-outputs")

In [None]:
# Save the model and the tokenizer side by side in "ft-outputs", the
# directory the evaluation section loads from.
model.save_pretrained("ft-outputs")
tokenizer.save_pretrained("ft-outputs")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# After training is complete
training_logs = trainer.state.log_history
# Keep only the entries that carry a training loss; other entries in the
# history (for example the final summary) have no 'loss' key.
loss_logs = [log for log in training_logs if 'loss' in log]
train_losses = [log['loss'] for log in loss_logs]
# Plot against the global step recorded with each entry, so the x-axis stays
# correct when logging_steps is not 1. Fall back to the list position if an
# entry has no 'step' field.
steps = [log.get('step', position) for position, log in enumerate(loss_logs)]
# Plot the training loss
plt.figure(figsize=(10, 6))
plt.plot(steps, train_losses)
plt.xlabel('Training Steps')
plt.ylabel('Training Loss')
plt.title('Training Loss over Time')
plt.grid(True)
plt.show()

# Evaluation

In [None]:
import numpy as np
import json
import torch
from transformers import TextStreamer
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from safetensors import safe_open
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
# Load the fine-tuned model and tokenizer from the saved directory.
# The notebook-wide `dtype` and `load_in_4bit` settings are reused so the
# model is evaluated with the same precision and quantization it was loaded
# with for training, instead of forcing float16 on hardware that trained in
# bfloat16.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="ft-outputs",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

In [None]:
# Same prompt layout and system message text as the ones defined for
# training; keep the two copies in sync so inference prompts match the
# training data.
chat_template = """
{SYSTEM}
USER: {INPUT}
ASSISTANT: {OUTPUT}"""

system_message = """
You are generating brief documentation for a Java code snippet.
Your response MUST be a single paragraph with NO bulletpoints, NO line breaks, and NO section headers.
Do NOT explain the prompt. Just output the summary.
Keep your explanation short and focused. Avoid repetition.
Start your response with (This function)
Summarize ONLY the core logic and purpose of the code.
Here is the Java code:
Summary (one paragraph only):"""

In [None]:
# Function to run inference on a single example
def generate_explanation(code_to_explain):
    """
    Generate a natural-language explanation for one Java snippet.

    The prompt is built by filling the ``{SYSTEM}`` and ``{INPUT}``
    placeholders of the module-level ``chat_template`` directly. That string
    is not a Jinja template, so passing it as ``chat_template=`` to
    ``tokenizer.apply_chat_template`` would render it literally and the code
    would never reach the model.

    Args:
        code_to_explain: Java source text to summarize.

    Returns:
        The decoded completion only; prompt tokens and special tokens are
        stripped.
    """
    # Same user-turn wording as the training prompts.
    user_message = f"Explain what this Java code does: {code_to_explain}"

    # Everything before {OUTPUT} is the prompt; the model writes what follows.
    prompt_prefix = chat_template.split("{OUTPUT}", 1)[0]
    # Fill {SYSTEM} before {INPUT} so placeholder-looking text inside the
    # user's code is never substituted a second time.
    prompt = prompt_prefix.replace("{SYSTEM}", system_message).replace("{INPUT}", user_message)
    # NOTE(review): compare this prompt with a rendered training row
    # (the "text" column) to confirm whitespace matches exactly.

    # Tokenize on the model's own device rather than assuming "cuda".
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            # Explicit mask: pad and EOS share an id, so it cannot be inferred.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            # temperature / top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # Decode the output, skipping the prompt
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text

In [None]:
# Run inference on test examples start_idx..end_idx (inclusive) and save the
# generated explanations next to their references.
results = []
start_idx = 0
end_idx = 50


# Convert to list if it's a Dataset object to ensure proper indexing
if hasattr(test_dataset, 'to_list'):
    dataset_list = test_dataset.to_list()
else:
    dataset_list = test_dataset

# Make sure we don't exceed the dataset length
end_idx = min(end_idx, len(dataset_list) - 1)
subset = dataset_list[start_idx:end_idx + 1]

print(f"Processing examples from index {start_idx} to {end_idx}")

# Create results directory if it doesn't exist
import os
os.makedirs("evaluation_results", exist_ok=True)

# Instruction text that precedes the code in every user message.
prompt_prefix = "Explain what this Java code does: "

for i, example in enumerate(subset):
    # Rows without a conversation carry nothing to evaluate.
    if "conversations" not in example:
        continue

    turns = example["conversations"]
    user_message = turns[0]["content"]

    # Extract just the code part. Split once only, so code that happens to
    # contain the prefix text itself is not truncated.
    before, separator, after = user_message.partition(prompt_prefix)
    code_part = after if separator else user_message  # If no prefix, use the whole message

    # Generate explanation for this code
    explanation = generate_explanation(code_part)

    # Get the reference explanation
    reference_explanation = turns[1]["content"] if len(turns) > 1 else ""

    # Store results
    results.append({
        "index": start_idx + i,
        "code": code_part,
        "reference_explanation": reference_explanation,
        "generated_explanation": explanation
    })

    # Print progress
    print(f"Processed example {start_idx + i} ({i + 1}/{len(subset)})")

    # Print the explanation
    print(f"Original code: {code_part[:100]}...")
    print(f"Generated explanation: {explanation}")
    print("-" * 50)

# Save raw results to a file named after the index range actually processed,
# so runs over different ranges do not overwrite or mislabel each other.
results_path = os.path.join("evaluation_results", f"inference_results_{start_idx}_{end_idx}.json")
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Raw results saved to {results_path}")