# Imports and Setup

In [1]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

# Loading Dataset

In [29]:
%%capture
dataset = load_dataset("Nan-Do/code-search-net-java")

In [30]:
print("Available splits:", list(dataset.keys()))

Available splits: ['train']


In [31]:
# Access the train split
train_dataset = dataset['train']

# Print column names
column_names = train_dataset.column_names
print("Column names:", column_names)

Column names: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary']


In [32]:
print("Columns in train dataset:", train_dataset.column_names)
print(f"Number of examples in train dataset: {len(train_dataset)}")

Columns in train dataset: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary']
Number of examples in train dataset: 495953


In [33]:
# Create a filtered dataset with only the first 1000 examples
subset_size = 1000
dataset = train_dataset.select(range(subset_size))

In [34]:
print("Columns in filtered dataset:", dataset.column_names)
print(f"Number of examples in filtered dataset: {len(dataset)}")

Columns in filtered dataset: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary']
Number of examples in filtered dataset: 1000


# Converting dataset to ShareGPT

In [35]:
def to_sharegpt(dataset, merged_prompt, output_column_name, conversation_extension=1):
    """
    Convert dataset to ShareGPT format with proper variable substitution

    Args:
        dataset: The source dataset
        merged_prompt: Template string with {column_name} placeholders
        output_column_name: Column to use as the output/completion
        conversation_extension: Number of examples to combine into a single conversation
    """
    formatted_data = []

    for i in range(0, len(dataset), conversation_extension):
        conversation = []

        # Process each example in the current conversation window
        for j in range(i, min(i + conversation_extension, len(dataset))):
            example = dataset[j]

            # Format the prompt by substituting variables
            prompt = merged_prompt
            for column in dataset.column_names:
                if column in merged_prompt and column in example:
                    placeholder = "{" + column + "}"
                    prompt = prompt.replace(placeholder, str(example[column]))

            # Add the human message
            conversation.append({
                "from": "human",
                "value": prompt
            })

            # Add the assistant message
            conversation.append({
                "from": "assistant",
                "value": example[output_column_name]
            })

        # Add the conversation to the formatted data
        formatted_data.append({"conversations": conversation})

    return formatted_data

In [36]:
# For code explanation
code_explain_dataset = to_sharegpt(
    dataset,
    merged_prompt = "Explain what this Java code does: {code}",
    output_column_name = "docstring"
)

In [37]:
code_explain_dataset[0]

{'conversations': [{'from': 'human',
   'value': 'Explain what this Java code does: protected final void bindIndexed(ConfigurationPropertyName name, Bindable<?> target,\n\t\t\tAggregateElementBinder elementBinder, ResolvableType aggregateType,\n\t\t\tResolvableType elementType, IndexedCollectionSupplier result) {\n\t\tfor (ConfigurationPropertySource source : getContext().getSources()) {\n\t\t\tbindIndexed(source, name, target, elementBinder, result, aggregateType,\n\t\t\t\t\telementType);\n\t\t\tif (result.wasSupplied() && result.get() != null) {\n\t\t\t\treturn;\n\t\t\t}\n\t\t}\n\t}'},
  {'from': 'assistant',
   'value': 'Bind indexed elements to the supplied collection.\n@param name the name of the property to bind\n@param target the target bindable\n@param elementBinder the binder to use for elements\n@param aggregateType the aggregate type, may be a collection or an array\n@param elementType the element type\n@param result the destination for results'}]}

# Initialize Model and Token Register

In [38]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3060 Laptop GPU. Num GPUs = 1. Max memory: 6.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [39]:
from datasets import Dataset

# First, convert your list to a Hugging Face Dataset
code_explain_dataset_hf = Dataset.from_list(code_explain_dataset)

In [47]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(code_explain_dataset_hf)

Unsloth: Standardizing formats (num_proc=16):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [54]:
from unsloth import apply_chat_template
chat_template = """
{SYSTEM}
USER: {INPUT}
ASSISTANT: {OUTPUT}"""

default_system_message = """You are generating brief documentation for a Java code snippet. "
    "Your response MUST be a single paragraph with NO bullet points, NO line breaks, and NO section headers. "
    "Do NOT explain the prompt. Just output the summary. "
    "Keep your explanation short and focused. Avoid repetition. "
    "Start your response with the words This function  "
    "Summarize ONLY the core logic and purpose of the code.\n\nSummary (one paragraph only):"""

# Use this system message with the apply_chat_template function
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    default_system_message = default_system_message
)

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [55]:
dataset[2]

{'conversations': [{'content': 'Explain what this Java code does: public void addServletRegistrationBeans(\n\t\t\tServletRegistrationBean<?>... servletRegistrationBeans) {\n\t\tAssert.notNull(servletRegistrationBeans,\n\t\t\t\t"ServletRegistrationBeans must not be null");\n\t\tCollections.addAll(this.servletRegistrationBeans, servletRegistrationBeans);\n\t}',
   'role': 'user'},
  {'content': 'Add {@link ServletRegistrationBean}s for the filter.\n@param servletRegistrationBeans the servlet registration beans to add\n@see #setServletRegistrationBeans',
   'role': 'assistant'}],
 'text': 'You are generating brief documentation for a Java code snippet. "\n    "Your response MUST be a single paragraph with NO bullet points, NO line breaks, and NO section headers. "\n    "Do NOT explain the prompt. Just output the summary. "\n    "Keep your explanation short and focused. Avoid repetition. "\n    "Start your response with the words This function  "\n    "Summarize ONLY the core logic and pur

In [56]:
dataset.push_to_hub("CarterPiepenburg/code-search-net-java-docgen")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/384 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CarterPiepenburg/code-search-net-java-docgen/commit/d25fde722a68ef66fe3330608985a8bf1815b511', commit_message='Upload dataset', commit_description='', oid='d25fde722a68ef66fe3330608985a8bf1815b511', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CarterPiepenburg/code-search-net-java-docgen', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CarterPiepenburg/code-search-net-java-docgen'), pr_revision=None, pr_num=None)