In [None]:
!pip install -q -U accelerate datasets peft transformers trl wandb

In [None]:
from accelerate import PartialState
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Hugging Face Hub id of the SmolLM2 instruct checkpoint to fine-tune.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Tokenizer that ships with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# device_map="auto" delegates weight placement to the library, so no manual
# model.to(device) call is needed.
# Alternative left by the author for one-process-per-device setups:
#   device_map={"": PartialState().process_index}
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

In [None]:
# Show the loaded tokenizer as the cell output.
tokenizer

In [None]:
# Show the loaded model as the cell output.
model

# Dataset

JSON structured-output dataset: https://huggingface.co/datasets/ChristianAzinn/json-training

In [None]:
from datasets import load_dataset

# Load the dataset, then hold out 20% of its "train" split for evaluation.
ds = load_dataset("ChristianAzinn/json-training")
split_ds = ds["train"].train_test_split(test_size=0.2, seed=42)  # fixed seed -> same split on every run

# Expose the two halves under the names used by the trainer.
train_dataset, test_dataset = split_ds["train"], split_ds["test"]

In [None]:
# Show the training split as the cell output.
train_dataset

In [None]:
# Markers that open the user and assistant turns in the chat-formatted text.
# Each must match exactly what tokenizer.apply_chat_template emits at the start of
# the corresponding turn.
instruction_template = "<|im_start|>user\n"
response_template = "<|im_start|>assistant\n"

# Layout of the user message: the natural-language query followed by the JSON schema.
PROMPT_TEMPLATE = "Query: {query}\n\nschema:\n{schema}"


def formatting_prompts_func(example):
    """
    Render a batch of dataset rows as chat-formatted training strings.

    ``example`` is indexed by column name ("query", "schema", "response") and each
    column is indexed by row position. Returns a list with one string per row,
    produced by the tokenizer's chat template from a system / user / assistant
    conversation.
    """
    system_prompt = "You are are an expert in generate json structure based on user query and schema."

    rendered = []
    for idx, query in enumerate(example["query"]):
        user_content = PROMPT_TEMPLATE.format(query=query, schema=example["schema"][idx])
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
            # The assistant content holds only the answer: the chat template
            # inserts the assistant marker itself.
            {"role": "assistant", "content": example["response"][idx]},
            ]
        # tokenize=False -> the template returns text rather than token ids.
        rendered.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return rendered


# Completion-only collator: it looks for response_template (the assistant-turn marker)
# in each formatted example so that only the tokens after that marker contribute to
# the loss; instruction_template marks where each user turn starts.
collator = DataCollatorForCompletionOnlyLM(
        response_template=response_template,
        instruction_template=instruction_template,
        tokenizer=tokenizer,
        mlm=False,
        )

In [None]:
# Render a small sample conversation as text (tokenize=False) to inspect the chat
# format and compare it with response_template / instruction_template.
tokenizer.apply_chat_template([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I am good, thank you."}
    ], tokenize=False)

# LoRA Config

In [None]:
from peft import LoraConfig

# r is the rank of the low-rank matrices used for adaptation.
# A smaller r means fewer parameters to learn, hence faster and cheaper training,
# but also less capacity to capture task-specific information, so the adapted model
# may perform worse on the new task than with a larger r.
lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        # Attention projection layers that receive LoRA adapters.
        target_modules=["o_proj", "k_proj", "q_proj", "v_proj"],
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        )

# Wandb

Create an account and an API token: https://wandb.ai/home

In [None]:
import wandb

# Log in to Weights & Biases; the training run below is configured with report_to="wandb".
wandb.login()

# SFT Trainer config

In [None]:
# Name of the output folder / run, built from the last path component of the checkpoint id.
OUTPUT_DIR = checkpoint.split("/")[-1] + "-structure-output"

# Hyperparameters for supervised fine-tuning.
sft_config = SFTConfig(
        output_dir="./" + OUTPUT_DIR,
        run_name=f"train-{OUTPUT_DIR}",
        report_to="wandb",
        seed=42,
        # Batching: 2 samples per device, gradients accumulated over 4 steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimisation schedule.
        max_steps=1000,
        warmup_steps=100,
        learning_rate=0.0002,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        optim="paged_adamw_8bit",
        bf16=True,
        # Evaluation, logging and checkpointing cadence, all expressed in steps.
        eval_strategy="steps",
        eval_steps=150,
        logging_strategy="steps",
        logging_steps=10,
        save_steps=31,
        save_total_limit=4,
        )

# Build the trainer: LoRA adapters on top of the base model, chat-formatted text from
# formatting_prompts_func, and the completion-only collator.
trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=lora_config,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
        )

In [None]:
# Start fine-tuning.
train_result = trainer.train()

# Persist the final weights and the tokenizer at the top level of the output directory:
# the periodic saves only write `checkpoint-<step>` sub-folders, whereas the inference
# cells load from the output directory itself.
trainer.save_model()
tokenizer.save_pretrained(trainer.args.output_dir)

# Keep the training summary as the cell output.
train_result

# TODO

"""
    # Save fine tuned Lora Adaptor
    trainer.model.save_pretrained(os.path.join(OUTPUT_DIR, "final_checkpoint"))
    # Free memory for merging weights
    del model
    if is_torch_xpu_available():
        torch.xpu.empty_cache()
    elif is_torch_npu_available():
        torch.npu.empty_cache()
    else:
        torch.cuda.empty_cache()

    model = AutoPeftModelForCausalLM.from_pretrained(OUTPUT_DIR, device_map="auto", torch_dtype=torch.bfloat16)
    model = model.merge_and_unload()

    output_merged_dir = os.path.join(OUTPUT_DIR, "final_merged_checkpoint")
    model.save_pretrained(output_merged_dir, safe_serialization=True)
"""

# Inference

In [None]:
import torch
import gc


def clear_hardwares():
    """
    Release memory that is no longer referenced, on the host and on CUDA devices.

    Python garbage is collected first, so that memory held by unreachable objects
    is already free when the CUDA cache is emptied (one call is then enough).
    The CUDA-specific calls are skipped when no GPU is available, which keeps the
    function usable on CPU-only machines.

    Returns:
        None
    """
    gc.collect()
    torch.clear_autocast_cache()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


clear_hardwares()

## LoRA adapter

In [None]:
import os

from peft import PeftModel
from transformers.trainer_utils import get_last_checkpoint

# Folder holding the trained LoRA adapter: the output directory itself when a final
# adapter was saved there, otherwise the most recent `checkpoint-<step>` sub-folder.
adapter_dir = OUTPUT_DIR
if not os.path.isfile(os.path.join(adapter_dir, "adapter_config.json")):
    adapter_dir = get_last_checkpoint(OUTPUT_DIR)
if adapter_dir is None:
    raise FileNotFoundError(f"No LoRA adapter found under {OUTPUT_DIR!r}; run the training cell first.")

# The adapter is applied on top of the original base checkpoint (not the output folder),
# and the tokenizer is the one shipped with that checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = PeftModel.from_pretrained(base_model, adapter_dir)

# Device the inputs must be moved to in the generation cell.
device = base_model.device
del base_model

## Non-LoRA

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Target device for the model and its inputs: "cuda" for GPU, "cpu" for CPU.
device = "cuda"

# NOTE(review): both loads read from OUTPUT_DIR itself, so a complete model and
# tokenizer must have been saved there beforehand; confirm before running.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

# With several GPUs, install accelerate and load with device_map="auto" instead of
# moving the model manually.
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
model = model.to(device)

In [None]:
# JSON schema given to the model in the inference example. The root-level "required"
# list is a sibling of "properties" (inside "properties" it would declare a property
# literally named "required").
test_json_schema = """{
  "type": "object",
  "properties": {
    "weather_data": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "year": { "type": "integer" },
          "station": { "type": "string" },
          "temperature": {
            "type": "object",
            "properties": {
              "min": { "type": "number" },
              "max": { "type": "number" }
            },
            "required": ["min", "max"]
          },
          "events": {
            "type": "array",
            "items": { "type": "string" }
          }
        },
        "required": ["year", "station", "temperature", "events"]
      }
    }
  },
  "required": ["weather_data"]
}"""

# Natural-language request used as the inference example.
test_query = "Provide a detailed breakdown of meteorological data recorded in the city of Berlin from 2015 to 2020. The data should include the year, meteorological station, temperature ranges (minimum and maximum), and any significant events."

# Hand-written sample answer for the query above.
# NOTE(review): not referenced by the other cells shown here; presumably kept for
# manual comparison with the generated output.
test_response = """{
  "weather_data": [
    {
      "year": 2015,
      "station": "Berlin Central Station",
      "temperature": { "min": -5.2, "max": 35.1 },
      "events": ["Heavy snowfall in January", "Heatwave in July"]
    },
    {
      "year": 2017,
      "station": "Berlin East Station",
      "temperature": { "min": -4.0, "max": 32.8 },
      "events": ["Thunderstorms in April", "Flooding in June"]
    },
    {
      "year": 2020,
      "station": "Berlin West Station",
      "temperature": { "min": -3.9, "max": 36.5 },
      "events": ["Drought in September", "Blizzards in February"]
    }
  ]
}"""

# Inference conversation: the system prompt used for training, followed by the user
# turn. There is no assistant turn, since that is what the model has to generate.
messages = [
    {"role": "system", "content": "You are are an expert in generate json structure based on user query and schema."},
    {"role": "user", "content": PROMPT_TEMPLATE.format(query=test_query, schema=test_json_schema)},
    ]

In [None]:
# add_generation_prompt=True appends the assistant-turn marker, so the model starts
# answering directly instead of first having to open the assistant turn itself.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(input_text)
print("----------------- Generated text -----------------")
# Calling the tokenizer (rather than .encode) also yields the attention mask; the
# tensors are sent to the device the model actually lives on.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=1024, temperature=0.2, top_p=0.9, do_sample=True)
# Decode only the newly generated tokens: the prompt was already printed above.
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))