In [None]:
# install
%%capture
!pip uninstall -y bitsandbytes triton
!pip install triton==2.1.0
!pip install bitsandbytes==0.43.2
!pip install --upgrade unsloth accelerate datasets peft trl sentencepiece protobuf huggingface_hub hf_transfer
!pip install unsloth_zoo

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options shared with later cells (max_seq_length is reused by the trainer).
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Base model: pre-quantized 4-bit Mistral-7B-Instruct checkpoint.
base_model_options = dict(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# LoRA adapter settings: rank 16, alpha 16, no dropout, no bias terms.
lora_options = dict(
    r=16,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

In [None]:
from datasets import Dataset

# Tiny demonstration corpus: each row is (instruction, input, output).
_columns = ("instruction", "input", "output")
_rows = [
    ("How to manage stress?", "Daily life pressures", "Try mindfulness and exercise daily."),
    ("Cure for stage fright?", "Fear of public speaking", "Practice deep breathing and rehearse a lot."),
]

# Expand the rows into one dict per example, keyed by column name.
data = [dict(zip(_columns, row)) for row in _rows]

dataset = Dataset.from_list(data)

In [None]:
from unsloth import to_sharegpt, standardize_sharegpt, apply_chat_template

# Step 1: merge the instruction/input columns into one prompt and convert the
# rows to ShareGPT-style conversations, then normalise them.
sharegpt_dataset = standardize_sharegpt(
    to_sharegpt(
        dataset,
        merged_prompt="{instruction}[[\nInput:\n{input}]]",
        output_column_name="output",
        conversation_extension=2,
    )
)

# Step 2: prompt layout applied to every turn; {INPUT} and {OUTPUT} are the
# placeholders filled in by apply_chat_template.
chat_template = """Below is a task. Write a suitable response.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

# Step 3: render the conversations with the template (the trainer reads the "text" field).
dataset = apply_chat_template(
    sharegpt_dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
)

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation schedule: 30 steps, effective batch of 2 x 4 accumulated
# micro-batches, 8-bit AdamW, no external experiment tracker.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=30,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    output_dir="./outputs",
    optim="adamw_8bit",
    report_to="none",
)

# Supervised fine-tuning on the rendered "text" column, without sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=training_args,
)

trainer.train()

In [None]:
# Export the fine-tuned model (with its tokenizer) in GGUF format.
# The Ollama "create" cell later reads ./mental_health_model/Modelfile.
gguf_output_dir = "mental_health_model"
model.save_pretrained_gguf(gguf_output_dir, tokenizer)

In [None]:
# install Ollama server
!curl -fsSL https://ollama.com/install.sh | sh

# start Ollama in background
import subprocess
import time

subprocess.Popen(["ollama", "serve"])
time.sleep(5)

In [None]:
# Create an Ollama model named "mental_health_bot" from the Modelfile located in
# ./mental_health_model (presumably written there by the GGUF export cell — verify
# the file exists if this step fails). The Ollama server must already be running.
!ollama create mental_health_bot -f ./mental_health_model/Modelfile

In [None]:
# inference through Ollama API
import json
import requests

# query the model
# /api/chat streams newline-delimited JSON chunks by default, which
# response.json() cannot parse; "stream": False asks for one JSON document.
chat_request = {
    "model": "mental_health_bot",
    "messages": [{"role": "user", "content": "How can I deal with exam anxiety?"}],
    "stream": False,
}

response = requests.post(
    "http://localhost:11434/api/chat",
    headers={"Content-Type": "application/json"},
    data=json.dumps(chat_request),
    timeout=300,  # generation can be slow, but never hang the notebook forever
)
# Surface HTTP errors (e.g. unknown model) instead of a confusing KeyError below.
response.raise_for_status()

print(response.json()['message']['content'])