In [None]:
%%capture
# Environment setup for the notebook; %%capture hides the pip output of this cell.
# Step 1: install the released unsloth package from PyPI.
!pip install unsloth
# Step 2: remove that release and reinstall unsloth from the GitHub repository
# with the "colab-new" extra, bypassing pip's cache.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Step 3: libraries imported later in the notebook (datasets.load_dataset, trl.SFTTrainer).
# NOTE(review): none of these installs are version-pinned, so a re-run may resolve
# different versions than the ones shown in the saved outputs.
!pip install datasets
!pip install trl

In [None]:
!pip uninstall torch torchvision
!pip install torch torchvision

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Would remove:
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.11/dist-packages/functorch/*
    /usr/local/lib/python3.11/dist-packages/torch-2.6.0.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torch/*
    /usr/local/lib/python3.11/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/torchvision-0.21.0.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libcudart.41118559.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libjpeg.1c1c4b09.so.8
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libnvjpeg.02b6d700.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libpng16.0364a1db.so.16
    /usr/local/lib/python3.11/dist-packages/torchvision.li

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import joblib
from trl import SFTTrainer
from datasets import load_dataset, DatasetDict
from transformers import TrainingArguments, TextStreamer
from unsloth import FastLanguageModel, is_bfloat16_supported

# --- Base model -------------------------------------------------------------
# These three names stay module-level: max_seq_length is reused when the
# trainer is constructed further down.
max_seq_length = 1024
dtype = None
load_in_4bit = True

base_model_kwargs = dict(
    model_name="unsloth/Qwen2.5-0.5B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# --- LoRA adapters ----------------------------------------------------------
# Rank-8 adapters on the four attention projections only; the seed is fixed so
# adapter initialisation is repeatable.
lora_kwargs = dict(
    r=8,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    use_gradient_checkpointing=True,
    random_state=3407,
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)


# Prompt template used for both training (format_prompts) and inference
# (prescription). `{input}` is filled with the user's problem and `{output}`
# with the target response (an empty string at inference time).
# NOTE(review): the leading space, the stray double quotes and the indentation on
# the second line are all part of the literal and therefore part of every prompt.
# They look like leftovers from a concatenated string, but the template must stay
# byte-identical for any model already trained with it — confirm before cleaning up.
# NOTE(review): the name says "career" although the text is about prescriptions.
career_prompt = """ Given the problem faced by the user, write the prescriptions which should include the following things,if nan values are generated then remove them, do not add it in response: "
        "Medicine, Substitute, side effects and habit forming:
{input}
Response: {output}"""


def format_prompts(examples):
    """Render a batch of rows into training strings for the SFT trainer.

    Args:
        examples: A batched mapping (as passed by ``Dataset.map(batched=True)``)
            with parallel ``"input"`` and ``"output"`` columns.

    Returns:
        dict: ``{"text": [...]}`` with one formatted prompt per input row, each
        terminated by the tokenizer's end-of-sequence token.
    """
    # Terminate every example with EOS so the model learns where a response
    # ends; without it the fine-tuned model has no stop signal and keeps
    # generating until max_new_tokens. Falls back to "" if no EOS is defined.
    eos_token = tokenizer.eos_token or ""
    formatted = [
        career_prompt.format(input=input_data, output=output) + eos_token
        for input_data, output in zip(examples["input"], examples["output"])
    ]
    return {"text": formatted}


# Load the CSV as a single "train" split and add the "text" column produced by
# format_prompts in one chained expression.
train_dataset = load_dataset(
    "csv",
    data_files="/content/prescription.csv",
    split="train",
).map(format_prompts, batched=True)

# Probe bf16 support once: bf16 when available, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # bookkeeping
    output_dir="outputs",
    seed=3407,
    report_to=[],
    logging_steps=1,
    save_steps=10,
    save_total_limit=3,
    # batch / schedule: 1 sequence per step, accumulated over 8 steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    max_steps=100,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    # optimiser
    learning_rate=3e-4,
    optim="adamw_8bit",
    weight_decay=0.05,
    # mixed precision
    bf16=use_bf16,
    fp16=not use_bf16,
)

# Supervised fine-tuning on the pre-formatted "text" column, one example per
# sequence (no packing).
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,
)
trainer_stats = trainer.train()


==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.2.15 patched 24 layers with 24 QKV layers, 24 O layers and 0 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 1,081,344


Step,Training Loss
1,3.3974
2,3.3753
3,3.2266
4,3.2441
5,2.9167
6,2.8544
7,2.6973
8,2.5623
9,2.329
10,2.1693


In [None]:
# Prepare the fine-tuned model for generation via Unsloth's for_inference helper.
# The trailing semicolon suppresses the cell's return-value display, which would
# otherwise print the entire module tree of the model into the notebook.
FastLanguageModel.for_inference(model);

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896, padding_idx=151654)
        (layers): ModuleList(
          (0): Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
              

In [None]:
def prescription(profile, max_new_tokens=55, temperature=0.7, top_p=0.9,
                 include_prompt=True):
    """Generate a prescription-style response for a described problem.

    Args:
        profile: Text describing the user's problem; substituted into the
            ``{input}`` slot of ``career_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        include_prompt: When True (default, the original behaviour) the
            returned text contains the prompt followed by the generation;
            when False only the newly generated text is returned.

    Returns:
        str: The decoded text with special tokens removed.
    """
    prompt = career_prompt.format(input=profile, output="")
    # Place the inputs on whatever device the model lives on instead of
    # assuming a literal "cuda" device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # NOTE(review): temperature/top_p only matter when sampling is enabled;
    # confirm the model's generation config sets do_sample=True.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
    )
    generated = outputs[0]
    if not include_prompt:
        # generate() returns prompt tokens followed by new tokens; drop the prompt part.
        prompt_length = inputs["input_ids"].shape[1]
        generated = generated[prompt_length:]
    return tokenizer.decode(generated, skip_special_tokens=True)

In [None]:
# Quick manual check: run one sample problem through the fine-tuned model and
# print the decoded text.
test_profile = "Treatment of Bacterial infections"

generated_text = prescription(test_profile)
print(generated_text)

 Given the problem faced by the user, write the prescriptions which should include the following things,if nan values are generated then remove them, do not add it in response: "
        "Medicine, Substitute, side effects and habit forming:
Treatment of Bacterial infections
Response:  prescription
Medicine: amoxin 500mg tablet
Substitute: Amoxyt 500 Tablet
The following side effects can occur:
Vomiting,
Abdominal pain,
Nausea
Will the medicine form a habit? No


In [None]:
import os

from huggingface_hub import login

# SECURITY: never hard-code access tokens in a notebook. The token that used to
# be embedded in this cell has been exposed and should be revoked in the
# Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable; when it is unset,
# token=None makes login() ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Merge the LoRA adapters into the base weights and upload the 16-bit merged
# model together with the tokenizer to the Hugging Face Hub.
model.push_to_hub_merged(
    "kharshita590/prescriptionn",
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: You are pushing to hub, but you passed your HF username = kharshita590.
We shall truncate kharshita590/prescriptionn to prescriptionn
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 537.7M


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.54 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 24/24 [00:00<00:00, 110.55it/s]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

 Done.
Unsloth: Saving prescriptionn/pytorch_model.bin...


README.md:   0%|          | 0.00/617 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/988M [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/kharshita590/prescriptionn


In [None]:
# Install pyngrok, the Python wrapper imported in the next cell to open a tunnel.
# NOTE(review): the version is not pinned.
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok
import requests

# SECURITY: never hard-code the ngrok authtoken in a notebook. The token that
# used to be embedded in this cell has been exposed and should be rotated in the
# ngrok dashboard. Read it from the environment, or prompt for it without echo.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Start a tunnel on port 8000
public_tunnel = ngrok.connect(8000)
print("Ngrok public URL:", public_tunnel.public_url)

# NOTE(review): this notebook does not show a server being started on port 8000;
# the request below can only succeed if something is listening there — confirm.
url = public_tunnel.public_url + "/generate"
data = {"profile_text": "Treatment of Bacterial Infections"}
response = requests.post(url, json=data, timeout=60)

# The tunnel may answer with an HTML/plain-text error page rather than JSON, in
# which case an unconditional response.json() raises JSONDecodeError. Report the
# failure readably instead of crashing the cell.
if not response.ok:
    print(f"Request failed with HTTP {response.status_code}:")
    print(response.text[:500])
else:
    try:
        print(response.json())
    except ValueError:
        print("Response was not valid JSON:")
        print(response.text[:500])


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Ngrok public URL: https://01f5-35-221-248-121.ngrok-free.app




JSONDecodeError: Expecting value: line 1 column 1 (char 0)