In [None]:
!pip install unsloth

!pip install torch
!pip install datasets
!pip install trl

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  D



In [None]:
import io

import pandas as pd

# Load the uploaded dataset (path is relative to the notebook's working directory)
file_path = "disease_diagnosis.csv"
df = pd.read_csv(file_path)

# DataFrame.info() writes its report to a stream and returns None, so storing
# `df.info()` directly would leave None in the summary. Capture the text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)

# Basic EDA summary
eda_summary = {
    "Head": df.head(),
    "Info": info_buffer.getvalue(),
    "Description": df.describe(),
    "Null Values": df.isnull().sum(),
    "Unique Values": df.nunique(),
    "Data Types": df.dtypes
}

# Keep the schema report visible in the cell output
print(eda_summary["Info"])

# Show the first 5 rows separately for visual clarity
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_ID           2000 non-null   int64  
 1   Age                  2000 non-null   int64  
 2   Gender               2000 non-null   object 
 3   Symptom_1            2000 non-null   object 
 4   Symptom_2            2000 non-null   object 
 5   Symptom_3            2000 non-null   object 
 6   Heart_Rate_bpm       2000 non-null   int64  
 7   Body_Temperature_C   2000 non-null   float64
 8   Blood_Pressure_mmHg  2000 non-null   object 
 9   Oxygen_Saturation_%  2000 non-null   int64  
 10  Diagnosis            2000 non-null   object 
 11  Severity             2000 non-null   object 
 12  Treatment_Plan       2000 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 203.3+ KB


Unnamed: 0,Patient_ID,Age,Gender,Symptom_1,Symptom_2,Symptom_3,Heart_Rate_bpm,Body_Temperature_C,Blood_Pressure_mmHg,Oxygen_Saturation_%,Diagnosis,Severity,Treatment_Plan
0,1,74,Male,Fatigue,Sore throat,Fever,69,39.4,132/91,94,Flu,Moderate,Medication and rest
1,2,66,Female,Sore throat,Fatigue,Cough,95,39.0,174/98,98,Healthy,Mild,Rest and fluids
2,3,32,Male,Body ache,Sore throat,Fatigue,77,36.8,136/60,96,Healthy,Mild,Rest and fluids
3,4,21,Female,Shortness of breath,Headache,Cough,72,38.9,147/82,99,Healthy,Mild,Rest and fluids
4,5,53,Male,Runny nose,Sore throat,Fatigue,100,36.6,109/106,92,Healthy,Mild,Rest and fluids


In [None]:
!pip install cuda-python



In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, Dataset
import pandas as pd

# Load your custom dataset.
# The CSV path is relative, so it is resolved against the current working directory.
file_path = 'disease_diagnosis.csv'
df = pd.read_csv(file_path)

# Convert dataset to Hugging Face Dataset format
# (this `dataset` is what the formatting step and the trainer consume below)
dataset = Dataset.from_pandas(df)

# Define the prompt structure for the dataset.
# Three positional slots, in order: system instruction, user message, assistant answer.
llama31_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>


{}<|eot_id|><|start_header_id|>user<|end_header_id|>


{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>


{}<|eot_id|>"""

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into chat-formatted training strings.

    Args:
        examples: A column-oriented batch (mapping of column name -> list of
            values), as passed by `Dataset.map(..., batched=True)`. Must contain
            "Oxygen_Saturation_%", "Heart_Rate_bpm", "Body_Temperature_C" and
            "Diagnosis".

    Returns:
        dict: {"text": [...]} with one formatted prompt per input row.

    Raises:
        KeyError: If any of the required columns is missing from the batch.
    """
    # The CSV has no "Result"/"Oxygen"/"PulseRate"/"Temperature" columns; use the
    # real column names. The system slot carries a fixed instruction rather than
    # the label, so the answer is not leaked into the prompt.
    system_instruction = (
        "You are a medical diagnosis assistant that infers likely conditions from patient vitals."
    )

    texts = []
    for oxygen, pulse, temperature, diagnosis in zip(
        examples["Oxygen_Saturation_%"],
        examples["Heart_Rate_bpm"],
        examples["Body_Temperature_C"],
        examples["Diagnosis"],
    ):
        user_message = f'Oxygen: {oxygen}, PulseRate: {pulse}, Temperature: {temperature}'
        texts.append(llama31_prompt.format(system_instruction, user_message, str(diagnosis)))
    return {"text": texts}

# Add the chat-formatted "text" column by running the formatter over batches
dataset = dataset.map(formatting_prompts_func, batched=True)

# Settings shared by model loading and the trainer
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the base model with LoRA adapters on the attention and MLP projections
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    random_state=3407,
)

# Set up the trainer
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 sequences per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

KeyError: 'Result'

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import Dataset
import pandas as pd

# Load the new dataset.
# The CSV path is relative, so it is resolved against the current working directory.
file_path = 'disease_diagnosis.csv'
df = pd.read_csv(file_path)

# Convert to Hugging Face dataset
# (this `dataset` is what the formatting step and the trainer consume below)
dataset = Dataset.from_pandas(df)

# Define LLaMA-style prompt format.
# Four positional slots, in order: oxygen saturation, heart rate, temperature, diagnosis.
llama31_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a medical diagnosis assistant that infers likely conditions from patient vitals.

<|eot_id|><|start_header_id|>user<|end_header_id|>

Oxygen Saturation: {}%, Heart Rate: {} bpm, Temperature: {}°C

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}

<|eot_id|>"""

# Formatting function
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into chat-formatted training strings.

    Args:
        examples: A column-oriented batch (mapping of column name -> list of
            values), as passed by `Dataset.map(..., batched=True)`. Must contain
            "Oxygen_Saturation_%", "Heart_Rate_bpm", "Body_Temperature_C" and
            "Diagnosis".

    Returns:
        dict: {"text": [...]} with one formatted prompt per input row.

    Raises:
        KeyError: If any of the required columns is missing from the batch.
    """
    # One pass over the four columns; the previously built `inputs` list was
    # never used and has been dropped.
    rows = zip(
        examples["Oxygen_Saturation_%"],
        examples["Heart_Rate_bpm"],
        examples["Body_Temperature_C"],
        examples["Diagnosis"],
    )
    texts = [
        llama31_prompt.format(o2, hr, temp, str(diagnosis))
        for o2, hr, temp, diagnosis in rows
    ]
    return {"text": texts}

# Run the formatter over batches to add the chat-formatted "text" column
dataset = dataset.map(formatting_prompts_func, batched=True)

# Settings shared by model loading and the trainer
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Inject LoRA adapters on the attention and MLP projections
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    random_state=3407,
)

# Training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 sequences per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# 3rd new one (latest fine-tuning)
# NOTE: this heading used to start with `!`, which made IPython hand it to bash
# as a command and produced a shell syntax error; it is a plain comment now.
from unsloth import FastLanguageModel
import torch
from datasets import Dataset
import pandas as pd

# Load the new dataset (path is relative to the notebook's working directory)
file_path = 'disease_diagnosis.csv'
df = pd.read_csv(file_path)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Define LLaMA-style prompt format.
# Four positional slots, in order: oxygen saturation, heart rate, temperature,
# and the assistant's answer. The assistant turn must be a placeholder: with a
# fixed sentence there, `str.format` silently ignores the 4th argument and the
# diagnosis label never reaches the training text.
llama31_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a medical diagnosis assistant that infers likely conditions from patient vitals. Based on the following symptoms and vital signs, provide a detailed diagnosis and reasoning behind the condition.

<|eot_id|><|start_header_id|>user<|end_header_id|>

Oxygen Saturation: {}%, Heart Rate: {} bpm, Temperature: {}°C

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}
<|eot_id|>"""

# Formatting function
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into chat-formatted training strings.

    Args:
        examples: A column-oriented batch (mapping of column name -> list of
            values), as passed by `Dataset.map(..., batched=True)`. Must contain
            "Oxygen_Saturation_%", "Heart_Rate_bpm", "Body_Temperature_C" and
            "Diagnosis".

    Returns:
        dict: {"text": [...]} with one formatted prompt per input row; the
        assistant turn of each prompt is "Diagnosis: <label>".

    Raises:
        KeyError: If any of the required columns is missing from the batch.
    """
    rows = zip(
        examples["Oxygen_Saturation_%"],
        examples["Heart_Rate_bpm"],
        examples["Body_Temperature_C"],
        examples["Diagnosis"],
    )

    # The request for reasoning already lives in the system prompt; the
    # assistant target is only the label, so the model is not trained to emit an
    # instruction ("Please explain your reasoning.") as part of its own answer.
    texts = [
        llama31_prompt.format(o2, hr, temp, f"Diagnosis: {diagnosis}")
        for o2, hr, temp, diagnosis in rows
    ]

    return {"text": texts}


# Run the formatter over batches to add the chat-formatted "text" column
dataset = dataset.map(formatting_prompts_func, batched=True)

# Settings shared by model loading and the trainer
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Inject LoRA adapters on the attention and MLP projections
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    random_state=3407,
)

# Training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 sequences per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)


/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `3rd new one(Latest finetuning)'


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Start the fine-tuning job; `trainer` comes from the trainer-setup cell above
trainer_stats = trainer.train()

# Saving, loading finetuned models
# To save the final model as LoRA adapters, either use Huggingface's `push_to_hub`
# for an online save or `save_pretrained` for a local save.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

# Export a GGUF copy into model/ using the q2_k quantization method.
# (The previous comment said "Q4_0", which did not match the method actually requested.)
model.save_pretrained_gguf("model", tokenizer, quantization_method=["q2_k"])

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msngtbanerjee-8[0m ([33msngtbanerjee-8-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,4.3826
2,4.3546
3,4.2168
4,3.8023
5,3.3465
6,2.7046
7,2.0491
8,1.4902
9,0.9889
10,0.7621


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.41 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 50%|█████     | 16/32 [00:01<00:01, 12.87it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:39<00:00,  3.11s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q2_k'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'


In [None]:
!pip install gdown
!gdown --id <file_id> --output unsloth.Q4_0.gguf

from google.colab import files
files.download('model/unsloth.Q4_0.gguf')

/bin/bash: line 1: file_id: No such file or directory


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Make sure you're in the build directory
%cd /content/llama.cpp/build

# Build the correct quantizer
!cmake --build . --target quantize-gptq


/content/llama.cpp/build
gmake: *** No rule to make target 'quantize-gptq'.  Stop.


In [None]:
# Use llamacpp to quantize from Q8 to Q4
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!make quantize
!./quantize unsloth.Q8_0.gguf unsloth.Q4_K_M.gguf Q4_K_M

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
/content/llama.cpp
Makefile:2: *** The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md.  Stop.
/bin/bash: line 1: ./quantize: No such file or directory


In [None]:
# Recursively archive the model/ directory into model.zip (paths are relative to the current working directory)
!zip -r model.zip model/


In [None]:
%%writefile Modelfile
# Ollama Modelfile. This cell is not Python: the %%writefile magic on the first
# line saves the content below to ./Modelfile instead of executing it.
# NOTE(review): FROM must point at a GGUF file that exists next to the Modelfile - confirm the file name.
FROM ./unsloth.Q8_0.gguf
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token"

In [None]:
# Recursively archive a folder into file.zip.
# NOTE(review): `Folder_To_Zip` looks like a placeholder name - replace it with the real directory before running.
!zip -r file.zip Folder_To_Zip

In [None]:
!./quantize unsloth.Q8_0.gguf unsloth.Q4_K_M.gguf Q4_K_M


/bin/bash: line 1: ./quantize: No such file or directory
