In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Pull the report CSV straight from blob storage into a DataFrame.
df = pd.read_csv(
    'https://dicom5c.blob.core.windows.net/public/impression_300_llm.csv'
)

# Hold out exactly 30 rows for evaluation; the fixed random_state makes the
# split repeatable across kernel restarts.
train_df, eval_df = train_test_split(
    df,
    random_state=42,
    test_size=30,
)

# Report both partition sizes as a quick sanity check on the split.
print(f"Training set size: {len(train_df)}")
print(f"Evaluation set size: {len(eval_df)}")

Training set size: 300
Evaluation set size: 30


In [7]:
# Preview the first five training rows (head() defaults to n=5).
train_df.head()

Unnamed: 0,Report Name,History,Observation,Impression
262,CT Thorax Plain,H/O RTA,Minimal bilateral pleural effusion/haemothorac...,\n Minimal bilateral pleural effusion/haemoth...
297,MRI Scrotum Plain,H/O SCROTUM INJURY SEVERE PAIN AND SWELLING 3 ...,Subtle cutaneous irregularity with minimal ede...,Subtle cutaneous irregularity with minimal ede...
210,MRI Brain Plain and Contrast,SEIZURE,T2/FLAIR heterogeneous signal intensity in lef...,T2/FLAIR heterogeneous signal intensity in lef...
232,CT Abdomen & Pelvis Triphasic - Female,A CASE OF OBSTRUCTIVE JAUNDICE PARIAMPULLARY ...,There is an irregular heterogeneously enhancin...,An irregular heterogeneously enhancing lesion ...
272,MRI Spine Dorso Lumbar Plain,"INJURY, PAIN",Burst compression fracture of L1 vertebral bod...,Burst compression fracture of L1 vertebral bod...


In [8]:
# Preview the first five held-out rows (head() defaults to n=5).
eval_df.head()

Unnamed: 0,Report Name,History,Observation,Impression
9,CT Angiography Peripheral Legs,FOLLOW UP REQUISITION,Study is grossly limited due to streak artefac...,Absent contrast opacification of right anterio...
164,CT Abdomen & Pelvis Plain - Male,"RTA TODAY,WARMTH AND TENDERNESS",- Is normal in size and shows uniform density...,No hemo or pneumoperitoneum. No solid abdomina...
139,CT Abdomen & Pelvis Plain and Contrast - Female,PAIN IN LEFT SIDE LOWER ABDOMEN,The liver is mildly enlarged in size. no area...,Mild hepatosplenomegaly. Minimal wall thicken...
46,MRI MSK Knee Plain - Left,16 MONTH BACK INJURY NOW PAIN,Bones around knee joint show normal signals. T...,No fracture or dislocation in current study. G...
94,MRI Brachial Plexus,POST OESOPHAGESTOMY KNOWN CASE OF CA OSEOPHAGU...,Cervical Spine: Mild to moderate degenerative ...,No nerve root avulsion or pseudomeningocele fo...


In [2]:
%%capture
# Environment setup cell. The %%capture magic above must stay on the first
# line of the cell; it silences the (long) pip output of everything below.

# Step 1: install the released unsloth package.
!pip install unsloth

# Step 2: remove that release and reinstall unsloth from the GitHub repository
# with the "colab-new" extra. --no-cache-dir forces a fresh download, and
# --upgrade replaces any copy that is still present.
# NOTE(review): nothing here is version-pinned, so re-running later may pull
# different code than the run recorded in this notebook.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

import torch
# Step 3: flash-attn is only installed when the GPU's major CUDA compute
# capability is 8 or higher. --no-deps keeps pip from touching the packages
# installed above.
# NOTE(review): get_device_capability() needs a visible CUDA device; on a
# CPU-only runtime this line is expected to raise - confirm before reuse.
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [3]:
from unsloth import FastLanguageModel
import torch
# Loading options. These names are kept at module level because later cells
# (trainer construction, evaluation tokenisation) read max_seq_length.
max_seq_length = 2048  # token budget per sequence
dtype = None           # presumably lets Unsloth choose the compute dtype - confirm in Unsloth docs
load_in_4bit = True    # matches the pre-quantised "bnb-4bit" checkpoint named below

# Collect the keyword arguments in one place so the call itself stays short.
load_options = {
    "model_name": "unsloth/gemma-2-9b-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}

# Downloads the checkpoint on first use and returns the model together with
# its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [9]:
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


# Columns that are concatenated, in this order and separated by newlines, to
# form one training text per report.
text_columns = ['Report Name', 'History', 'Observation', 'Impression']

# Every training text must end with the tokenizer's end-of-sequence token.
# Without it the fine-tuned model never sees where an impression ends and
# keeps generating past it. Falls back to "" if the tokenizer defines none.
eos_token = tokenizer.eos_token or ""

# Replace missing cells with "" and coerce everything to str so a single NaN
# (which pandas stores as a float) cannot raise TypeError during the join.
train_fields = train_df[text_columns].fillna("").astype(str)

# `dataset` stays a list of {"text": ...} records, as before.
dataset = [
    {"text": "\n".join(fields) + eos_token}
    for fields in train_fields.itertuples(index=False, name=None)
]

# Single-column Hugging Face dataset consumed by SFTTrainer via
# dataset_text_field="text".
hf_dataset = Dataset.from_dict({"text": [record["text"] for record in dataset]})


from peft import PeftModel  # local import: only needed for the re-run guard below

# LoRA hyperparameters. Adapters are attached only to the attention query and
# value projections.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    # Declaring the task makes PEFT return its causal-LM wrapper instead of
    # the generic PeftModel, so generation helpers behave as expected.
    task_type="CAUSAL_LM",
)

# Guard against wrapping twice: this cell rebinds `model`, so re-running it
# would otherwise try to add another adapter to an already-wrapped model.
# NOTE(review): Unsloth documents FastLanguageModel.get_peft_model as its own
# wrapper for this step; consider switching if memory or speed is a problem.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)


# Probe bf16 support once; exactly one of fp16 / bf16 is then enabled.
use_bf16 = is_bfloat16_supported()

# Optimisation settings: 2 sequences per device x 4 accumulation steps per
# optimizer update, for a fixed 60 steps with a short linear warm-up.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    max_steps=60,
    warmup_steps=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the "text" column of hf_dataset. No evaluation
# dataset is passed here, and packing is disabled so each report is its own
# training sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=hf_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    dataset_num_proc=2,
)


Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [17]:

# Fields joined with newlines, in this order, into one text per held-out report.
eval_text_columns = ['Report Name', 'History', 'Observation', 'Impression']

# One plain tuple of field values per row; "\n".join assembles the same string
# the explicit field-by-field concatenation would.
eval_texts = [
    "\n".join(fields)
    for fields in eval_df[eval_text_columns].itertuples(index=False, name=None)
]

# Keep the list-of-records form available under its original name.
eval_dataset = [{"text": text} for text in eval_texts]

# Single-column Hugging Face dataset holding the evaluation texts.
hf_eval_dataset = Dataset.from_dict({"text": eval_texts})


def _tokenize_batch(batch):
    """Tokenize a batch of texts, padding or truncating each to max_seq_length."""
    return tokenizer(
        batch["text"],
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
    )


# Tokenize in two worker processes, then drop the raw text so only the
# tokenizer's output columns remain.
tokenized_eval_dataset = hf_eval_dataset.map(
    _tokenize_batch,
    batched=True,
    num_proc=2,
).remove_columns(["text"])

print("done")

Map (num_proc=2):   0%|          | 0/30 [00:00<?, ? examples/s]

done
