# The following experiment utilizes deepseek, and parameter efficient fine-tuning to 'teach' the model to output medical prescriptions.

# The next experiment will be aimed at generating tabular datasets based on prompts.

In [2]:
!pip install -U vllm torch transformers datasets accelerate peft bitsandbytes

Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)


## vllm is needed before torch, otherwise issues - https://github.com/vllm-project/vllm/issues/431

In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

model_name = "deepseek-ai/deepseek-llm-7b-base"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16  # Use float16 for faster computation
)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


# Apply LoRA for memory-efficient fine-tuning
lora_config = LoraConfig(
    r=8,  # Low-rank adaptation size
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    lora_dropout=0.05,
    bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✅ DeepSeek LLM Loaded with LoRA and 4-bit Precision!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569
✅ DeepSeek LLM Loaded with LoRA and 4-bit Precision!


In [19]:
from datasets import load_dataset

# Load IMDB dataset
dataset = load_dataset("imdb")

# Display dataset structure and a few samples
print("Dataset Structure:")
print(dataset)

print("Sample Data:")
print(dataset["train"][0])
print(dataset["train"][1])

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
Sample Data:
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. I

In [21]:
from datasets import load_dataset

# Load IMDB dataset
dataset = load_dataset("rungalileo/medical_transcription_40")

# Display dataset structure and a few samples
print("Dataset Structure:")
print(dataset)

print("Sample Data:")
print(dataset["train"][0])
print(dataset["train"][1])

Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 4499
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})
Sample Data:
{'id': 3614, 'text': 'EXAM: , CT scan of the abdomen and pelvis without and with intravenous contrast.,CLINICAL INDICATION: , Left lower quadrant abdominal pain.,COMPARISON: , None.,FINDINGS: , CT scan of the abdomen and pelvis was performed without and with intravenous contrast.  Total of 100 mL of Isovue was administered intravenously.  Oral contrast was also administered.,The lung bases are clear.  The liver is enlarged and decreased in attenuation.  There are no focal liver masses.,There is no intra or extrahepatic ductal dilatation.,The gallbladder is slightly distended.,The adrenal glands, pancreas, spleen, and left kidney are normal.,A 12-mm simple cyst is present in the inferior pole of the right kidney.  There is no hydronephrosis or hydroureter.

In [22]:
def tokenize_function(examples):
    inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs["labels"] = inputs["input_ids"].copy()  # Use input_ids as labels for causal LM
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Verify tokenized sample
print("Tokenized Sample with Labels:")
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/4499 [00:00<?, ? examples/s]

Tokenized Sample with Labels:
{'id': 3614, 'text': 'EXAM: , CT scan of the abdomen and pelvis without and with intravenous contrast.,CLINICAL INDICATION: , Left lower quadrant abdominal pain.,COMPARISON: , None.,FINDINGS: , CT scan of the abdomen and pelvis was performed without and with intravenous contrast.  Total of 100 mL of Isovue was administered intravenously.  Oral contrast was also administered.,The lung bases are clear.  The liver is enlarged and decreased in attenuation.  There are no focal liver masses.,There is no intra or extrahepatic ductal dilatation.,The gallbladder is slightly distended.,The adrenal glands, pancreas, spleen, and left kidney are normal.,A 12-mm simple cyst is present in the inferior pole of the right kidney.  There is no hydronephrosis or hydroureter.,The appendix is normal.,There are multiple diverticula in the rectosigmoid.  There is evidence of focal wall thickening in the sigmoid colon (image #69) with adjacent fat stranding in association with a d

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,
)

print("✅ WandB Disabled!")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ WandB Disabled!


In [24]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(500))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [25]:
small_train_dataset

Dataset({
    features: ['id', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [26]:
small_test_dataset

Dataset({
    features: ['id', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [30]:
small_test_dataset['label']

[1,
 13,
 25,
 13,
 33,
 33,
 33,
 25,
 25,
 39,
 21,
 25,
 36,
 9,
 25,
 13,
 25,
 27,
 15,
 38,
 25,
 27,
 33,
 23,
 36,
 36,
 25,
 13,
 25,
 13,
 28,
 13,
 6,
 25,
 25,
 0,
 25,
 9,
 33,
 25,
 31,
 9,
 9,
 38,
 25,
 4,
 25,
 25,
 36,
 30,
 21,
 6,
 9,
 22,
 25,
 36,
 25,
 34,
 25,
 13,
 6,
 25,
 14,
 23,
 38,
 36,
 21,
 33,
 38,
 15,
 15,
 25,
 23,
 15,
 33,
 25,
 17,
 23,
 23,
 38,
 13,
 24,
 31,
 25,
 33,
 25,
 30,
 25,
 32,
 13,
 13,
 14,
 38,
 5,
 7,
 25,
 38,
 9,
 15,
 3]

In [31]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset.remove_columns(["id", "text"]),  # Remove raw text column
    eval_dataset=small_test_dataset.remove_columns(["id", "text"]),
)

print("🚀 Trainer Initialized!")


🚀 Trainer Initialized!


In [32]:
torch.cuda.empty_cache()
print("✅ Cleared CUDA Cache")


✅ Cleared CUDA Cache


In [33]:
print("🚀 Starting Fine-Tuning...")
trainer.train()

🚀 Starting Fine-Tuning...


Epoch,Training Loss,Validation Loss
0,No log,No log


TrainOutput(global_step=62, training_loss=1.6084028674710182, metrics={'train_runtime': 566.3396, 'train_samples_per_second': 0.883, 'train_steps_per_second': 0.109, 'total_flos': 9896307481116672.0, 'train_loss': 1.6084028674710182, 'epoch': 0.992})

# Conclusion - *needs a lot more training*

In [37]:
def generate_prediction(input_text):
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example reviews
prescriptions = [
    " His nose was broken ",
    " Heart needs to be checked.  ",
    " Lungs need CT scanning for further analysis ",
    "  Radiology "
]

# Run predictions
for presc in prescriptions:
    print(f"Review: {presc}")
    print(f"Predicted : {generate_prediction(presc)}")
    print("-" * 80)


Review:  His nose was broken 
Predicted :  His nose was broken 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a broken nose. 
- He had a 