In [20]:
import torch

if torch.cuda.is_available():
    print("GPU is available and being used.")
else:
    print("GPU is not available. Using CPU.")

GPU is available and being used.


In [None]:
!pip install transformers datasets accelerate peft bitsandbytes trl

In [7]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

## Prepare the dataset

For fine-tuning, we need a dataset formatted appropriately for the model. This typically involves pairs of input and output text, or just the output text if we're training a causal language model to predict the next token.

Since you mentioned oncology-specific data, you would typically load your own dataset here. For demonstration purposes, let's create a small dummy dataset in the expected format.

In [30]:
from datasets import Dataset

# This is a dummy dataset with expanded text entries.
# Replace this with loading your actual oncology data.
data = {'text': [
    "Patient presented to the oncology department with a persistent cough and shortness of breath. Imaging revealed a large mass in the upper lobe of the left lung. A biopsy confirmed the diagnosis of stage III adenocarcinoma of the lung with involvement of regional lymph nodes. The patient's medical history includes a long-standing smoking history and well-controlled hypertension.",
    "The multidisciplinary tumor board reviewed the patient's case and recommended a treatment plan. The plan includes initial treatment with platinum-based chemotherapy combined with concurrent radiation therapy to the primary tumor and involved lymph nodes. Following completion of chemoradiation, a restaging scan will be performed to assess response, and consolidation immunotherapy may be considered based on the patient's performance status and tumor characteristics.",
    "A core needle biopsy of the suspected metastatic lesion in the supraclavicular lymph node was performed. Histopathological examination confirmed the presence of malignant epithelial cells consistent with metastasis from the known lung adenocarcinoma. Immunohistochemical staining was positive for TTF-1 and negative for CK7, supporting a pulmonary origin. These findings confirm the advanced stage of the disease.",
    "Patient with a history of breast cancer presents for follow-up. Physical examination is unremarkable. Mammography shows no evidence of recurrence. Patient reports mild fatigue but is otherwise feeling well. Continue routine surveillance.",
    "New patient consultation for suspected prostate cancer. PSA levels are elevated. Digital rectal exam reveals a nodule. Biopsy is recommended to confirm diagnosis and determine Gleason score.",
    "Patient undergoing treatment for acute myeloid leukemia. Currently in remission after induction chemotherapy. Monitoring blood counts closely. Discussing options for consolidation therapy."
    # Add more expanded oncology-specific data here if needed
]}

dataset = Dataset.from_dict(data)

print(dataset)

Dataset({
    features: ['text'],
    num_rows: 6
})


## Set up the environment and Fine-tune the model

Now we'll set up the training arguments and use the `Trainer` class from the `transformers` library to fine-tune the model.

In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

# Load the original model first
model_kwargs = dict(attn_implementation="eager", torch_dtype=torch.bfloat16, use_cache=True, device_map="auto")
base_model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", **model_kwargs)





Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.97 GiB is free. Process 17445 has 12.77 GiB memory in use. Of the allocated memory 10.15 GiB is allocated by PyTorch, and 2.51 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and a smaller model
model_name = "EleutherAI/gpt-neo-125m" # Using EleutherAI/gpt-neo-125m as an alternative
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Using quantization with bitsandbytes to reduce memory usage
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    use_cache=False, # Set use_cache to False for training
    device_map="auto",
    load_in_4bit=True, # Load the model in 4-bit precision
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [31]:
# Add a padding token if the tokenizer doesn't have one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length")
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy() # Add labels for causal language modeling
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [33]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8, # LoRA attention dimension
    lora_alpha=16, # Alpha parameter for LoRA scaling
    lora_dropout=0.1, # Dropout probability for LoRA layers
    bias="none",
    task_type="CAUSAL_LM", # Task type for causal language modeling
)

# Apply LoRA to the base model
model = get_peft_model(base_model, lora_config)

# Print the number of trainable parameters
model.print_trainable_parameters()

trainable params: 294,912 || all params: 125,493,504 || trainable%: 0.2350




In [34]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt-neo-finetuned",  # Output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size per device during training - reduced to 1
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=200,
    report_to="none", # Disable reporting to wandb
)

# Initialize the Trainer
trainer = Trainer(
    model=model,  # Use the LoRA-adapted model
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [35]:
# Start training
trainer.train()

# Save the PEFT model
trainer.model.save_pretrained(training_args.output_dir)

Step,Training Loss


## Run Inference

Now that the model is fine-tuned, you can use it to generate text based on a given prompt.

In [44]:
# Load the fine-tuned model
# The fine-tuned adapter weights are saved in the output_dir specified in TrainingArguments
fine_tuned_model_path = training_args.output_dir

# Load the base model again (if not already loaded)
# model_name = "EleutherAI/gpt-neo-125m" # Or the model you used for fine-tuning
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# base_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)


# Load the LoRA adapter weights
model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

# Merge the LoRA weights with the base model weights for easier inference
model = model.merge_and_unload()

# Set the model to evaluation mode
model.eval()

# Define a prompt
prompt = "patient is having treatment"

# Tokenize the prompt and get input_ids and attention_mask
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids.cuda()
attention_mask = inputs.attention_mask.cuda()


# Generate text
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask, # Pass the attention mask
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id # Set the pad token id
    )

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

patient is having treatment for the disease.

The first report of the treatment was the first report of the treatment of the disease. The second report was the first report of the treatment of the disease. The third report was the first report of the treatment of the disease. The fourth report was the first report of the treatment of the disease.

Discussion

The first report of the treatment of the disease was the first report of the treatment of the disease. The second report was
