In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 scipy utils

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m35.4 MB/s[0

# Import Python Packages

In [None]:
import os
import torch
from datasets import *
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model,  prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the token in the notebook: take it from the HF_TOKEN environment
# variable and fall back to an interactive (non-echoing) prompt.
access_token_read = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (read): ")

login(token = access_token_read)
print("done")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
done


# Data Preprocessing Utils

In [None]:
from dataclasses import dataclass, field
from typing import Optional

import transformers
from transformers import Trainer


def modify_special_tokens(tokenizer):
    """Register the pad/eos/bos/unk special tokens on ``tokenizer`` and pin their ids.

    The pad token is registered with the same string ("<s>") and the same id (1)
    as the BOS token, so padding and BOS are indistinguishable by id afterwards.

    Args:
        tokenizer: object exposing ``add_special_tokens`` and the ``*_token_id``
            attributes assigned below.

    Returns:
        The same ``tokenizer`` object, modified in place.
    """
    special_tokens = dict(
        pad_token="<s>",
        eos_token="</s>",
        bos_token="<s>",
        unk_token="<unk>",
    )
    tokenizer.add_special_tokens(special_tokens)

    # Fixed ids, assigned in the same order as before: eos, bos, unk, pad.
    for attribute, token_id in (
        ("eos_token_id", 2),
        ("bos_token_id", 1),
        ("unk_token_id", 0),
        ("pad_token_id", 1),
    ):
        setattr(tokenizer, attribute, token_id)

    return tokenizer


@dataclass
class ModelArguments:
    """Model selection options."""

    # Checkpoint name or path; defaults to "facebook/opt-125m".
    model_name_or_path: Optional[str] = "facebook/opt-125m"


@dataclass
class DataArguments:
    """Data-related options."""

    # Location of the training data; None when not supplied.
    data_path: str = field(
        metadata={"help": "Path to the training data."},
        default=None,
    )
    # Off by default.
    lazy_preprocess: bool = field(default=False)


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` with project-specific fields and defaults.

    Note that this class has the same name as the ``TrainingArguments`` imported
    from ``transformers`` in the import cell, so after this cell runs the bare
    name ``TrainingArguments`` refers to this subclass.
    """

    cache_dir: Optional[str] = None
    optim: str = "adamw_torch"
    model_max_length: int = field(
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
        default=2048,
    )
    remove_unused_columns: bool = False
    dataloader_num_workers: int = 16


# Prompt templates keyed by style name. Every template has a {note} placeholder
# (the discharge summary) and a {question} placeholder (the instruction);
# get_prompt() picks one of them based on the model name.
PROMPT_DICT = {
    # Project prompt with explicit begin/end markers around note and instruction.
    "ours": """You are an intelligent clinical languge model.
Below is a snippet of patient's discharge summary and a following instruction from healthcare professional.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.

[Discharge Summary Begin]
{note}
[Discharge Summary End]

[Instruction Begin]
{question}
[Instruction End]
""",
    # Instruction/Input/Response layout; ends right after "### Response:" (no trailing newline).
    "alpaca": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{question}\n\n### Input:\n{note}\n\n### Response:"
    ),
    # Same layout as "alpaca" but ends with a newline after "### Response:".
    "medalpaca": (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
        "\n\n### Instruction:\n{question}\n\n### Input:\n{note}\n\n### Response:\n"
    ),
    # USER/ASSISTANT chat layout; note the template starts and ends with a newline.
    "chat": """
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: [The start of the Discharge Summary]
{note}
[The end of the Discharge Summary]
{question} ASSISTANT:
""",
}


def get_prompt(model_name):
    """Pick the prompt template that matches ``model_name``.

    Prints which template family was selected and returns the template string
    from ``PROMPT_DICT``. For the two plain LLaMA checkpoints listed below the
    "ours" template is returned with "\\nResponse: " appended.

    Args:
        model_name: model identifier string.

    Returns:
        The prompt template (with ``{note}`` / ``{question}`` placeholders).
    """
    plain_llama_checkpoints = ("decapoda-research/llama-7b-hf", "chaoyi-wu/PMC_LLAMA_7B")
    # chatdoctor, alpaca
    alpaca_checkpoints = ("chavinlo/alpaca-native", "zl111/ChatDoctor")

    suffix = ""
    if model_name in plain_llama_checkpoints:
        banner, template_key, suffix = "Using Ours+Response Prompt", "ours", "\nResponse: "
    elif model_name in alpaca_checkpoints:
        banner, template_key = "Using Alpaca Prompt", "alpaca"
    elif model_name == "medalpaca/medalpaca-7b":
        banner, template_key = "Using MedAlpaca Prompt", "medalpaca"
    elif "vicuna" in model_name or "clinical-camel" in model_name:
        banner, template_key = "Using Vicuna Prompt", "chat"
    else:
        # Anything unrecognised falls back to the project prompt.
        banner, template_key = "Using Our Prompt", "ours"

    print(banner)
    return PROMPT_DICT[template_key] + suffix

# Dataset Preprocessing and Collation

In [None]:
import copy
import io
import json
import logging
from dataclasses import dataclass
from typing import Dict, Sequence

import torch
import transformers
from torch.utils.data import Dataset

from utils import *

#if "A100" in torch.cuda.get_device_name():
#    from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn

#    replace_llama_attn_with_flash_attn()


# Sentinel written into label positions that must not contribute to the loss
# (prompt tokens in preprocess(), label padding in the collator). -100 is the
# default ignore_index of torch.nn.CrossEntropyLoss.
IGNORE_INDEX = -100

# Prompt template used by SupervisedDataset to format every training record.
# It is filled with str.format_map(), so each record must provide the {note}
# and {question} keys. The text is the same as PROMPT_DICT["ours"].
PROMPT = """You are an intelligent clinical languge model.
Below is a snippet of patient's discharge summary and a following instruction from healthcare professional.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.

[Discharge Summary Begin]
{note}
[Discharge Summary End]

[Instruction Begin]
{question}
[Instruction End]
"""


def jload(f, mode="r"):
    """Load a JSON document and return the parsed object (dict, list, ...).

    Args:
        f: a path to open, or an already open file object (``io.IOBase``).
        mode: mode used when ``f`` is a path and has to be opened here.

    Returns:
        Whatever ``json.load`` parses from the file.

    Raises:
        OSError: if the path cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.

    The file is always closed before returning -- including a file object
    passed in by the caller, and also when parsing fails.
    """
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    # The context manager guarantees the handle is closed even if json.load raises.
    with f:
        return json.load(f)


def _tokenize_fn(
    strings: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    max_length: int = 2048,
) -> Dict:
    """Tokenize a list of strings, one string at a time.

    Args:
        strings: texts to tokenize.
        tokenizer: callable tokenizer returning an object with an ``input_ids``
            tensor whose first row holds the token ids of the text.
        max_length: texts longer than this many tokens are truncated.

    Returns:
        A dict with ``input_ids`` / ``labels`` (the same list of 1-D id tensors)
        and ``input_ids_lens`` / ``labels_lens`` (the same list of token counts).
    """
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=max_length,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    # Every text is tokenized on its own, so no padding is ever added and the
    # row length IS the token count. Counting `!= pad_token_id` instead is wrong
    # whenever the pad id is shared with a real token (here pad and BOS are both
    # id 1): the count comes out one short and the last prompt token would be
    # left unmasked in the labels.
    input_ids_lens = labels_lens = [ids.shape[0] for ids in input_ids]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+answer pairs and mask the prompt part of the labels.

    Each source is concatenated with its target and tokenized; the sources are
    tokenized once more on their own to obtain the prompt lengths. The labels
    are a deep copy of the input ids in which the first ``prompt length``
    positions are overwritten with ``IGNORE_INDEX``.

    Returns:
        ``{"input_ids": [...], "labels": [...]}`` -- two parallel lists of tensors.
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]
    tokenized_full = _tokenize_fn(full_texts, tokenizer)
    tokenized_prompts = _tokenize_fn(sources, tokenizer)

    input_ids = tokenized_full["input_ids"]
    # Deep copy so that masking the labels never touches the inputs.
    labels = copy.deepcopy(input_ids)
    for row, prompt_len in zip(labels, tokenized_prompts["input_ids_lens"]):
        row[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}


class SupervisedDataset(Dataset):
    """Torch dataset of tokenized (prompt + answer) examples for supervised fine-tuning.

    The JSON file at ``data_path`` is read with ``jload`` and iterated as a
    sequence of records; each record is formatted into ``PROMPT`` (so it needs
    the template's placeholders) and must contain an ``answer`` entry.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()
        logging.warning("Loading data...")
        records = jload(data_path)

        # Trim leading/trailing newlines from every string field, in place.
        for record in records:
            for field_name, field_value in record.items():
                if isinstance(field_value, str):
                    record[field_name] = field_value.strip("\n")

        logging.warning("Formatting inputs...")

        # Prompt text per record, then the answer followed by the EOS token.
        sources = list(map(PROMPT.format_map, records))
        targets = [f"{record['answer']}{tokenizer.eos_token}" for record in records]

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        """Number of examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` / ``labels`` pair of example ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads ``input_ids`` with the tokenizer's pad id and ``labels`` with
    ``IGNORE_INDEX``, and returns a boolean ``attention_mask`` that is True on
    every real token and False on padding.
    """

    # Only ``pad_token_id`` is read from the tokenizer.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build one padded batch.

        Args:
            instances: dataset items, each with 1-D ``input_ids`` and ``labels`` tensors.

        Returns:
            Dict with ``input_ids``, ``labels`` and ``attention_mask``, each of
            shape (batch, longest sequence in the batch).
        """
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]
        # Remember the true lengths before padding.
        lengths = torch.tensor([ids.shape[0] for ids in input_ids])

        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )
        # Derive the mask from the lengths rather than from `!= pad_token_id`:
        # the pad id may be shared with a real token (pad and BOS are both id 1
        # here), and comparing ids would mask those real tokens out as well.
        positions = torch.arange(input_ids.shape[1], device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < lengths.to(input_ids.device).unsqueeze(1)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )


def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    data_args,
    train_path: str = "train_set.json",
    eval_path: str = "validation_set.json",
) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Args:
        tokenizer: tokenizer handed to both datasets and to the collator.
        data_args: accepted for interface compatibility; currently not used.
        train_path: JSON file with the training records.
        eval_path: JSON file with the evaluation records.

    Returns:
        Dict with ``train_dataset``, ``eval_dataset`` and ``data_collator``.
    """
    train_dataset = SupervisedDataset(
        tokenizer=tokenizer, data_path=train_path
    )
    print('Finished Training set processing')
    validation_dataset = SupervisedDataset(
        tokenizer=tokenizer, data_path=eval_path
    )
    print('Finished Test set processing')
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(
        train_dataset=train_dataset, eval_dataset=validation_dataset, data_collator=data_collator
    )

# Operate on Dataset

In [None]:
# Load dataset (you can process it here)
# The instruction dataset to use
dataset_name = "starmpcc/Asclepius-Synthetic-Clinical-Notes"

dataset = load_dataset(dataset_name)

# 80/20 split of the 'train' split. The seed is fixed so that the same rows end
# up in each partition on every run (the split is shuffled by default).
train_test = dataset['train'].train_test_split(test_size=0.2, seed=42)


#dataset_processed = DatasetDict({
#    'train': train_test['train'],
#    'test': train_test['test']
#  )


In [None]:
# Function to reduce dataset
def reduce_dataset(ds, num_rows):
    """Return the first ``num_rows`` rows of ``ds`` (all of them if it has fewer).

    ``ds`` must support ``len()`` and ``select(indices)``.
    """
    available = len(ds)
    keep = available if available < num_rows else num_rows
    return ds.select([*range(keep)])

# Row budget per split
num_train_rows = 5000
num_test_rows = 1000  # Adjust as necessary

# Keep only the first rows of each partition, under the same split names
dataset_reduced = DatasetDict(
    {
        split: reduce_dataset(train_test[split], row_limit)
        for split, row_limit in (('train', num_train_rows), ('test', num_test_rows))
    }
)


In [None]:
# Report the size of each reduced split (the second line is the test split).
print(f"Number of rows in the training partition: {len(dataset_reduced['train'])}")
print(f"Number of rows in the test partition: {len(dataset_reduced['test'])}")

Number of rows in the training partition: 5000
Number of rows in the training partition: 1000


In [None]:
# Print every field of the first training example as "name: value".
for i in range(1):
    example = train_test['train'][i]
    for key in example:
        value = example[key]
        print(f"{key}: {value}")
    print()

patient_id: 91176
note: Discharge Summary:

Admission Date: May 2006

Discharge Date: N/A

Primary Diagnosis: CD grade Шb according to the Marsh classification, portal hypertension, splenomegaly, and non-specific chronic hepatitis.

Hospital Course:

Patient presented with malaise, weight loss, and edema of the lower limbs. Physical examination showed cachexia, anemia, thrombocytopenia, and leucopenia. Abdominal sonography revealed splenomegaly with large amounts of ascites. Duplex doppler ultrasonography confirmed portal hypertension. CT scan showed no evidence of vascular obstruction in splenoportal axis. Eosophagogastroduodenoscopy showed esophageal varices, gastric fundal varices, and portal hypertensive gastropathy. Biopsy results showed CD grade Шb and serologic studies revealed positive anti tissue transglutaminase (tTG) and anti endomysial antibody (EMA). A gluten-free diet was advised. Patient did not return for follow-up visits until 1 year later due to lack of compliance wit

In [None]:
# Print every field of the first held-out example as "name: value".
for i in range(1):
    example = train_test['test'][i]
    for key in example:
        value = example[key]
        print(f"{key}: {value}")
    print()

patient_id: 76205
note: Hospital Course Summary

Patient ID: 12345
Date of admission: October 2009
Date of discharge: February 2010

Diagnosis: Primary CNS lymphoma (PCNSL)

Hospital course:
The patient presented with complaints of right sided hemiparesis. An MRI of the brain showed a large mass in the left frontal area and a biopsy showed that the tumor consisted of a diffuse proliferation of large lymphoid cells. A CD20 immunostain was strongly positive in the tumor cells. A ki-67 immunostain showed a proliferation index of approximately 90%. A bcl-2 immunostain was positive, whereas immunostains for bcl-1, CD5, and CD10 were negative. A FISH study was negative for MYC translocation. Workup for systemic disease including bone marrow biopsy, CT scans, and a lumbar puncture for CSF examination was all negative.

The patient was diagnosed with PCNSL and was treated with 8 cycles of high dose methotrexate, with significant improvement in his symptoms. He was given adjuvant radiation afte

In [None]:
import json

# Write each reduced split as a JSON array of records; these are the files that
# make_supervised_data_module() reads back.
for split_name, json_path in (('train', 'train_set.json'), ('test', 'validation_set.json')):
    data_list = dataset_reduced[split_name].to_pandas().to_dict(orient="records")
    with open(json_path, 'w') as f:
        json.dump(data_list, f, indent=4)

#train_test['train'].to_json('train_set.json', orient="records")#, indent=4)
#train_test['test'].to_json('validation_set.json', orient="records")#, indent=4)



# Quantization Settings

In [None]:

# ------------------------------------------------------------------------------
# bitsandbytes quantization settings
# ------------------------------------------------------------------------------

use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute
bnb_4bit_quant_type = "nf4"         # "fp4" or "nf4"; nf4 = 4-bit NormalFloat (QLoRA)
use_nested_quant = True             # nested (double) quantization

# Resolve the dtype name to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Check GPU compatibility with bfloat16.
# The device is only queried when CUDA is present, because
# torch.cuda.get_device_capability() raises on a machine without a GPU.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    # Index instead of unpacking into `_`, which would clobber IPython's last-result variable.
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


# Loading Model and Tokenizer

In [None]:
# Base checkpoint on the Hugging Face hub
# (alternative considered for space reasons: "NousResearch/Llama-2-7b-hf")
model_name ="meta-llama/Llama-2-7b-hf"

# Name under which the fine-tuned model is saved
new_model = "llama-2-7b-clinical-summarization"

# Place the whole model on GPU 0
device_map = {"": 0}

# Load the base model quantized according to bnb_config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)

print("Loaded Model")



In [None]:
# Tokenizer of the same checkpoint as the model
model_name ="meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
# Pad with EOS for now; modify_special_tokens() is applied in the next cell and
# sets the pad token to "<s>" instead.
tokenizer.pad_token = tokenizer.eos_token

# Create Dataset Module

In [None]:
# Pin the special tokens first: the datasets are tokenized while they are built.
tokenizer = modify_special_tokens(tokenizer)

# Reads train_set.json / validation_set.json and returns the two datasets plus the collator.
data_module = make_supervised_data_module(tokenizer, data_args=None)



Finished Training set processing
Finished Test set processing


# FT Memory Usage and Lora

**Fine Tuning Memory Considerations, Lora Hyperparameters**

 Naively fine-tuning a 7B-parameter model in 32-bit precision requires about 7 x 4 x 4 = 112 GB of VRAM: 7 billion parameters x 4 bytes each x 4 copies (the parameters, their gradients, and the two AdamW optimizer states).

**LoRA comparison**:

LLama 2 7 billion Hidden Dim (d_model) = 4096  >> Lora Rank = {16, 32, 128, 256}

LLama 2 13 billion Hidden Dim (d_model) = 5120  >> Lora Rank = {16, 32, 128, 256}

**QLoRA** offers 33% memory savings at the cost of a 39% increase in runtime.

In [None]:




# ------------------------------------------------------------------------------
# QLoRA / LoRA adapter settings
# ------------------------------------------------------------------------------

lora_r = 64          # LoRA rank
lora_alpha = 64      # alpha parameter for LoRA scaling
lora_dropout = 0.1   # dropout probability for LoRA layers

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "query_key_value",
    "gate_proj",
    "down_proj",
    "up_proj",
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
]

# Build the LoRA configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)



In [None]:
import pickle

# Cache the tokenized datasets and collator on disk so they can be reloaded
# without tokenizing again.
with open('data_module.pickle', 'wb') as handle:
    handle.write(pickle.dumps(data_module, protocol=pickle.HIGHEST_PROTOCOL))


# Training Hyperparameters

In [None]:
# ------------------------------------------------------------------------------
# TrainingArguments parameters
# ------------------------------------------------------------------------------

output_dir = "./results"   # where model predictions and checkpoints are stored
num_train_epochs = 1       # number of training epochs
max_steps = -1             # number of training steps (overrides num_train_epochs)

# fp16/bf16 training switches (set bf16 to True with an A100)
fp16 = bf16 = False

# Per-GPU batch sizes for training and for evaluation
per_device_train_batch_size = per_device_eval_batch_size = 4

gradient_accumulation_steps = 1   # update steps to accumulate the gradients for
gradient_checkpointing = True     # enable gradient checkpointing
max_grad_norm = 0.3               # maximum gradient norm (gradient clipping)

# Optimizer and schedule
optim = "paged_adamw_32bit"
learning_rate = 2e-4              # initial learning rate (AdamW optimizer)
weight_decay = 0.001              # applied to all layers except bias/LayerNorm weights
lr_scheduler_type = "cosine"
warmup_ratio = 0.03               # ratio of steps for a linear warmup (from 0 to learning rate)

# Group sequences into batches with same length
# (saves memory and speeds up training considerably)
group_by_length = True

save_steps = 0        # save checkpoint every X update steps
logging_steps = 25    # log every X update steps

# ------------------------------------------------------------------------------
# SFT parameters
# ------------------------------------------------------------------------------

max_seq_length = 2048   # maximum sequence length to use
packing = False         # pack multiple short examples in the same input sequence

# Conduct Training

In [None]:
# Set training parameters.
# Every value comes from the hyperparameter cell; the evaluation batch size and
# the gradient-checkpointing switch declared there are now passed on as well
# instead of being silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    # NOTE(review): the hyperparameter cell declares logging_steps = 25, but 50
    # (matching eval_steps) is what has been used here -- confirm which is intended.
    logging_steps=50,
    learning_rate=learning_rate,
    evaluation_strategy="steps",
    eval_steps=50,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Build the supervised fine-tuning trainer.
# data_module supplies train_dataset, eval_dataset and data_collator.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    packing=packing,
    dataset_text_field="text",
    **data_module,
)

# Run the fine-tuning
trainer.train()

# Write the trained model to the `new_model` directory
trainer.model.save_pretrained(new_model)

In [None]:
# Load the TensorBoard notebook extension and open it on the logs under results/runs.
%load_ext tensorboard
%tensorboard --logdir results/runs

In [None]:
# Empty VRAM
import gc

# Drop the notebook-level references if they exist. `pipe` is not created
# anywhere in this notebook, so a plain `del pipe` fails on a fresh kernel.
globals().pop("model", None)
globals().pop("pipe", None)
globals().pop("trainer", None)

gc.collect()
# Hand the memory cached by the CUDA allocator back to the GPU.
torch.cuda.empty_cache()
gc.collect()

19965

In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

In [None]:
!pip install tensorboard

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting tensorboard
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.64.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading Markdown-3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-5.27.0-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none