In [None]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install trl==0.12.2
%pip install -U bitsandbytes 
%pip install -U wandb

In [None]:
%pip install -U huggingface_hub 

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

import os, torch, wandb
from datasets import load_dataset

2025-01-29 19:50:55.575626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-29 19:50:55.575690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-29 19:50:55.577324: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    capability = torch.cuda.get_device_capability()[0]

    if capability >= 8:
        !pip install -qqq flash-attn
        torch_dtype = torch.bfloat16
        attn_implementation = "flash_attention_2"
        print("Using CUDA with FlashAttention")
    else:  # For T4 GPUs (compute capability 7.5)
        torch_dtype = torch.float16
        attn_implementation = "sdpa"  # SDPA is better for older GPUs
        print("Using CUDA with SDPA (Scaled Dot-Product Attention)")

else:
    device = torch.device("cpu")
    torch_dtype = torch.float16
    attn_implementation = "eager"
    print("Using CPU")

Using CUDA with SDPA (Scaled Dot-Product Attention)


In [3]:
base_model = "/kaggle/input/llama-3.2/transformers/1b/1"

In [4]:
# QLoRA config -- 4bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [5]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-0

In [6]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",
#     attn_implementation=attn_implementation
# )

# Before fine-tuning with the drug labels

In [None]:
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch_dtype,
#     device_map="auto",
# )

In [None]:
# input_data = {
#   "brand_name": "PRISMASOL BGK2/0",
#   "generic_name": "MAGNESIUM CHLORIDE, DEXTROSE MONOHYDRATE, LACTIC ACID, SODIUM CHLORIDE, SODIUM BICARBONATE AND POTASSIUM CHLORIDE",
#   "query": "Does the drug PRISMASOL BGK2/0 have any adverse reactions?"
# }

# messages = [
#     {"role": "user", "content": f"Brand Name: {input_data['brand_name']}\nGeneric Name: {input_data['generic_name']}\n\nQuery: {input_data['query']}"}
# ]
# messages = [{"role": "system", "content": "You are a helpful medical assistant."}] + messages

# prompt = tokenizer.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )

# outputs = pipe(prompt, max_new_tokens=1000, do_sample=True)

# print(outputs[0]["generated_text"])

In [None]:
# '''
#     Expected Adverse Reaction:
# '''
# expected = {
#     "adverse_reactions": [
#         "6 ADVERSE REACTIONS The following adverse reactions have been identified during postapproval use with these or other similar products and therefore may occur with use of PHOXILLUM or PRISMASOL. Because these reactions are reported voluntarily from a population of uncertain size, it is not always possible to reliably estimate their frequency or establish a causal relationship to drug exposure. \u2022 Metabolic acidosis \u2022 Hypotension \u2022 Acid-base disorders \u2022 Electrolyte imbalance including calcium ionized increased (reported in PRISMASOL solutions containing calcium), hyperphosphatemia, and hypophosphatemia \u2022 Fluid imbalance"
#       ],
# }

# Adverse Reaction

In [7]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
modules

['gate_proj', 'o_proj', 'v_proj', 'down_proj', 'up_proj', 'k_proj', 'q_proj']

In [8]:
from peft import LoraConfig, get_peft_model, PeftType, TaskType

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

# Apply PEFT to the model
model = get_peft_model(model, peft_config)
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [9]:
dataset_path = "/kaggle/input/drug-label-filtered/adverse_reaction.json"

dataset = load_dataset('json', data_files=dataset_path)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(examples):
    inputs = [
        f"Input: {input_text} Response: {response_text}"
        for input_text, response_text in zip(examples["input_text"], examples["response_text"])
    ]
    # Tokenize
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/117619 [00:00<?, ? examples/s]

In [10]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and evaluation sets
train_test_split_ratio = 0.8
split_dataset = tokenized_dataset["train"].train_test_split(test_size=1 - train_test_split_ratio, seed=42)

# Access train and evaluation datasets
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

Train dataset size: 94095
Evaluation dataset size: 23524


In [None]:
print(model.config.max_position_embeddings)

In [11]:
# Define training arguments
batch_size = 4
training_arguments = TrainingArguments(
    output_dir="Pharmllm",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="input_text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdshahadat10[0m ([33mdshahadat10-united-international-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
9410,0.3442,0.36532


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [None]:
trainer.model.save_pretrained("pharmllam_adverse")

# py file cell

In [None]:
# from transformers import (
#     AutoModelForCausalLM,
#     AutoTokenizer,
#     BitsAndBytesConfig,
#     HfArgumentParser,
#     TrainingArguments,
#     pipeline,
#     logging,
#     Trainer
# )
# from peft import (
#     LoraConfig,
#     PeftModel,
#     prepare_model_for_kbit_training,
#     get_peft_model,
#     TaskType
# )

# import os, torch, wandb
# from datasets import load_dataset
# from sklearn.model_selection import train_test_split
# from trl import SFTTrainer


# # Set torch dtype and attention implementation
# if torch.cuda.get_device_capability()[0] >= 8:
#     !pip install -qqq flash-attn
#     torch_dtype = torch.bfloat16
#     attn_implementation = "flash_attention_2"
#     print("cuda")
# else:
#     torch_dtype = torch.float16
#     attn_implementation = "eager"
#     print("cpu")

# base_model = "/kaggle/input/llama-3.2/transformers/1b/1"

# # QLoRA config -- 4bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch_dtype,
#     bnb_4bit_use_double_quant=True,
# )

# # Load model
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",
#     attn_implementation=attn_implementation
# )

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# if tokenizer.pad_token_id is None:
#     tokenizer.pad_token_id = tokenizer.eos_token_id
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id


# import bitsandbytes as bnb

# def find_all_linear_names(model):
#     cls = bnb.nn.Linear4bit
#     lora_module_names = set()
#     for name, module in model.named_modules():
#         if isinstance(module, cls):
#             names = name.split('.')
#             lora_module_names.add(names[0] if len(names) == 1 else names[-1])
#     if 'lm_head' in lora_module_names:  # needed for 16 bit
#         lora_module_names.remove('lm_head')
#     return list(lora_module_names)

# modules = find_all_linear_names(model)

# # LoRA config
# peft_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=modules
# )

# # Apply PEFT to the model
# model = get_peft_model(model, peft_config)
# print(model)

# dataset_path = "/kaggle/input/drug-label-filtered/adverse_reaction.json"

# dataset = load_dataset('json', data_files=dataset_path)
# tokenizer.pad_token = tokenizer.eos_token

# def preprocess_data(examples):
#     inputs = [
#         f"Input: {input_text} Response: {response_text}"
#         for input_text, response_text in zip(examples["input_text"], examples["response_text"])
#     ]
#     # Tokenize
#     model_inputs = tokenizer(inputs, max_length=5120, truncation=True, padding="max_length")
#     return model_inputs

# # Apply preprocessing
# tokenized_dataset = dataset.map(preprocess_data, batched=True)

# # Split the dataset into training and evaluation sets
# train_test_split_ratio = 0.8
# split_dataset = tokenized_dataset["train"].train_test_split(test_size=1 - train_test_split_ratio, seed=42)

# # Access train and evaluation datasets
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]

# # Print dataset sizes
# print(f"Train dataset size: {len(train_dataset)}")
# print(f"Evaluation dataset size: {len(eval_dataset)}")

# # Define training arguments
# batch_size = 4
# training_arguments = TrainingArguments(
#     output_dir="Pharmllm",
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=2,
#     optim="paged_adamw_32bit",
#     num_train_epochs=1,
#     eval_strategy="steps",
#     eval_steps=0.2,
#     logging_steps=1,
#     warmup_steps=10,
#     logging_strategy="steps",
#     learning_rate=2e-4,
#     fp16=False,
#     bf16=False,
#     group_by_length=True,
#     report_to="wandb"
# )


# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     peft_config=peft_config,
#     max_seq_length= 512,
#     dataset_text_field="input_text",
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing= False,
# )

# trainer.train()

# trainer.model.save_pretrained("pharmllam_adverse_reaction")

# #wandb api key: e94acafecf7a152ebfc203f27e1d857e1036edeb