In [1]:
%%capture
# Install (or upgrade to the newest release of) the fine-tuning stack into the
# kernel's environment; the %%capture cell magic above hides pip's output.
# NOTE(review): versions are unpinned, so a fresh run may pull releases whose
# APIs differ from the ones this notebook was written against — consider pinning.
%pip install -U accelerate peft bitsandbytes transformers trl

In [2]:
import os
import subprocess
import sys

import torch
import wandb
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer
)
from trl import SFTTrainer

2025-01-22 04:39:09.505495: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-22 04:39:09.505656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-22 04:39:09.764834: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Filesystem path of the base checkpoint that is loaded and fine-tuned below.
# NOTE(review): this is an absolute path under /kaggle/input, so it presumably
# only resolves inside a Kaggle session with that model attached — confirm
# before running elsewhere.
base_model = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"

In [4]:
# Pick the compute dtype and attention backend that the available hardware supports.
# A CUDA compute capability of 8 or higher selects bfloat16 + FlashAttention-2;
# anything else (an older GPU, or no CUDA device at all) selects float16 + eager
# attention. get_device_capability() raises when no CUDA device is present, so
# availability is checked first.
supports_flash_attn = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)

if supports_flash_attn:
    torch_dtype = torch.bfloat16
    # Install with the kernel's own interpreter and inspect the exit code: if the
    # flash-attn install fails, fall back to eager attention now instead of
    # failing later when the model is loaded.
    flash_attn_install = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-qqq", "flash-attn"],
        check=False,
    )
    attn_implementation = (
        "flash_attention_2" if flash_attn_install.returncode == 0 else "eager"
    )
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

# Report the actual selection. The fallback branch is not necessarily "CPU": it
# also covers CUDA GPUs whose compute capability is below 8.
print(f"torch_dtype={torch_dtype}, attn_implementation={attn_implementation}")

cpu


In [5]:
# 4-bit quantization settings used for QLoRA: NF4 quantization type, the compute
# dtype chosen earlier (`torch_dtype`), and double quantization switched on.
_bnb_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch_dtype,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**_bnb_kwargs)

# # Load model
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",
#     attn_implementation=attn_implementation
# )

In [6]:
# Load the tokenizer stored with the base checkpoint and reuse its EOS token as
# the padding token, so that padded batches can be built later.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

# Load the base causal LM with the 4-bit quantization config, automatic device
# placement, and the attention backend selected earlier.
_model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **_model_load_kwargs)


In [7]:
# Show the loaded model's module tree; as the last expression of the cell, the
# notebook renders its repr as the cell output.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), 

In [8]:
# tokenizer = AutoTokenizer.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",
#     attn_implementation=attn_implementation
# )

# Before fine-tuning with the drug labels

In [9]:
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch_dtype,
#     device_map="auto",
# )

In [10]:
# input_data = {
#   "brand_name": "PRISMASOL BGK2/0",
#   "generic_name": "MAGNESIUM CHLORIDE, DEXTROSE MONOHYDRATE, LACTIC ACID, SODIUM CHLORIDE, SODIUM BICARBONATE AND POTASSIUM CHLORIDE",
#   "query": "Does the drug PRISMASOL BGK2/0 have any adverse reactions?"
# }

# messages = [
#     {"role": "user", "content": f"Brand Name: {input_data['brand_name']}\nGeneric Name: {input_data['generic_name']}\n\nQuery: {input_data['query']}"}
# ]
# messages = [{"role": "system", "content": "You are a helpful medical assistant."}] + messages

# prompt = tokenizer.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )

# outputs = pipe(prompt, max_new_tokens=1000, do_sample=True)

# print(outputs[0]["generated_text"])

In [11]:
# '''
#     Expected Adverse Reaction:
# '''
# expected = {
#     "adverse_reactions": [
#         "6 ADVERSE REACTIONS The following adverse reactions have been identified during postapproval use with these or other similar products and therefore may occur with use of PHOXILLUM or PRISMASOL. Because these reactions are reported voluntarily from a population of uncertain size, it is not always possible to reliably estimate their frequency or establish a causal relationship to drug exposure. \u2022 Metabolic acidosis \u2022 Hypotension \u2022 Acid-base disorders \u2022 Electrolyte imbalance including calcium ionized increased (reported in PRISMASOL solutions containing calcium), hyperphosphatemia, and hypophosphatemia \u2022 Fluid imbalance"
#       ],
# }

# Adverse Reaction

In [12]:
from peft import LoraConfig, get_peft_model, PeftType, TaskType


# LoRA adapter settings: rank 16, alpha 32, dropout 0.1, applied to the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down) of the decoder
# layers; bias terms are not trained.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # Adapt to your model's architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Only wrap the model once: re-running this cell on an already wrapped model
# would apply PEFT to it a second time.
if not isinstance(model, PeftModel):
    # The base model is loaded in 4-bit, so run PEFT's k-bit training preparation
    # before attaching the adapters (see the PEFT documentation for the exact
    # steps it performs, including its gradient-checkpointing default).
    model = prepare_model_for_kbit_training(model)

    # Apply PEFT to the model
    model = get_peft_model(model, peft_config)

In [13]:
# Read the adverse-reactions JSON file with the generic "json" loader; the rows
# are accessed below through the "train" split of the returned object.
dataset_path = "/kaggle/input/adr-dataset-v1/adverse_reactions_dataset.json"
dataset = load_dataset(path="json", data_files=dataset_path)

# Padding to a fixed length requires a pad token; reuse EOS for it.
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(examples):
    """Format a batch of input/response pairs and tokenize them to a fixed length.

    Args:
        examples: Batch mapping with parallel ``"input_text"`` and
            ``"response_text"`` sequences.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the formatted
        strings, called with truncation and padding to 1024 tokens.
    """
    prompts = []
    for question, answer in zip(examples["input_text"], examples["response_text"]):
        # Each training text is the input and its response joined in one string.
        prompts.append(f"Input: {question} Response: {answer}")

    return tokenizer(prompts, max_length=1024, truncation=True, padding="max_length")

# Run the tokenization over the dataset in batches, so preprocess_data receives
# lists of texts rather than single rows.
tokenized_dataset = dataset.map(function=preprocess_data, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/27901 [00:00<?, ? examples/s]

In [14]:
# Print the maximum number of positions declared in the model config, as a check
# that the 1024-token training sequences fit within it.
print(model.config.max_position_embeddings)

131072


In [15]:
# Training hyperparameters. The per-device batch of 2 combined with 4 gradient
# accumulation steps gives 8 sequences per optimizer step on each device.
batch_size = 2
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Optimization
    optim="paged_adamw_32bit",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
)

In [16]:
from sklearn.model_selection import train_test_split  # NOTE(review): not used in this cell

# Split the tokenized rows 80/20 into train and eval sets. The rows are shuffled
# with a fixed seed, then the first 80% become the training set and the
# remainder the evaluation set.
train_test_split_ratio = 0.8
_shuffled_rows = tokenized_dataset["train"].shuffle(seed=42)
_n_rows = len(_shuffled_rows)
_n_train = int(_n_rows * train_test_split_ratio)

train_dataset = _shuffled_rows.select(range(_n_train))
eval_dataset = _shuffled_rows.select(range(_n_train, _n_rows))

In [17]:
# Authenticate with Weights & Biases without embedding a credential in the notebook.
# The key is read from the WANDB_API_KEY environment variable; when it is unset,
# key=None leaves the credential lookup to wandb itself.
# SECURITY: any API key that was ever hard-coded in this notebook must be treated
# as leaked and revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Build the supervised fine-tuning trainer over the PEFT-wrapped model and the
# pre-tokenized train/eval splits. The tokenizer is passed as `processing_class`,
# the replacement for the deprecated `tokenizer` keyword.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

trainer.train()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin




[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  trainer = SFTTrainer(




[34m[1mwandb[0m: Currently logged in as: [33msrahman212074[0m ([33msrahman212074-united-international-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: wandb version 0.19.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Tracking run with wandb version 0.16.6


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250122_044050-4lu3chm9[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33m./results[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/srahman212074-united-international-university/huggingface[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/srahman212074-united-international-university/huggingface/runs/4lu3chm9[0m


Epoch,Training Loss,Validation Loss
1,0.803,0.765176
2,0.6496,0.63857


In [None]:
# Write the trainer's model to the local "llamaDrugLabel++" directory.
# NOTE(review): the model was wrapped with get_peft_model earlier, so this
# presumably saves only the LoRA adapter weights, and the tokenizer is not saved
# here — confirm that is what downstream loading expects.
trainer.model.save_pretrained("llamaDrugLabel++")