# **Fine-Tune the Instruct Model**

## Installing Dependencies

In [1]:
%%capture installation_log
!pip install --no-build-isolation axolotl[deepspeed]
!pip install wandb
!pip install -q datasets

In [2]:
from datasets import load_dataset ,DatasetDict
import json
import pandas as pd
from pprint import pprint
import torch
# Fail fast, with an actionable message, when the runtime has no CUDA device.
assert torch.cuda.is_available(), "CUDA GPU not available - switch the Colab runtime to a GPU instance."
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoConfig
from huggingface_hub import HfApi, ModelCard, ModelCardData

[2025-05-10 16:30:10,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
import os

from google.colab import userdata

# Copy each Colab secret into the environment variable of the same name.
# The names here must match the names given to the secrets in Colab.
for _secret_name in ("HF_TOKEN", "WANDB_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Hugging Face account used to build the dataset and model repository ids below.
hf_profile = 'aymangomaa'

## Load Dataset
Dataset preparation is covered in the *Fine-tune Base Model* notebook: loading and transforming the raw data, splitting the training set into training and validation subsets (e.g., a 90/10 split), and pushing the processed dataset to the Hugging Face Hub. This notebook only loads that processed dataset from the Hub.

In [4]:
from datasets import load_dataset

# Fetch the processed chat-format dataset from the Hub.
# Ending the cell with the bare name displays the available splits.
summarization_dataset_chat = load_dataset(
    path=f"{hf_profile}/entity_extraction_ade_v2_chat_base",
)
summarization_dataset_chat

DatasetDict({
    train: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 3458
    })
    validation: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 385
    })
    test: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 428
    })
})

In [5]:
# Inspect the chat turns of one training example (index 4).
example_messages = summarization_dataset_chat['train'][4]['messages']
pprint(example_messages)

[{'content': 'Extract all adverse drug effect (ADE) relationships from the '
             'sentence. ### TEXT: PURPOSE: The occurrence of myoclonus '
             'associated with continuous i.v. infusion of dobutamine in a '
             'patient with end-stage renal disease (ESRD) is described.',
  'role': 'user'},
 {'content': '[{"ade": "myoclonus", "drug": "dobutamine"}]',
  'role': 'assistant'}]


## Training Parameters

In [6]:
import yaml

In [7]:
# Run configuration; these values are filled into the training YAML template.
base_model = "Qwen/Qwen3-1.7B"  # model id to fine-tune
data_download_path = f"{hf_profile}/entity_extraction_ade_v2_chat_base"  # dataset repo id
dataset_prepared_path = "./prepared_data/"  # directory for the prepared dataset
output_dir = "./outputs/"  # directory for training outputs
wandb_project = "hw3_instruct"  # Weights & Biases project name

In [8]:
# Axolotl training configuration, kept as a template string.
# The ${...} placeholders (DATASET_PATH, BASE_MODEL, PREPARED_PATH, OUTPUT_DIR,
# WANDB_PROJECT) are replaced with Python values in a later cell before the
# text is written to training.yaml.
# chat_template is set to tokenizer_default for the instruct model.
# NOTE(review): max_steps: 50 is set together with num_epochs: 1; the step cap
# presumably ends training before a full epoch completes - confirm this is intended.
yaml_template = """
seed: 42
torch_seed: 42

datasets:
  - path: ${DATASET_PATH}
    field_messages: messages
    type: chat_template
    chat_template: tokenizer_default
    train_on_split: train

test_datasets:
  - path: ${DATASET_PATH}
    type: chat_template
    field_messages: messages
    chat_template: tokenizer_default
    split: validation

train_on_inputs : false

base_model: ${BASE_MODEL}
load_in_4bit: true
adapter: qlora
lora_r: 128
lora_alpha: 256
lora_dropout: 0.01
lora_target_linear: true

learning_rate: 2e-5
lr_scheduler: cosine
num_epochs: 1
optimizer: paged_adamw_8bit
warmup_ratio: 0.2

eval_strategy: steps
eval_steps: 0.1
max_steps: 50
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 2
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false

sequence_len: 512
pad_to_sequence_len: true

micro_batch_size: 4
gradient_accumulation_steps: 4
gradient_checkpointing: true

sample_packing: false
group_by_length: false
dataset_prepared_path: ${PREPARED_PATH}
output_dir: ${OUTPUT_DIR}

bf16: true
fp16: false
tf32: false
flash_attention: false
sdp_attention: true

wandb_project: ${WANDB_PROJECT}
wandb_name: drug-ade-extraction-finetuned-instruct-2
"""

In [9]:
# Map each ${PLACEHOLDER} token used in yaml_template to its concrete value.
variables = {
    "${DATASET_PATH}": data_download_path,
    "${PREPARED_PATH}": dataset_prepared_path,
    "${OUTPUT_DIR}": output_dir,
    "${WANDB_PROJECT}": wandb_project,
    "${BASE_MODEL}": base_model
}

# Substitute every placeholder; str() allows non-string values (e.g. a Path) as well.
yaml_content = yaml_template
for placeholder, value in variables.items():
    yaml_content = yaml_content.replace(placeholder, str(value))

# Fail loudly if a placeholder has no entry in `variables`, instead of silently
# writing a literal "${...}" into the training configuration.
unresolved = [line.strip() for line in yaml_content.splitlines() if "${" in line]
if unresolved:
    raise ValueError(f"Unresolved placeholders in training config: {unresolved}")

In [10]:
# Persist the rendered configuration as training.yaml in the working directory.
with open('training.yaml', mode='w') as config_file:
    config_file.write(yaml_content)

print("training.yaml has been created successfully!")

training.yaml has been created successfully!


In [11]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset Preprocessing

In [12]:
import shutil

# Remove any previously prepared dataset so preprocessing starts from scratch.
# Done in Python rather than by interpolating the path into `rm -rf`, so the
# value is never re-parsed by a shell. ignore_errors=True keeps the `-f`
# behaviour: a missing directory is not an error.
shutil.rmtree(dataset_prepared_path, ignore_errors=True)

In [13]:
# Run axolotl's preprocess step on training.yaml; later cells read the prepared
# datasets back from dataset_prepared_path.
# NOTE(review): --debug presumably enables extra diagnostic output - confirm against the axolotl CLI help.
!axolotl preprocess training.yaml --debug

2025-05-10 16:30:25.164904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746894625.185013   35343 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746894625.190984   35343 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2025-05-10 16:30:27,416] [INFO] [numexpr.utils._init_num_threads:162] [PID:35343] NumExpr defaulting to 8 threads.
[2025-05-10 16:30:28,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-05-10 16:30:28,477] [INFO] [root.spawn:60] [PID:35343] x86_64-linux-gnu-gcc -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O

In [14]:
from transformers import AutoTokenizer
from datasets import load_from_disk
import pandas as pd
import glob
from pprint import pprint

# Load the tokenizer of the base model so prepared token ids can be decoded.
model_id = base_model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Find the prepared dataset directories.
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# that indexing into prepared_dirs picks the same directory on every run.
prepared_dirs = sorted(glob.glob(f'{dataset_prepared_path}/*/'))
print(f"Found prepared directories: {prepared_dirs}")

Found prepared directories: ['./prepared_data/ebb6b7f596a6187b95cefc7ced0f70fb/', './prepared_data/fad58ed398639171a2949561287be4d0/']


In [15]:
def display_token_stream(dataset_path, example_idx=0):
    """
    Print one prepared example as a continuous stream of annotated tokens.

    Every token is rendered as ``<token_text,token_id,label>``; the fully
    decoded text of the example is printed afterwards. Decoding uses the
    module-level ``tokenizer``.

    NOTE(review): a label of -100 presumably marks tokens that are excluded
    from the loss (the config sets ``train_on_inputs: false``) - confirm.

    Args:
        dataset_path: Directory of a prepared dataset, loaded with
            ``load_from_disk``.
        example_idx: Index of the example to display (default: 0).

    Returns:
        None. This is a best-effort inspection helper: an out-of-range index
        or any failure while loading/decoding is reported with a printed
        message instead of raising.
    """
    try:
        ds = load_from_disk(dataset_path)

        if example_idx >= len(ds):
            print(f"Error: Example index {example_idx} out of range. Dataset has {len(ds)} examples.")
            return

        # Get the example
        row = ds[example_idx]

        # Print dataset and example info
        print(f"\nDataset: {dataset_path}")
        print(f"Example {example_idx} of {len(ds)}")
        print("-" * 80)

        # Build the continuous token stream in one pass (join instead of repeated +=).
        result = "".join(
            f"<{tokenizer.decode([token_id])},{token_id},{label}> "
            for token_id, label in zip(row['input_ids'], row['labels'])
        )

        # The legend must describe the fields actually emitted above.
        print("Token Stream (<token_text,token_id,label>):")
        # pprint wraps the long stream into readable chunks.
        pprint(result)

        # Plain print for the header: pprint would show a quoted repr with an escaped "\n".
        print("\nFull decoded text:")
        pprint(tokenizer.decode(row['input_ids']))

    except Exception as e:
        # Deliberately broad: keep the notebook running and surface the problem as text.
        print(f"Error processing dataset: {e}")

In [16]:
# Token/label stream for example 4 of the first prepared dataset directory.
display_token_stream(dataset_path=prepared_dirs[0], example_idx=4)


Dataset: ./prepared_data/ebb6b7f596a6187b95cefc7ced0f70fb/
Example 4 of 385
--------------------------------------------------------------------------------
Token Stream (<token_id,label,masked>):
('<<|im_start|>,151644,-100> <user,872,-100> <\n'
 ',198,-100> <Extract,28959,-100> < all,678,-100> < adverse,30859,-100> < '
 'drug,5506,-100> < effect,2456,-100> < (,320,-100> <ADE,32841,-100> '
 '<),8,-100> < relationships,11871,-100> < from,504,-100> < the,279,-100> < '
 'sentence,11652,-100> <.,13,-100> < ###,16600,-100> < TEXT,15762,-100> '
 '<:,25,-100> < We,1205,-100> < report,1895,-100> < a,264,-100> < '
 'case,1142,-100> < of,315,-100> < vit,13157,-100> <il,321,-100> '
 '<igo,7836,-100> < that,429,-100> < occurred,10017,-100> < during,2337,-100> '
 '< the,279,-100> < second,2086,-100> < month,2254,-100> < of,315,-100> < '
 'interfer,40205,-100> <on,263,-100> < alpha,8287,-100> < ,220,-100> '
 '<2,17,-100> <a,64,-100> < therapy,15069,-100> < for,369,-100> < '
 'chronic,20601,-100> <

In [17]:
# Token/label stream for example 18 of the first prepared dataset directory.
display_token_stream(dataset_path=prepared_dirs[0], example_idx=18)


Dataset: ./prepared_data/ebb6b7f596a6187b95cefc7ced0f70fb/
Example 18 of 385
--------------------------------------------------------------------------------
Token Stream (<token_id,label,masked>):
('<<|im_start|>,151644,-100> <user,872,-100> <\n'
 ',198,-100> <Extract,28959,-100> < all,678,-100> < adverse,30859,-100> < '
 'drug,5506,-100> < effect,2456,-100> < (,320,-100> <ADE,32841,-100> '
 '<),8,-100> < relationships,11871,-100> < from,504,-100> < the,279,-100> < '
 'sentence,11652,-100> <.,13,-100> < ###,16600,-100> < TEXT,15762,-100> '
 '<:,25,-100> < S,328,-100> <ux,2200,-100> <am,309,-100> <eth,769,-100> '
 '<onium,89244,-100> < ap,1443,-100> <no,2152,-100> <ea,12508,-100> < '
 'terminated,31272,-100> < with,448,-100> < commercial,8353,-100> < '
 'serum,40429,-100> <ch,331,-100> <olin,36637,-100> <ester,5191,-100> '
 '<ase,519,-100> <.,13,-100> <<|im_end|>,151645,-100> <\n'
 ',198,-100> <<|im_start|>,151644,-100> <assistant,77091,-100> <\n'
 ',198,-100> <<think>,151667,-100> <\

## Model Training

In [18]:
import shutil

# Remove outputs of any previous run so training writes into an empty directory.
# Done in Python rather than by interpolating the path into `rm -rf`, so the
# value is never re-parsed by a shell. ignore_errors=True keeps the `-f`
# behaviour: a missing directory is not an error.
shutil.rmtree(output_dir, ignore_errors=True)

In [19]:
%%capture output
# Launch axolotl training with the settings in training.yaml.
# The cell's output is captured into `output` so the long training log does not
# flood the notebook; a later cell prints its last lines.
!accelerate launch -m axolotl.cli.train training.yaml




In [20]:
# Show only the last 10 lines of the captured training log.
# Use output.show() instead to replay the complete captured output.
log_lines = output.stdout.split('\n')
print('\n'.join(log_lines[-10:]))




                                   [A{'eval_loss': 0.08576303720474243, 'eval_runtime': 333.4512, 'eval_samples_per_second': 1.155, 'eval_steps_per_second': 0.291, 'epoch': 0.23}
100% 50/50 [1:36:07<00:00, 70.63s/it]
100% 97/97 [05:31<00:00,  2.99s/it][A
                                   [A                                     {'train_runtime': 5772.5565, 'train_samples_per_second': 0.139, 'train_steps_per_second': 0.009, 'train_loss': 0.3898436599969864, 'epoch': 0.23}
100% 50/50 [1:36:10<00:00, 70.63s/it]100% 50/50 [1:36:10<00:00, 115.41s/it]
[2025-05-10 18:07:39,187] [INFO] [axolotl.train.save_trained_model:231] [PID:35843] [RANK:0] Training completed! Saving pre-trained model to ./outputs/.[39m
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mdrug-ade-extraction-finetuned-instruct-2[0m at: [34mhttps://wandb.ai/aymangomaa2005-utd/hw3_instruct/runs/fy52tlc8[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250510_163127-fy52tlc8/logs[0m
[0m


In [21]:
# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub.
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
# Target repository id under the configured Hugging Face profile.
repo_name = f"{hf_profile}/drug-ade-extraction-finetuned-instruct-2"
# Load the trained model (with its adapter) and the tokenizer from the training output directory.
# NOTE(review): this rebinds the module-level `tokenizer` that display_token_stream reads.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Push the model first, then the tokenizer; the value of the final call is the cell's displayed output.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aymangomaa/drug-ade-extraction-finetuned-instruct-2/commit/6bb6bc0a7966b36ce1d96773d0abaa8e6ace5686', commit_message='Upload tokenizer', commit_description='', oid='6bb6bc0a7966b36ce1d96773d0abaa8e6ace5686', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aymangomaa/drug-ade-extraction-finetuned-instruct-2', endpoint='https://huggingface.co', repo_type='model', repo_id='aymangomaa/drug-ade-extraction-finetuned-instruct-2'), pr_revision=None, pr_num=None)