**Fine-tune Base Model**

## Installing Dependencies

In [29]:
%%capture installation_log
# The install output is captured into `installation_log` instead of being displayed.
# NOTE(review): none of these packages is version-pinned, so a re-run may pull
# different releases than the ones this notebook was executed with.
!pip install --no-build-isolation axolotl[deepspeed]
!pip install wandb
!pip install -q datasets

In [30]:
from datasets import load_dataset ,DatasetDict
import json
import pandas as pd
from pprint import pprint
import torch
assert (torch.cuda.is_available()==True)
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoConfig
from huggingface_hub import HfApi, ModelCard, ModelCardData

In [31]:
from google.colab import userdata
import os
# Copy each Colab secret into an environment variable of the same name.
# The secrets must be stored in Colab under exactly these names.
for secret_name in ("HF_TOKEN", "WANDB_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)
hf_profile = 'aymangomaa'

## Load Dataset

In [None]:
from datasets import load_dataset

# NOTE(review): despite its name, `summarization_dataset` holds `text` / `relations`
# records (drug-ADE pairs), not summaries; the name is kept because later cells use it.
summarization_dataset = load_dataset("entity_extraction")
summarization_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'relations'],
        num_rows: 3843
    })
    test: Dataset({
        features: ['text', 'relations'],
        num_rows: 428
    })
})

In [33]:
# Inspect one raw training record before it is converted to chat format.
pprint(summarization_dataset['train'][4])

{'relations': [{'ade': 'bilateral pulmonary infiltrates',
                'drug': 'methotrexate'}],
 'text': 'Three patients received respectively 190 mg, 175 mg, and 196 mg of '
         'methotrexate and developed bilateral pulmonary infiltrates without '
         'evidence of peripheral blood eosinophilia.'}


## Transform and Save Dataset

In [34]:
def transform_example(example):
    """
    Convert one ADE record into a two-turn chat sample.

    The user turn is a fixed extraction instruction followed by the record's
    ``text``. The assistant turn is the record's ``relations`` serialized with
    ``json.dumps`` (non-ASCII characters are kept as-is), so the target is
    always valid JSON.

    Args:
        example: Mapping with a ``text`` entry and a JSON-serializable
            ``relations`` entry.

    Returns:
        A dict with a single ``messages`` key holding the user turn followed
        by the assistant turn.
    """
    sentence = example['text']
    user_turn = {
        "role": "user",
        "content": f"Extract all adverse drug effect (ADE) relationships from the sentence. ### TEXT: {sentence}",
    }
    assistant_turn = {
        "role": "assistant",
        "content": json.dumps(example["relations"], ensure_ascii=False),
    }
    return {"messages": [user_turn, assistant_turn]}


In [35]:
# Apply transform_example to every record of every split; this adds a `messages`
# column next to the existing `text` and `relations` columns.
summarization_dataset_chat = summarization_dataset.map(transform_example)

In [36]:
# Display the mapped DatasetDict to check its columns and row counts.
summarization_dataset_chat

DatasetDict({
    train: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 3843
    })
    test: Dataset({
        features: ['text', 'relations', 'messages'],
        num_rows: 428
    })
})

In [37]:
# Show training record 4 in its chat form (user instruction + JSON answer).
pprint(summarization_dataset_chat['train'][4]['messages'])

[{'content': 'Extract all adverse drug effect (ADE) relationships from the '
             'sentence. ### TEXT: Three patients received respectively 190 mg, '
             '175 mg, and 196 mg of methotrexate and developed bilateral '
             'pulmonary infiltrates without evidence of peripheral blood '
             'eosinophilia.',
  'role': 'user'},
 {'content': '[{"ade": "bilateral pulmonary infiltrates", "drug": '
             '"methotrexate"}]',
  'role': 'assistant'}]


In [38]:
# Hold out 10% of the original train split as validation data (seeded so the
# split is repeatable); the original test split is reused unchanged.
split_train = summarization_dataset_chat["train"].train_test_split(test_size=0.1, seed=42)
train_data, val_data = split_train["train"], split_train["test"]
test_data = summarization_dataset_chat["test"]

In [39]:
# Bundle the three splits into a single DatasetDict (train / validation / test).
final_dataset = DatasetDict({
    "train": train_data,
    "validation": val_data,
    "test": test_data
})

In [40]:
# Publish the chat-formatted splits to the Hugging Face Hub under the user's
# namespace; the training config later points at this same repo id.
final_dataset.push_to_hub(f"{hf_profile}/entity_extraction_ade_v2_chat_base")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/aymangomaa/entity_extraction_ade_v2_chat_base/commit/b5f236901f0a64267e271f5dfe47ebf6b209469b', commit_message='Upload dataset', commit_description='', oid='b5f236901f0a64267e271f5dfe47ebf6b209469b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/aymangomaa/entity_extraction_ade_v2_chat_base', endpoint='https://huggingface.co', repo_type='dataset', repo_id='aymangomaa/entity_extraction_ade_v2_chat_base'), pr_revision=None, pr_num=None)

## Training Parameters

In [41]:
import yaml

In [42]:
# Values that are substituted into the training config template.
dataset_prepared_path = './prepared_data/'  # where the preprocessed (tokenized) data is written
output_dir = './outputs/'  # where training output is saved
data_download_path = f"{hf_profile}/entity_extraction_ade_v2_chat_base"  # Hub repo id of the dataset pushed earlier
wandb_project = 'hw3_base'  # Weights & Biases project name
base_model = 'Qwen/Qwen3-1.7B-Base'  # Hub id of the model to fine-tune

In [43]:
# Axolotl training config as a YAML string. Tokens of the form ${NAME} are
# placeholders; a later cell fills them in with str.replace before the text is
# written to training.yaml.
# The same Jinja chat template appears under both `datasets` (train split) and
# `test_datasets` (validation split); keep the two copies identical.
# NOTE(review): both `num_epochs: 1` and `max_steps: 50` are set, and the
# eval/save steps are fractional (0.1) — confirm against the Axolotl docs that
# this is the intended schedule.
yaml_template = """
seed: 42
torch_seed: 42

datasets:
  - path: ${DATASET_PATH}
    field_messages: messages
    type: chat_template
    chat_template: jinja
    chat_template_jinja: |
      {%- for message in messages -%}
        {%- if message['role'] == 'user' -%}
          {{ message['content'] }}
        {%- elif message['role'] == 'assistant' -%}
          ### LABEL: {{ message['content'] }}{{eos_token}}
        {%- endif -%}
      {%- endfor -%}
    train_on_split: train

test_datasets:
  - path: ${DATASET_PATH}
    type: chat_template
    chat_template: jinja
    field_messages: messages
    chat_template_jinja: |
      {%- for message in messages -%}
        {%- if message['role'] == 'user' -%}
          {{ message['content'] }}
        {%- elif message['role'] == 'assistant' -%}
          ### LABEL: {{ message['content'] }}{{eos_token}}
        {%- endif -%}
      {%- endfor -%}
    split: validation

train_on_inputs : false

base_model: ${BASE_MODEL}
load_in_4bit: true
adapter: qlora
lora_r: 128
lora_alpha: 256
lora_dropout: 0.01
lora_target_linear: true

learning_rate: 2e-5
lr_scheduler: cosine
num_epochs: 1
optimizer: paged_adamw_8bit
warmup_ratio: 0.2

eval_strategy: steps
eval_steps: 0.1
max_steps: 50
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 2
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false

sequence_len: 512
pad_to_sequence_len: true

micro_batch_size: 4
gradient_accumulation_steps: 4
gradient_checkpointing: true

sample_packing: false
group_by_length: false
dataset_prepared_path: ${PREPARED_PATH}
output_dir: ${OUTPUT_DIR}

bf16: true
fp16: false
tf32: false
flash_attention: false
sdp_attention: true

wandb_project: ${WANDB_PROJECT}
wandb_name: drug-ade-extraction-finetuned-base-4
"""

In [44]:
# Map every ${PLACEHOLDER} used in the template to its concrete value
variables = {
    "${DATASET_PATH}": data_download_path,
    "${PREPARED_PATH}": dataset_prepared_path,
    "${OUTPUT_DIR}": output_dir,
    "${WANDB_PROJECT}": wandb_project,
    "${BASE_MODEL}": base_model
}

# Substitute the placeholders one after another, starting from the raw template
yaml_content = yaml_template
for placeholder, value in variables.items():
    yaml_content = yaml_content.replace(placeholder, value)

# Fail fast if a placeholder was misspelled or left out of `variables`;
# otherwise the literal ${...} text would end up in the config file unnoticed.
if "${" in yaml_content:
    raise ValueError("training config still contains an unreplaced ${...} placeholder")

In [45]:
# Write the rendered config to training.yaml, the file the axolotl commands are given.
# UTF-8 is requested explicitly so the result does not depend on the locale default.
with open('training.yaml', 'w', encoding='utf-8') as config_file:
    config_file.write(yaml_content)

print("training.yaml has been created successfully!")

training.yaml has been created successfully!


In [46]:
# Mount Google Drive at /content/drive.
# NOTE(review): none of the visible cells reads from or writes to the mounted
# drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset Preprocessing

In [47]:
# Delete any previously prepared data before preprocessing runs again.
# The path is interpolated into a shell command, so keep it free of spaces and shell metacharacters.
!rm -rf {dataset_prepared_path}

In [48]:
# Run Axolotl's preprocessing step on the generated config (debug output enabled).
!axolotl preprocess training.yaml --debug

2025-05-09 17:10:26.508826: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746810626.531189   81306 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746810626.537934   81306 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2025-05-09 17:10:29,002] [INFO] [numexpr.utils._init_num_threads:162] [PID:81306] NumExpr defaulting to 8 threads.
[2025-05-09 17:10:30,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-05-09 17:10:30,198] [INFO] [root.spawn:60] [PID:81306] x86_64-linux-gnu-gcc -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O

In [49]:
from transformers import AutoTokenizer
from datasets import load_from_disk
import pandas as pd
import glob
from pprint import pprint

# Load the tokenizer of the base model; it is used to decode prepared examples
model_id = base_model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Find the prepared dataset directories
# NOTE(review): glob returns matches in arbitrary order, so which folder ends up
# at index 0 is not guaranteed to be the same on every run.
prepared_dirs = glob.glob(f'{dataset_prepared_path}/*/')
print(f"Found prepared directories: {prepared_dirs}")

Found prepared directories: ['./prepared_data/49d1cf0b50a474a7a46ef8cb7d93c67d/', './prepared_data/2e730dc49b6ccb76382fb4d0b288736b/']


In [50]:
def display_token_stream(dataset_path, example_idx=0):
    """
    Print one example of a saved dataset token by token.

    The dataset at ``dataset_path`` is loaded with ``load_from_disk`` and row
    ``example_idx`` is rendered as a continuous string of
    ``<token_text,token_id,label>`` entries, followed by the fully decoded
    text. Decoding relies on the notebook-level ``tokenizer``.

    Args:
        dataset_path: Path of the dataset directory to load.
        example_idx: Index of the example to display (default: 0)

    Returns:
        None. All results are printed; any exception is caught and reported
        as an error message rather than propagated.
    """
    try:
        ds = load_from_disk(dataset_path)

        if example_idx >= len(ds):
            print(f"Error: Example index {example_idx} out of range. Dataset has {len(ds)} examples.")
            return

        # Get the example
        row = ds[example_idx]

        # Print dataset and example info
        print(f"\nDataset: {dataset_path}")
        print(f"Example {example_idx} of {len(ds)}")
        print("-" * 80)

        # Build the continuous token stream: one "<text,id,label> " entry per
        # token, assembled with a single join instead of repeated concatenation
        result = "".join(
            f"<{tokenizer.decode([token_id])},{token_id},{label}> "
            for token_id, label in zip(row['input_ids'], row['labels'])
        )

        # The legend lists the fields in the order they are actually emitted
        print("Token Stream (<token_text,token_id,label>):")
        pprint(result)

        # Plain print for the header: pprint would show it as a quoted repr
        # with a literal "\n" instead of a blank line
        print("\nFull decoded text:")
        pprint(tokenizer.decode(row['input_ids']))

    except Exception as e:
        # Best-effort inspection helper: report the problem and keep the notebook running
        print(f"Error processing dataset: {e}")

In [51]:
# Inspect example 4 of the first prepared dataset directory.
display_token_stream(prepared_dirs[0], 4)


Dataset: ./prepared_data/49d1cf0b50a474a7a46ef8cb7d93c67d/
Example 4 of 3457
--------------------------------------------------------------------------------
Token Stream (<token_id,label,masked>):
('<Extract,28959,-100> < all,678,-100> < adverse,30859,-100> < drug,5506,-100> '
 '< effect,2456,-100> < (,320,-100> <ADE,32841,-100> <),8,-100> < '
 'relationships,11871,-100> < from,504,-100> < the,279,-100> < '
 'sentence,11652,-100> <.,13,-100> < ###,16600,-100> < TEXT,15762,-100> '
 '<:,25,-100> < PURPOSE,7515,-100> <:,25,-100> < The,576,-100> < '
 'occurrence,31559,-100> < of,315,-100> < my,847,-100> <oc,509,-100> '
 '<lon,12212,-100> <us,355,-100> < associated,5815,-100> < with,448,-100> < '
 'continuous,19259,-100> < i,600,-100> <.v,3133,-100> <.,13,-100> < '
 'infusion,70208,-100> < of,315,-100> < do,653,-100> <but,8088,-100> '
 '<amine,19991,-100> < in,304,-100> < a,264,-100> < patient,8720,-100> < '
 'with,448,-100> < end,835,-100> <-stage,50156,-100> < renal,62815,-100> < '
 'di

In [52]:
# Inspect a second example (index 18) from the same directory.
display_token_stream(prepared_dirs[0], 18)


Dataset: ./prepared_data/49d1cf0b50a474a7a46ef8cb7d93c67d/
Example 18 of 3457
--------------------------------------------------------------------------------
Token Stream (<token_id,label,masked>):
('<Extract,28959,-100> < all,678,-100> < adverse,30859,-100> < drug,5506,-100> '
 '< effect,2456,-100> < (,320,-100> <ADE,32841,-100> <),8,-100> < '
 'relationships,11871,-100> < from,504,-100> < the,279,-100> < '
 'sentence,11652,-100> <.,13,-100> < ###,16600,-100> < TEXT,15762,-100> '
 '<:,25,-100> < These,4220,-100> < findings,14613,-100> < are,525,-100> < '
 'consistent,12966,-100> < with,448,-100> < an,458,-100> < immune,22077,-100> '
 '<-com,11476,-100> <plex,9111,-100> < form,1352,-100> < of,315,-100> < '
 'gl,2770,-100> <omer,25359,-100> <ul,360,-100> <opathy,52942,-100> < '
 'in,304,-100> < which,892,-100> < gold,6623,-100> < is,374,-100> < '
 'neither,13866,-100> < the,279,-100> < antigen,81989,-100> < nor,6329,-100> < '
 'a,264,-100> < hap,45800,-100> <ten,1960,-100> < in,304,-1

## Model Training

In [53]:
# Remove the output directory left by any earlier run before training starts.
# The path is interpolated into a shell command, so keep it free of spaces and shell metacharacters.
!rm -rf {output_dir}

In [54]:
%%capture output
# Launch Axolotl training on the generated config; the console log is captured
# into `output` rather than streamed to the notebook.
!accelerate launch -m axolotl.cli.train training.yaml




In [55]:
# Uncomment to replay the complete captured training log:
# output.show()
# Print only the last 10 lines of the captured stdout (final metrics and run links).
print('\n'.join(output.stdout.split('\n')[-10:]))




                                   [A{'eval_loss': 0.38690173625946045, 'eval_runtime': 326.9748, 'eval_samples_per_second': 1.177, 'eval_steps_per_second': 0.297, 'epoch': 0.23}
100% 50/50 [1:34:10<00:00, 69.10s/it]
100% 97/97 [05:24<00:00,  2.94s/it][A
                                   [A                                     {'train_runtime': 5655.5607, 'train_samples_per_second': 0.141, 'train_steps_per_second': 0.009, 'train_loss': 0.6511673140525818, 'epoch': 0.23}
100% 50/50 [1:34:13<00:00, 69.10s/it]100% 50/50 [1:34:13<00:00, 113.07s/it]
[2025-05-09 18:45:46,784] [INFO] [axolotl.train.save_trained_model:231] [PID:81815] [RANK:0] Training completed! Saving pre-trained model to ./outputs/.[39m
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mdrug-ade-extraction-finetuned-base-4[0m at: [34mhttps://wandb.ai/aymangomaa2005-utd/hw3_base/runs/hqy9d817[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250509_171131-hqy9d817/logs[0m
[0m


In [56]:
# push model to hub
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
# Target model repo on the Hugging Face Hub
repo_name = f"{hf_profile}/drug-ade-extraction-finetuned-base-4"
# Load the trained adapter (and its tokenizer) from the training output directory
# NOTE: this rebinds the notebook-level `tokenizer` that display_token_stream uses.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Push model and tokenizer to the same Hub repo
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

adapter_model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aymangomaa/drug-ade-extraction-finetuned-base-4/commit/d2e979fbcb3002b9048a310229b5b6cd55bc0110', commit_message='Upload tokenizer', commit_description='', oid='d2e979fbcb3002b9048a310229b5b6cd55bc0110', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aymangomaa/drug-ade-extraction-finetuned-base-4', endpoint='https://huggingface.co', repo_type='model', repo_id='aymangomaa/drug-ade-extraction-finetuned-base-4'), pr_revision=None, pr_num=None)