# Fine-tuning Phi-4 for ASR in Wolof language

GPU = A100 PCIe

**Acknowledgement:**  
This notebook is based on and modified from the original work: [Deep-unlearning/notebooks](https://github.com/Deep-unlearning/notebooks/blob/main/finetune_phi4mm.ipynb).  
Thanks to the original author for their contribution.  

Let’s get started by installing the necessary libraries.  


In [1]:
hf_cache_dir = "/workspace/hf_cache"
WORKDIR = "/workspace/Phi4_unf/"

In [2]:
!pip install librosa soundfile

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting scipy>=1.6.0 (from librosa)
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.1.0 (from librosa)
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting joblib>=1.0 (from librosa)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pooch>=1.1 (from libros

In [3]:
!wget https://github.com/Syllo/nvtop/releases/download/3.0.2/nvtop-x86_64.AppImage
!chmod +x nvtop-x86_64.AppImage
#!apt install -y htop nano ffmpeg

!pip install --upgrade pip wheel setuptools
!pip install huggingface_hub
!pip install scipy
!pip install peft
!pip install backoff
!pip install accelerate
!pip install sacrebleu
!pip install torchvision
!pip install hf_transfer
!pip install transformers==4.48.2
!pip install librosa
!pip install soundfile
!pip install datasets
!pip install evaluate
!pip install jiwer

--2025-04-17 11:00:17--  https://github.com/Syllo/nvtop/releases/download/3.0.2/nvtop-x86_64.AppImage
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/94530674/a5c730e4-62f8-4ecd-b7c8-814686852756?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250417%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250417T110017Z&X-Amz-Expires=300&X-Amz-Signature=bbf5cbd69af2e0cbe2a4e59dd956b06d72b51bb5033451210cb081c3afd6a0cc&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dnvtop-x86_64.AppImage&response-content-type=application%2Foctet-stream [following]
--2025-04-17 11:00:17--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/94530674/a5c730e4-62f8-4ecd-b7c8-814686852756?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-C

In [4]:
# !sudo apt install -y cmake ninja-build

from pkg_resources import get_distribution, DistributionNotFound

package_name = 'flash_attn'

try:
  dist = get_distribution(package_name)
  print(f"'{package_name}' version {dist.version} is already installed.")
except DistributionNotFound:
  !MAX_JOBS=4 pip install --no-build-isolation flash-attn==2.7.3

  from pkg_resources import get_distribution, DistributionNotFound


Collecting flash-attn==2.7.3
  Downloading flash_attn-2.7.3.tar.gz (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from flash-attn==2.7.3)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l-^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
!git config --global credential.helper store
from huggingface_hub import login

login(token='HF_TOKEN')

In [None]:
# Core Imports
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    BatchFeature,
    StoppingCriteria,
    StoppingCriteriaList,
)

from torch.optim import AdamW

from peft import (
    LoraConfig,
    get_peft_model
)

from accelerate import Accelerator
from huggingface_hub import snapshot_download
from pathlib import Path
import torch
import shutil
import os
import json

import jiwer
from accelerate.utils import gather_object
from datasets import load_dataset, concatenate_datasets, load_from_disk
from torch.utils.data import Dataset
from tqdm import tqdm
import numpy as np
from evaluate import load
wer_metric = load("wer")


INSTRUCTION = "Transcribe the Wolof audio clip."
ANSWER_SUFFIX = "<|end|><|endoftext|>"
_IGNORE_INDEX = -100


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [6]:
#!pip install git+https://github.com/ysdede/trnorm.git
#from trnorm.legacy_normalizer import normalize_text as my_custom_normalizer

def normalize_text(text):
    """
    Placeholder for text normalization. You can use whisper text normalizer/jiwer or similar tools.
    """
    # return my_custom_normalizer(text)
    return text


In [7]:
class WolBanking77Dataset(Dataset):
    def __init__(self, processor, dataset, training=True):
        """
        processor: the AutoProcessor instance
        dataset: a Hugging Face Dataset (already split into train/validation)
        training: whether this dataset is for training (affects concatenation of target tokens)
        """
        self.data = dataset
        self.training = training
        self.processor = processor
        self.instruction = INSTRUCTION

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        # The dataset contains an "audio" dict and a "text" field for transcription.
        user_message = {
            'role': 'user',
            'content': '<|audio_1|>\n' + self.instruction,
        }
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=prompt,
            audios=[(data["audio"]["array"], data["audio"]["sampling_rate"])],
            return_tensors='pt'
        )
        
        answer = f"{data['text']}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        if self.training:
            # Concatenate prompt and answer, but mask all tokens except the answer.
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1]:] = answer_ids
        else:
            input_ids = inputs.input_ids
            labels = answer_ids

        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
        }

In [8]:
def pad_sequence(sequences, padding_side='right', padding_value=0):
    assert padding_side in ['right', 'left']
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max(len(seq) for seq in sequences)
    batch_size = len(sequences)
    output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
    for i, seq in enumerate(sequences):
        length = seq.size(0)
        if padding_side == 'right':
            output.data[i, :length] = seq
        else:
            output.data[i, -length:] = seq
    return output

def cat_with_pad(tensors, dim, padding_value=0):
    ndim = tensors[0].dim()
    assert all(t.dim() == ndim for t in tensors[1:]), 'All tensors must have the same number of dimensions'
    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)
    index = 0
    for t in tensors:
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        slices[dim] = slice(index, index + t.shape[dim])
        output[slices] = t
        index += t.shape[dim]
    return output


In [9]:
def esb_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )
    try:
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
            if len(audio_attention_mask_list) > 1 else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)
    return BatchFeature({
        'input_ids': input_ids,
        'labels': labels,
        'attention_mask': attention_mask,
        'input_audio_embeds': input_audio_embeds,
        'audio_embed_sizes': audio_embed_sizes,
        'audio_attention_mask': audio_attention_mask,
        'input_mode': 2,  # speech mode
    })


In [10]:
class MultipleTokenBatchStoppingCriteria(StoppingCriteria):
    def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None:
        self.stop_tokens = stop_tokens
        self.max_stop_tokens = stop_tokens.shape[-1]
        self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens)
        equal_generated_inputs = torch.all(generated_inputs, dim=2)
        sequence_idx = torch.any(equal_generated_inputs, dim=1)
        sequence_set_mask = self.stop_tokens_idx == 0
        self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1]
        return torch.all(self.stop_tokens_idx)


In [11]:
@torch.no_grad()
def evaluate(model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1):
    rank = int(os.environ.get('RANK', 0))
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    model.eval()
    all_generated_texts = []
    all_labels = []

    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_batch_size,
        collate_fn=esb_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=8,  # 2 8
        prefetch_factor=32,  # 128
        pin_memory=True,
        persistent_workers=True  # Keep workers alive between batches
    )
    stop_tokens = ["<|end|>", processor.tokenizer.eos_token]
    stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"]
    stop_tokens_ids = stop_tokens_ids.to(f'cuda:{local_rank}')

    # with torch.cuda.amp.autocast(enabled=True):
    for inputs in tqdm(eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'):
        stopping_criteria = StoppingCriteriaList([MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=inputs.input_ids.size(0))])
        inputs = inputs.to(f'cuda:{local_rank}')
        generated_ids = model.generate(
            **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=64,
            stopping_criteria=stopping_criteria,
        )

        stop_tokens_idx = stopping_criteria[0].stop_tokens_idx.reshape(inputs.input_ids.size(0), -1)[:, 0]
        stop_tokens_idx = torch.where(
            stop_tokens_idx > 0,
            stop_tokens_idx - stop_tokens_ids.shape[-1],
            generated_ids.shape[-1],
        )
        generated_text = [
            processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
            for _pred_ids, _stop_tokens_idx in zip(generated_ids, stop_tokens_idx)
        ]

        all_generated_texts.extend(generated_text)
        labels = [processor.decode(_label_ids[_label_ids != 0]).rstrip(ANSWER_SUFFIX) for _label_ids in inputs["labels"]]  # ⚠ See annd apply: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/33
        all_labels.extend(labels)

    all_generated_texts = gather_object(all_generated_texts)
    all_labels = gather_object(all_labels)

    if rank == 0:
        norm_all_labels = normalize_text(all_labels)
        norm_all_generated_texts = normalize_text(all_generated_texts)
        # wer = jiwer.wer(norm_all_labels, norm_all_generated_texts)
        wer = wer_metric.compute(references=norm_all_labels, predictions=norm_all_generated_texts)
        print("WER:", wer)
        if save_path:
            with open(save_path, 'w') as f:
                save_dict = {
                    'all_generated_texts': all_generated_texts,
                    'all_labels': all_labels,
                    'wer': wer,
                }
                json.dump(save_dict, f)
        return wer
    return None


In [None]:
dataset_dir = "dataset/audio"

ds = load_dataset("parquet", 
                      data_files={'train': os.path.join(dataset_dir, 'train.parquet'), 
                                  'test': os.path.join(dataset_dir, 'test.parquet')})

In [None]:
BATCH_SIZE_PER_GPU = 16 # 8
EVAL_BATCH_SIZE_PER_GPU = 24

# Load and split the dataset.
train_ds = ds['train']
val_ds = ds['test']

num_processes = 8
print(f"Training dataset size: {len(train_ds)}")
print(f"Val dataset size: {len(val_ds)}")

Training dataset size: 2563
Val dataset size: 641


In [14]:
!pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 M

In [15]:
# Configuration variables
MODEL_NAME = 'microsoft/Phi-4-multimodal-instruct'
OUTPUT_DIR = '/workspace/Phi4_mm_asr_wolbanking77_unf'
NEW_MODEL_ID = "karim155/Phi-4-mm-inst-asr-wolbanking77-unf"
USE_FLASH_ATTENTION = True
# BATCH_SIZE_PER_GPU = 8  See dataset loader cell for these parameters.
# EVAL_BATCH_SIZE_PER_GPU = 16
NUM_TRAIN_EPOCHS = 5
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.005

# Initialize Accelerator
accelerator = Accelerator()
num_gpus = accelerator.num_processes
print(f"Training on {num_gpus} GPUs")

def print_model_structure(model, max_depth=3):
    """Prints model structure up to specified depth"""
    print("\n=== MODEL ARCHITECTURE ===")
    for name, module in model.named_modules():
        depth = name.count('.')
        if depth < max_depth:
            print(f"{'  ' * depth}{name} ({type(module).__name__})")

def create_model(model_name, use_flash_attention):
    """Initialize model with audio enabled"""
    config = AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=True,
        audio_enabled=True
    )
    if use_flash_attention:
        config._attn_implementation = "flash_attention_2"
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    ).to(accelerator.device)

# --------------------------------------------------
# Model Initialization and Unfreezing
# --------------------------------------------------
with accelerator.local_main_process_first():
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = create_model(MODEL_NAME, USE_FLASH_ATTENTION)

    def unfreeze_speech_components(model):
      """Directly target verified components from your debug logs"""
      # 1. Audio Embed Module (confirmed exists)
      audio_embed = model.model.embed_tokens_extend.audio_embed

      # 2. Entire Audio Encoder (simplified)
      audio_encoder = audio_embed.encoder  # Direct access

      # 3. Audio Projection (from debug logs)
      audio_projection = audio_embed.audio_projection

      # Unfreeze ONLY these 3 components
      for component in [audio_embed, audio_encoder, audio_projection]:
          for param in component.parameters():
              param.requires_grad = True
      return model

    model = unfreeze_speech_components(model)

    # Verify unfrozen parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable_params:,}")
    print("Unfrozen components:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"- {name}")

    # After unfreezing
    encoder_params = list(model.model.embed_tokens_extend.audio_embed.encoder.parameters())
    proj_params = list(model.model.embed_tokens_extend.audio_embed.audio_projection.parameters())

    assert any(p.requires_grad for p in encoder_params), "Encoder params frozen!"
    assert any(p.requires_grad for p in proj_params), "Projection params frozen!"
    print("Components properly unfrozen ✅")

# Create dataset objects.
train_dataset = WolBanking77Dataset(processor, train_ds, training=True)
val_dataset = WolBanking77Dataset(processor, val_ds, training=False)

# --------------------------------------------------
# Optimizer Configuration with Correct Gradient Handling
# --------------------------------------------------
gradient_accumulation_steps = max(1, BATCH_SIZE_PER_GPU // (BATCH_SIZE_PER_GPU // num_gpus))
print(f"Gradient accumulation steps: {gradient_accumulation_steps}")


# Set mixed precision flags.
fp16 = not USE_FLASH_ATTENTION
bf16 = USE_FLASH_ATTENTION

# --------------------------------------------------
# Training Preparation with DDP Fixes
# --------------------------------------------------
training_args = TrainingArguments(
    gradient_accumulation_steps=gradient_accumulation_steps,
    ddp_find_unused_parameters=True,  # for unused SigLIP layers
    overwrite_output_dir=True,
    save_steps=10000,
    # num_train_epochs=NUM_TRAIN_EPOCHS,
    max_steps=1000,
    per_device_train_batch_size=BATCH_SIZE_PER_GPU,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    optim='adamw_torch',
    adam_beta1=0.9,
    adam_beta2=0.99,
    adam_epsilon=1e-7,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    logging_steps=100,
    output_dir=OUTPUT_DIR,
    save_strategy='epoch',
    save_total_limit=2,
    save_only_model=True,
    bf16=bf16,
    fp16=fp16,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    push_to_hub=False,
    # hub_private_repo=True,
    report_to="tensorboard",
    hub_model_id=NEW_MODEL_ID
)
#--------------------------------------------------

print("Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# --------------------------------------------------
# Training
# --------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=esb_collate_fn,
)

trainer.train()

# Save full model with processor and configs
trainer.save_model(OUTPUT_DIR)
# processor.save_pretrained(OUTPUT_DIR)


Training on 1 GPUs


processor_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

processing_phi4mm.py:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- processing_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.63k [00:00<?, ?B/s]

configuration_phi4mm.py:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- configuration_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi4mm.py:   0%|          | 0.00/116k [00:00<?, ?B/s]

vision_siglip_navit.py:   0%|          | 0.00/78.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- vision_siglip_navit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


speech_conformer_encoder.py:   0%|          | 0.00/111k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- speech_conformer_encoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- modeling_phi4mm.py
- vision_siglip_navit.py
- speech_conformer_encoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/240k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Trainable parameters: 5,574,460,224
Unfrozen components:
- model.embed_tokens.weight
- model.embed_tokens_extend.image_embed.glb_GN
- model.embed_tokens_extend.image_embed.sub_GN
- model.embed_tokens_extend.image_embed.img_processor.embeddings.patch_embedding.weight
- model.embed_tokens_extend.image_embed.img_processor.embeddings.patch_embedding.bias
- model.embed_tokens_extend.image_embed.img_processor.embeddings.position_embedding.weight
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.k_proj.weight
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.k_proj.bias
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.v_proj.weight
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.v_proj.bias
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.q_proj.weight
- model.embed_tokens_extend.image_embed.img_processor.encoder.layers.0.self_attn.q_pr

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
100,1.5727
200,0.0408
300,0.015
400,0.0103
500,0.0061
600,0.0031
700,0.0033
800,0.0027
900,0.002
1000,0.0021


In [36]:
!pip freeze | grep transformers

transformers==4.48.2


In [16]:
OUTPUT_DIR = '/workspace/Phi4_mm_asr_wolbanking77_unf'
EVAL_BATCH_SIZE_PER_GPU = 24
# Free up memory before re-loading the model.
# del model, trainer
torch.cuda.empty_cache()

# Reload the fine-tuned model.
model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    trust_remote_code=True,
    # torch_dtype='auto',
    torch_dtype=torch.bfloat16,
    _attn_implementation='flash_attention_2',
).cuda()
model = torch.compile(model)
model.eval()  # Ensure evaluation mode.

# Evaluate the model after fine-tuning.
print("Evaluating after fine-tuning...")
wer_after = evaluate(
    model,
    processor,
    val_dataset,
    save_path=Path(training_args.output_dir) / 'eval_after.json',
    eval_batch_size=EVAL_BATCH_SIZE_PER_GPU,
)
print(f"WER after fine-tuning: {wer_after}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating after fine-tuning...


running eval:   0%|          | 0/27 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been use

WER: 0.03117962930885155
WER after fine-tuning: 0.03117962930885155


In [18]:
OUTPUT_DIR = '/workspace/Phi4_mm_asr_wolbanking77_unf'
MODEL_NAME = 'microsoft/Phi-4-multimodal-instruct'
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
val_dataset = WolBanking77Dataset(processor, val_ds, training=False)
# Evaluate the model after fine-tuning.
print("Evaluating after fine-tuning...")
wer_after = evaluate(
    model,
    processor,
    val_dataset,
    save_path=Path(OUTPUT_DIR) / 'eval_after.json',
    eval_batch_size=EVAL_BATCH_SIZE_PER_GPU,
)
print(f"WER after fine-tuning: {wer_after}")

Evaluating after fine-tuning...


running eval:   0%|          | 0/27 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been use

WER: 0.03117962930885155
WER after fine-tuning: 0.03117962930885155
