# Linear layers


Pour entraîner efficacement notre modèle à l'aide de la méthode LoRA, il est essentiel d'identifier les couches linéaires du modèle qui présentent la plus grande efficacité d'adaptation.

In [2]:
import torch
import re
from transformers import AutoModelForSpeechSeq2Seq, WhisperTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-medium.en"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype, 
    device_map="auto",
    low_cpu_mem_usage=True, 
    use_safetensors=True
)
model.to(device)

tokenizer = WhisperTokenizer.from_pretrained(model_id)

model_modules = str(model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

names = []
for name in linear_layer_names:
    names.append(name)
target_modules = list(set(names))
target_modules = ["q_proj", "v_proj"]

`torch_dtype` is deprecated! Use `dtype` instead!


In [4]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [5]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
lora_config = LoraConfig(
    r=16,                         # Rang LoRA Plus grand = plus de paramètres entraînés, meilleure performance, Temps d'éxecution plus élévés
    lora_alpha=8,                
    target_modules=target_modules, # Les couches que nous avons identifiées précédemment
    lora_dropout=0.2,            
    bias="none",                  
    task_type="SEQ_2_SEQ_LM",
)

In [6]:
model = prepare_model_for_kbit_training(model)
model.enable_input_require_grads()
peft_model = get_peft_model(model, lora_config)

# Afficher la différence : Seuls les paramètres LoRA sont entraînés
peft_model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 768,575,488 || trainable%: 0.6139


In [7]:
from datasets import load_dataset, Audio
from transformers import AutoProcessor

datasets = load_dataset("json", data_files={"train": "../dataset/processed/json_format/train.json", "validation": "../dataset/processed/json_format/val.json"})


datasets = datasets.cast_column(
    "audio", 
    Audio(sampling_rate=16000)
)

processor = AutoProcessor.from_pretrained(model_id)

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 195
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 22
    })
})


In [8]:
def prepare_dataset(batch, processor):
    audio = batch["audio"]
    
    # feature_extractor() convertit l'array audio en spectrograms
    batch["input_features"] = processor.feature_extractor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    batch["labels"] = processor.tokenizer(
        batch["text"]
    ).input_ids
    
    return batch

In [9]:
# Lancement du mapping 
datasets = datasets.map(
    prepare_dataset, 
    remove_columns=datasets.column_names["train"], 
    num_proc=4,
    fn_kwargs={"processor": processor}
)

print("Pré-traitement réussi.")
print(datasets)

Pré-traitement réussi.
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 195
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 22
    })
})


In [10]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [11]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [12]:
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [13]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer_result = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer_result = 100 * cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer_result, "cer": cer_result}

In [14]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-docvoice-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    max_steps=200,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=128,
    save_steps=25,
    eval_steps=25,
    logging_steps=1,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    save_total_limit=2,
    greater_is_better=False,
    weight_decay=0.01
)


In [15]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [16]:
print("Lancement du fine-tuning LoRA sur le lexique médical...")
trainer.train()

print("Fine-tuning terminé. Le modèle est sauvegardé.")

Lancement du fine-tuning LoRA sur le lexique médical...


You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer,Cer
25,0.2247,0.418959,36.0,6.719368
50,0.0824,0.41692,40.0,7.905138
75,0.0527,0.400924,36.0,7.509881
100,0.0595,0.424028,36.0,7.509881
125,0.027,0.418324,36.0,7.509881
150,0.0236,0.432583,36.0,7.905138
175,0.0158,0.43517,36.0,7.905138
200,0.0241,0.4369,36.0,7.905138


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Fine-tuning terminé. Le modèle est sauvegardé.


In [19]:
final_adapter_output_dir = "./whisper-docvoice-lora/final_adapter" 
peft_model.save_pretrained(final_adapter_output_dir)

In [20]:
from peft import PeftModel
from evaluate import load

base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, 
    torch_dtype=torch_dtype,
)
processor = AutoProcessor.from_pretrained(model_id)

# 3. Attach LoRA Adapters (Loading the Checkpoint)
# ----------------------------------------------------------------------
# This loads the LoRA weights onto the base model
model = PeftModel.from_pretrained(base_model, final_adapter_output_dir)
model = model.merge_and_unload()
# Set the combined model to evaluation mode
model.eval()

model.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=Tru

In [21]:

datasets = load_dataset("json", data_files={"train": "../dataset/processed/json_format/train.json", "test": "../dataset/processed/json_format/test.json"})
train_dataset = datasets["train"]
test_dataset = datasets["test"]


In [29]:
from tqdm.auto import tqdm
from scipy.io.wavfile import read

def predictData(dataset):
    pred_texts, ref_texts = [], []
    for sample in tqdm(dataset, desc="Predecting Speech"):
        audio_path = sample["audio"]
        sample_rate, waveform = read(audio_path)
        input_features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features.to(torch.float16)
        with torch.no_grad():
            predicted_ids = model.generate(input_features.to("cuda"))[0]
        transcription = processor.decode(predicted_ids)
        pred_texts.append(processor.tokenizer._normalize(transcription))
        ref_texts.append(sample["text"].lower().strip())
    return pred_texts,ref_texts

In [34]:
pred_texts, ref_texts = predictData(test_dataset)

Predecting Speech: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 55/55 [02:39<00:00,  2.90s/it]


In [35]:
from jiwer import wer, cer
baseWer = wer(ref_texts, pred_texts) * 100
baseCer = cer(ref_texts, pred_texts) * 100
print(f"Whisper WER (test_dataset): {baseWer:.2f}%")
print(f"Whisper CER (test_dataset): {baseCer:.2f}%")

Whisper WER (test_dataset): 70.18%
Whisper CER (test_dataset): 31.11%


In [32]:
pred_texts, ref_texts = predictData(train_dataset)

Predecting Speech: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 195/195 [09:47<00:00,  3.01s/it]


In [33]:
baseWer = wer(ref_texts, pred_texts) * 100
baseCer = cer(ref_texts, pred_texts) * 100
print(f"Whisper WER (train_dataset): {baseWer:.2f}%")
print(f"Whisper CER (train_dataset): {baseCer:.2f}%")

Whisper WER (train_dataset): 74.30%
Whisper CER (train_dataset): 28.27%
