In [1]:
# Import libraries
import logging
import os
import sys
import json
import random
import numpy as np
from dataclasses import dataclass, field
from typing import Optional
from huggingface_hub import notebook_login

from datasets import load_dataset, DatasetDict
from transformers import (WhisperFeatureExtractor, 
                          WhisperTokenizer, 
                          WhisperProcessor,
                          WhisperModel,
                          WhisperForConditionalGeneration, 
                          Seq2SeqTrainingArguments, 
                          Seq2SeqTrainer, 
                          TrainerCallback, 
                          TrainingArguments, 
                          TrainerState, 
                          TrainerControl)
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from peft import (prepare_model_for_int8_training,
                  LoraConfig, 
                  PeftModel, 
                  LoraModel, 
                  LoraConfig, 
                  TaskType,
                  get_peft_model)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from transformers.utils import check_min_version
import re

from trainer_utils import AlignmentSeq2SeqTrainer
from data_utils import (DataCollatorSpeechSeq2SeqWithPadding, 
                        load_sd_qa_dataset, 
                        filter_data)

In [2]:

# Setup 
os.system("pip install -q transformers librosa datasets==2.14.6 evaluate jiwer gradio bitsandbytes==0.37 accelerate geomloss gradio torchaudio")
os.system("pip install -q git+https://github.com/huggingface/peft.git@main")
# current = os.path.dirname(os.path.realpath(__file__))  # name of this directory
# parent = os.path.dirname(current)  # parent directory
# sys.path.append(parent)  # add parent directory to sys.path

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use first gpu on machine

# logger = logging.getLogger(__name__)
# check_min_version("4.21.0")  # calls an error if minimal version of Transformers is not installed. 




0

In [5]:
model_path = "openai/whisper-base"
task = "transcribe"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)
tokenizer = WhisperTokenizer.from_pretrained(model_path, task=task)
processor = WhisperProcessor.from_pretrained(model_path, task=task)

# load pre-trained model checkpoint
model = WhisperForConditionalGeneration.from_pretrained(model_path)
# model.hf_device_map = {" ":0}  # not super sure what to map to here
model.config.forced_decoder_ids = None  # no tokens forced for decoder outputs
model.config.suppress_tokens = []
    
    # load data
target_dialect = 'usa'
source_dialect = 'ind_n'
sd_qa = filter_data(load_sd_qa_dataset(), source=source_dialect, target=target_dialect)
    
print(sd_qa['dev'][0])
sd_qa['dev'] = sd_qa['dev'].select([0,1,2,3,4,5])
sd_qa['test']= sd_qa['test'].select([0,1,2,3,4,5])

    # prepare data
def prepare_source_data(data):
        # compute log-Mel input features from audio arrays
    data["source_input_features"] = feature_extractor(data[source_dialect]["array"], sampling_rate=data[source_dialect]["sampling_rate"]).input_features[0]
    data["target_input_features"] = feature_extractor(data[target_dialect]["array"], sampling_rate=data[target_dialect]["sampling_rate"]).input_features[0]
    return data

def prepare_target_embeddings(data):
    # compute log-Mel input features from target audio array
    batch_size = 128
    target_embeddings = []
    decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
    for i in range(0, len(data["target_input_features"]), batch_size):
        input_features = torch.tensor(data["target_input_features"][i: i + batch_size])
        with torch.no_grad():
            outputs = model(input_features, decoder_input_ids=decoder_input_ids, output_hidden_states=True)
        last_hidden_state = outputs.encoder_hidden_states[-1]
        target_embeddings.extend([embedding for embedding in last_hidden_state])
    data["target_embeddings"] = target_embeddings
    return data


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'id': '-1008642825401516622', 'ind_n': {'path': None, 'array': array([ 0.00000000e+00, -3.05175781e-05, -3.05175781e-05, ...,
        3.96728516e-04,  2.13623047e-04,  6.10351562e-05]), 'sampling_rate': 16000}, 'usa': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00201416, 0.00259399,
       0.00262451]), 'sampling_rate': 16000}}


In [10]:
sd_qa['test'][0]

{'id': '-1009049296232977537',
 'ind_n': {'path': None,
  'array': array([-0.00119019, -0.00198364, -0.00198364, ...,  0.05123901,
          0.04470825,  0.03805542]),
  'sampling_rate': 16000},
 'usa': {'path': None,
  'array': array([ 0.       ,  0.       ,  0.       , ..., -0.0078125, -0.0078125,
         -0.0078125]),
  'sampling_rate': 16000}}

In [11]:
sd_qa['test'] = sd_qa['test'].map(prepare_source_data, num_proc=2, desc="Extract features for source dialect"
                      ).map(prepare_target_embeddings,batched=True,desc="Original hidden embeddings for target dialect")

Extract features for source dialect (num_proc=2):   0%|          | 0/6 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


TimeoutError: 