# Практическое задание из раздела 5 - Дообучить ASR модель

## Подготовка IDE

### Загрузим необходимые библиотеки

In [1]:
from datasets import load_dataset, DatasetDict, Audio
from transformers import WhisperProcessor



## Аутентификация блокнота в HuggingFace

Для того чтобы иметь возможность сохранить модель на HuggingFace Hub, аутентифицируем блокнот:

In [None]:
from huggingface_hub import notebook_login

# Importing the helper does not authenticate anything by itself; the call
# below is what actually prompts for the Hugging Face access token.
notebook_login()

## Доступные языки Whisper

In [14]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE

# Display the mapping from language names to the language codes used by Whisper.
TO_LANGUAGE_CODE

{'english': 'en',
 'chinese': 'zh',
 'german': 'de',
 'spanish': 'es',
 'russian': 'ru',
 'korean': 'ko',
 'french': 'fr',
 'japanese': 'ja',
 'portuguese': 'pt',
 'turkish': 'tr',
 'polish': 'pl',
 'catalan': 'ca',
 'dutch': 'nl',
 'arabic': 'ar',
 'swedish': 'sv',
 'italian': 'it',
 'indonesian': 'id',
 'hindi': 'hi',
 'finnish': 'fi',
 'vietnamese': 'vi',
 'hebrew': 'he',
 'ukrainian': 'uk',
 'greek': 'el',
 'malay': 'ms',
 'czech': 'cs',
 'romanian': 'ro',
 'danish': 'da',
 'hungarian': 'hu',
 'tamil': 'ta',
 'norwegian': 'no',
 'thai': 'th',
 'urdu': 'ur',
 'croatian': 'hr',
 'bulgarian': 'bg',
 'lithuanian': 'lt',
 'latin': 'la',
 'maori': 'mi',
 'malayalam': 'ml',
 'welsh': 'cy',
 'slovak': 'sk',
 'telugu': 'te',
 'persian': 'fa',
 'latvian': 'lv',
 'bengali': 'bn',
 'serbian': 'sr',
 'azerbaijani': 'az',
 'slovenian': 'sl',
 'kannada': 'kn',
 'estonian': 'et',
 'macedonian': 'mk',
 'breton': 'br',
 'basque': 'eu',
 'icelandic': 'is',
 'armenian': 'hy',
 'nepali': 'ne',
 'mongol

## Подготовка датасета

In [15]:
# Build the train/test DatasetDict in one chain:
#   1. load the "en-US" configuration of PolyAI/minds14 and take its "train" split;
#   2. shuffle with a fixed seed and keep 450 rows for training (the rest is test);
#   3. keep only the audio and its transcription
#      (alternatively, the english_transcription column could be tried instead).
minds14 = (
    load_dataset("PolyAI/minds14", "en-US")["train"]
    .train_test_split(train_size=450, shuffle=True, seed=42)
    .select_columns(["audio", "transcription"])
)

minds14

Found cached dataset minds14 (/home/artyom/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/65c7e0f3be79e18a6ffaf879a083daf706312d421ac90d25718459cbf3c42696)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/artyom/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/65c7e0f3be79e18a6ffaf879a083daf706312d421ac90d25718459cbf3c42696/cache-94393ee2aedc20b0.arrow and /home/artyom/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/65c7e0f3be79e18a6ffaf879a083daf706312d421ac90d25718459cbf3c42696/cache-6639db825711b94a.arrow


DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 113
    })
})

In [16]:
# Peek at the first training example before the audio column is re-cast.
minds14["train"][0]

{'audio': {'path': '/home/artyom/.cache/huggingface/datasets/downloads/extracted/f36f6aec19da5f39a26424689c3c607c43d0915fd5f02aab534917afde7eede4/en-US~BALANCE/602ba1e0963e11ccd901cc51.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00024414,
          0.        ,  0.00024414]),
  'sampling_rate': 8000},
 'transcription': "hey honey can you tell me how much money I have in my account like what's the balance"}

In [17]:
# Load the processor of the "openai/whisper-small" checkpoint, configured for
# English transcription.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    task="transcribe",
    language="english",
)

In [18]:
# The processor's feature extractor defines the sampling rate to use.
sampling_rate = processor.feature_extractor.sampling_rate

# Re-declare the "audio" column with that rate so examples are returned at it.
minds14 = minds14.cast_column(
    column="audio",
    feature=Audio(sampling_rate=sampling_rate),
)

In [19]:
# Inspect a training example after the audio column was cast to the new sampling rate.
minds14["train"][2]

{'audio': {'path': '/home/artyom/.cache/huggingface/datasets/downloads/extracted/f36f6aec19da5f39a26424689c3c607c43d0915fd5f02aab534917afde7eede4/en-US~ABROAD/602ba22c963e11ccd901cc57.wav',
  'array': array([ 4.87945363e-06,  6.92212780e-06, -4.69154838e-06, ...,
          9.48604702e-06, -4.41033444e-06, -7.06448191e-06]),
  'sampling_rate': 16000},
 'transcription': "hi I'm going to be traveling abroad in Japan and I just wondered if I'm going to be able to use my bank card when I'm there if you could let me know that would be great thank you"}

In [20]:
def prepare_dataset(example):
    """Run the processor on one example and attach the clip duration.

    Args:
        example: a dataset row with an "audio" entry (holding "array" and
            "sampling_rate") and a "transcription" string.

    Returns:
        The object returned by ``processor`` for this example, extended with
        an "input_length" entry holding the audio duration in seconds.
    """
    clip = example["audio"]
    waveform = clip["array"]
    rate = clip["sampling_rate"]

    features = processor(
        audio=waveform,
        sampling_rate=rate,
        text=example["transcription"],
    )

    # Duration in seconds = number of samples / samples per second.
    features["input_length"] = len(waveform) / rate
    return features

In [22]:
# Apply prepare_dataset to every example of both splits, using a single process.
minds14 = minds14.map(prepare_dataset, num_proc=1)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [None]:
# Upper bound (exclusive) on clip duration, in seconds.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Tell whether a clip duration is strictly below ``max_input_length``.

    Args:
        length: clip duration, in the same unit as ``max_input_length``.

    Returns:
        True if ``length`` is shorter than the limit, False otherwise
        (a duration exactly equal to the limit is rejected).
    """
    return max_input_length > length

In [None]:
# Keep only the training examples whose duration is below max_input_length.
# `input_columns` makes the predicate receive just the "input_length" value.
# Note: the dataset in this notebook is `minds14`; the previous name
# `common_voice` was never defined here and raised a NameError.
minds14["train"] = minds14["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)