In [34]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from torchaudio import load
import torch

# Load Whisper processor
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer

from dotenv import load_dotenv
from huggingface_hub import login
import os


In [2]:
load_dotenv()
login_token = os.getenv('HuggingFaceToken')

login(login_token)

In [3]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="train+validation", trust_remote_code=True
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_17_0", "ml", split="test", trust_remote_code=True
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2023
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 710
    })
})


In [5]:
# remove unwanted features
common_voice = common_voice.select_columns(['audio', 'sentence'])

In [6]:
print(common_voice['train'][0]['audio'])

{'path': 'C:\\Users\\VICTUS\\.cache\\huggingface\\datasets\\downloads\\extracted\\3e7b12b0fa0deddeccc4a37a644801109d30fe7dda8b39a953688d0be0744a2f\\ml_train_0/common_voice_ml_37003897.mp3', 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       1.33694380e-06, 6.72575652e-07, 1.44025307e-07], shape=(150336,)), 'sampling_rate': 48000}


In [7]:
# 48kHz -> 16kHz
from datasets import Audio
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [8]:
common_voice['train'][0]['audio']['array']

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -1.61271237e-06, -1.26397367e-06,  1.32478658e-06], shape=(50112,))

In [9]:
# def collate_fuc(batch):
#     print(len(batch))
#     print(batch[0].keys())
# data_loader = DataLoader(common_voice['train'], batch_size=3, shuffle=True, collate_fn=collate_fuc)

In [10]:
# for i in data_loader:
#     break

In [11]:
# filtering audio len > 30 sec  

In [12]:
# whisper proccessor wrap whisperFeature extractor for audio and whispertokenizer for text labels as one processor 
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', task='transcribe', language='malayalam')

In [13]:
# sample_tokens = tokenizer.encode('ഇല്ല മോനേ')
# tokenizer.decode(sample_tokens)

In [14]:
audio, sampling = load('record_out.wav')
print(f"Audio:{audio}")
print(f"Sampling : {sampling}")
audio = audio[0] # convert to 1d array (single audio should 1d array if it is mono, if stereo , we use two array for audio )

text = 'ഇല്ല മോനേ'
batch = processor(audio=audio, text=text, sampling_rate=16000) 

Audio:tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0004,  0.0003, -0.0002]])
Sampling : 48000


In [15]:
# Prepare data
def prepare_data(batch):
    # processor have both feature extractor for audio and tokenizer for text, so we just pass both of theem
    batch = processor(audio=batch['audio']['array'],
                      text=batch['sentence'],
                      sampling_rate=processor.feature_extractor.sampling_rate)
    return batch

In [16]:
common_voice = common_voice.map(prepare_data, batched=False)

Map:   0%|          | 0/2023 [00:00<?, ? examples/s]

Map:   0%|          | 0/710 [00:00<?, ? examples/s]

In [15]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 2023
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 710
    })
})

In [17]:
common_voice = common_voice.select_columns(['input_features', 'labels'])

In [None]:
common_voice['train'][0]['input_features'] # (1, 80, 3000)

Dataloader takes random datapoints, here it will look like {input_feature:.., labels}, when batch enabled it will be like [{inp:.., lable:..}, {inp: .., label:..}], we need to use data collator for pad them and join them

In [None]:
# Data Collator for padding

class DataCollatorForSeqToSeqPadding:
    def __init__(self, processor):
        self.processor = processor
        
    def __call__(self, batch):
        # batch = [{}, {} ...]
        
        input_features = []
        labels = []
        # convert to list -> list of list
        for data in batch:
            features = torch.tensor(data['input_features']).squeeze(0) # (1, 80, 3000) -> (80, 3000)
            input_features.append(features)
            labels.append(torch.tensor(data['labels']))
    
        # padding
            
    
        # tensor stacking
        input_features = torch.stack(input_features, dim=0)
        labels = torch.stack(labels) # have 
            
        
        return input_features, labels
        

In [63]:
collate_fn = DataCollatorForSeqToSeqPadding(processor=processor)
data_loader = DataLoader(dataset=common_voice['train'],
                         collate_fn=collate_fn,
                         batch_size=5,
                         shuffle=True, 
                         drop_last=True)

In [68]:
for i, labels in data_loader:
    break

RuntimeError: stack expects each tensor to be equal size, but got [78] at entry 0 and [117] at entry 1

In [69]:
# i.shape -> torch.Size([5, 80, 3000])
labels

[tensor([50258, 50296, 50359, 50363, 46503,   229, 46503,   110,   156,   113,
           235, 46503,   110,   220, 46503,   106,   156,   113,   233, 46503,
           101,   156,   113,   229, 50257]),
 tensor([50258, 50296, 50359, 50363, 46503,   106,   156,   113,   228, 46503,
           107,   156,   113,   235,   220, 46503,   106, 46503,   122, 46503,
           116,   156, 23560,   156,   113,   235,   156, 23560, 46503,   123,
           156,   113,   121,   220, 46503,   228, 46503,   109,   156,   113,
           235,   220, 46503,    99, 46503,   114, 46503,   122, 46503,   224,
         46503,   114, 46503,   224,   220, 46503,   228, 46503,   109,   156,
           113,   235,   220, 46503,   114,   156, 23560, 46503,   106, 46503,
           122, 46503,   101, 46503,   106, 46503,   122, 46503,    96,   156,
           113,   235,   220, 46503,   113, 46503,   123, 46503,   110, 46503,
           243,   156,   113,   235, 46503,   243, 46503,   107, 46503,   109,
      

In [58]:
feature_extractor(common_voice['train'][:2]['audio'])

It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


TypeError: float() argument must be a string or a real number, not 'dict'