## Install Requirements

In [5]:
!pip install datasets librosa jiwer transformers colorednoise pyctcdecode torchaudio

[0m

## Imports

In [6]:
import colorednoise as cn
from datasets import load_dataset, load_from_disk
from jiwer import wer
import librosa
import numpy as np
import os
import tarfile
import torch
import torchaudio
import urllib.request
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoModelForCTC, AutoProcessor

## wav2vec 2.0 on accent dataset

In [7]:
# set paths
datasets_path = os.path.join(os.getcwd(), 'datasets') 
# create folders if they do not already exist
if not os.path.exists(datasets_path): os.makedirs(datasets_path)
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# copy files from gcp
!gsutil -m cp -n -r gs://capstone_datasets/speech-accent-archive ./datasets/

Skipping existing item: file://./datasets/speech-accent-archive/reading-passage.txt
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans1.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans2.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans3.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans4.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans5.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/agni1.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/albanian8.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/amharic15.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/akan1.mp3
Skipping existing item: file://./datasets/speech-accent-archive/recordings/amharic11.mp3
Skipping existing item: file:

In [9]:
# load dataset
dataset = load_dataset("datasets", data_dir="./speech-accent-archive/recordings", drop_labels=True)

Resolving data files:   0%|          | 0/2138 [00:00<?, ?it/s]

Using custom data configuration datasets-84a94aafc5f723e5


Downloading and preparing dataset audiofolder/datasets to /home/Max/.cache/huggingface/datasets/audiofolder/datasets-84a94aafc5f723e5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...
                

Downloading data files #2:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/133 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/134 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /home/Max/.cache/huggingface/datasets/audiofolder/datasets-84a94aafc5f723e5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# # load model
# model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
# processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

In [11]:
# def map_to_pred(batch, model, processor):
#     inputs = processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
#     inputs = {k: v.to("cuda") for k,v in inputs.items()}
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     transcription = processor.batch_decode(logits.cpu().numpy()).text[0]
#     batch["transcription"] = transcription
#     return batch

In [12]:
# dataset = dataset.map(map_to_pred, fn_kwargs={"model": model, "processor": processor}, batch_size=2, writer_batch_size=2)

In [13]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def map_to_pred(batch, model, tokenizer):
    """
    predicts transcription
    """
    #tokenize
    input_values = tokenizer(batch["audio"]["array"], return_tensors="pt").input_values
    #take logits
    logits = model(input_values.to(device)).logits
    #take argmax (find most probable word id)
    predicted_ids = torch.argmax(logits, dim=-1)
    #get the words from the predicted word ids
    transcription = tokenizer.decode(predicted_ids[0])
    #save logits and transcription
    batch["logits"] = logits.cpu().detach().numpy()
    batch["transcription"] = transcription
    return batch

In [15]:
dataset = dataset.map(map_to_pred, fn_kwargs={"model": model, "tokenizer": tokenizer}, batch_size=1, writer_batch_size=1)

  0%|          | 0/2138 [00:00<?, ?ex/s]

RuntimeError: CUDA out of memory. Tried to allocate 378.00 MiB (GPU 0; 14.76 GiB total capacity; 3.93 GiB already allocated; 380.75 MiB free; 4.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
!nvidia-smi

Thu Nov  3 02:10:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 495.46       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    30W /  70W |   9492MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces