# Speech2text with Hugging Face 🤗 Transformers

Background reading — an overview of speech-to-text APIs: <https://nordicapis.com/5-best-speech-to-text-apis-in-2021/>

In [None]:
! pip install -q transformers

[K     |████████████████████████████████| 3.8 MB 5.0 MB/s 
[K     |████████████████████████████████| 895 kB 24.9 MB/s 
[K     |████████████████████████████████| 596 kB 43.9 MB/s 
[K     |████████████████████████████████| 6.5 MB 30.1 MB/s 
[K     |████████████████████████████████| 67 kB 3.4 MB/s 
[?25h

In [None]:
from pprint import pprint

import IPython.display as display
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Tokenizer

## Load pre-trained model and tokenizer

Model card: <https://huggingface.co/facebook/wav2vec2-base-960h>

Wav2Vec2 API documentation: <https://huggingface.co/docs/transformers/model_doc/wav2vec2>

In [None]:
# Hugging Face Hub checkpoint id; passed to `from_pretrained` when loading the
# tokenizer and the model.
MODEL_NAME = "facebook/wav2vec2-base-960h"

In [None]:
# Wav2Vec2Processor bundles the feature extractor (waveform -> input_values)
# with the CTC tokenizer (token ids -> text). It replaces the deprecated
# Wav2Vec2Tokenizer, which makes this checkpoint emit a
# "tokenizer class ... is not the same type" warning on load.
# The variable keeps the name `tokenizer` so every later cell that calls
# `tokenizer(...)` or `tokenizer.decode(...)` keeps working unchanged.
tokenizer = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# Acoustic model with a CTC head; its logits are decoded back to text further down.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading:   0%|          | 0.00/360M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load an audio file

In [None]:
# Remote location of the sample recording fetched by the download cell.
WAV_AUDIO_FILE_URL = "https://www.opdsupport.com/downloads/miscellaneous/sample-audio-files/52-welcome-wav/download"

# Local path the download is written to and the audio is later loaded from.
# (A previous `FILE_NAME = "audio.wav"` assignment was dead code: it was
# overwritten immediately by this value, which is the one in effect.)
FILE_NAME = "depth_correct.wav"

In [None]:
! wget {WAV_AUDIO_FILE_URL} -O {FILE_NAME}

--2022-03-19 17:56:51--  https://www.opdsupport.com/downloads/miscellaneous/sample-audio-files/52-welcome-wav/download
Resolving www.opdsupport.com (www.opdsupport.com)... 185.229.20.48
Connecting to www.opdsupport.com (www.opdsupport.com)|185.229.20.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1292288 (1.2M) [audio/x-wav]
Saving to: ‘death_misspelled.ogg’


2022-03-19 17:56:54 (1.40 MB/s) - ‘death_misspelled.ogg’ saved [1292288/1292288]



In [None]:
# Decode the audio file into a waveform, asking librosa to resample it to 16 kHz.
TARGET_SAMPLE_RATE = 16000
speech, rate = librosa.load(FILE_NAME, sr=TARGET_SAMPLE_RATE)

In [None]:
# Render an inline audio player for the downloaded clip; autoplay=True starts
# playback as soon as the cell runs.
# `display` (IPython.display) is imported in the imports cell at the top of the
# notebook together with the other dependencies.
display.Audio(FILE_NAME, autoplay=True)

## Testing

In [None]:
# Turn the raw waveform into a PyTorch batch ("pt") and keep only the tensor
# that is fed to the model.
encoded_audio = tokenizer(speech, return_tensors="pt")
input_values = encoded_audio.input_values

In [None]:
# Inspect the model input tensor (PyTorch abbreviates long tensors when printing).
input_values

tensor([[0.0004, 0.0004, 0.0004,  ..., 0.0886, 0.0788, 0.0782]])

In [None]:
# Inference only: disable autograd so no computation graph is built for the
# forward pass. Without this the logits carry a grad_fn and keep the whole graph
# alive, which wastes memory and time when nothing is being trained.
with torch.no_grad():
    logits = model(input_values).logits

In [None]:
# Inspect the raw scores produced by the model's CTC head.
logits

tensor([[[ 15.8560, -27.6112, -27.2855,  ...,  -5.8947,  -6.4562,  -6.8793],
         [ 15.8535, -27.5264, -27.2024,  ...,  -5.7899,  -6.3857,  -6.8285],
         [ 15.9368, -27.5419, -27.2230,  ...,  -6.1381,  -6.6819,  -6.9405],
         ...,
         [ 15.9278, -27.7355, -27.4495,  ...,  -6.3373,  -6.7389,  -7.5149],
         [ 15.7476, -27.4275, -27.1132,  ...,  -6.0333,  -6.6799,  -7.0695],
         [ 15.7526, -27.5321, -27.2169,  ...,  -6.1279,  -6.8039,  -7.1651]]],
       grad_fn=<AddBackward0>)

In [None]:
# Greedy decoding step: for every position keep the index with the highest
# score along the last dimension of the logits.
predicted_ids = logits.argmax(dim=-1)

In [None]:
# Inspect the selected token ids (one id per position, before decoding to text).
predicted_ids

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0, 18,  0,  0,  5,  0,  4,  4, 12, 11, 11,  0,
          8,  0, 16, 15,  0,  0, 14,  4,  4, 21,  0,  0,  0,  0,  0,  0,  8,  0,
          4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  0,
          0,  0,  8, 13, 13,  5,  0,  4,  0, 10,  9,  0,  0,  4,  0, 14,  0,  0,
          0,  7, 24,  0,  0,  6,  0,  0,  4,  4,  0, 18,  0,  0, 10,  6, 11, 11,
          4,  4,  6, 11,  0, 10,  0,  0, 12,  0,  4,  4,  0,  0,  6,  0,  0,  0,
          8,  0,  0,  0, 23,  0,  0,  0, 10, 19,  0,  0,  4,  4,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0]])

In [None]:
# Decode the id sequence of the first item in the batch back into text.
first_item_ids = predicted_ids[0]
transcriptions = tokenizer.decode(first_item_ids)

In [None]:
# Show the decoded text.
transcriptions

'THANK YOU FOR CHOUSING THE OLYMPUS DICTATION MANAGEMENT SYSTEM THE OLYMPUS DICTATION MANAGEMENT SYSTEM GIVES YOU THE POWER TO MANAGE YOUR DICTATIONS TRANSCRIPTIONS AND DOCUMENTS SEEMLESSLY AND TO IMPROVE THE PRODUCTIVITY OF YOUR DAILY WORK FOR EXAMPLE YOU CAN AUTOMATICALLY SENT THE DICTATION FILES OR TRANSCRIBED DOCUMENTS TO YOUR ASSISTANT OR THE AUTHOR VIRE EMALE OR F T P IF YOURE USING THE SPEECH RECOGNITION SOFTWARE THE SPEECH RECOGNITION ENGINE WORKS IN THE BACKGROUND TO SUPPORT YOUR DOCUMENT CREATION WE HOPE YOU ENJOY THE SIMPLE FLEXIBLE RELIABLE AND SECURE SOLUTIONS FROM OLYMPUS'

## Test top k predictions

In [None]:
# Number of highest-scoring candidate tokens to consider per position in the
# top-k experiment (the `k` of torch.topk).
K = 2

In [None]:
# Keep the K highest-scoring token ids for every position. The previous version
# hard-coded k=2, so changing the K constant defined in the cell above had no effect.
# torch.topk operates on the last dimension by default and returns the entries
# sorted from largest to smallest.
topk = torch.topk(logits, k=K)

# [0] selects the first item of the batch; .T transposes the result so that
# row 0 holds the best candidate for every position, row 1 the runner-up, etc.
top_logits, top_ids = topk.values[0].T, topk.indices[0].T

In [None]:
# Inspect the top-k scores and the matching token ids side by side.
top_logits, top_ids

(tensor([[15.8560, 15.8535, 15.9368, 15.9288, 15.9597, 15.9221, 15.9729, 15.9604,
          16.0041, 15.9716, 15.8827, 15.2108, 15.9206, 15.8140, 15.8992, 15.8977,
          15.9822, 16.0264, 16.0366, 16.0544, 16.0731, 16.0315, 15.9496, 15.5769,
          14.8746, 13.1041, 12.6768, 12.4050, 10.7846, 10.5972, 11.3168, 13.4824,
          13.8672, 11.6460, 12.7109, 12.2744, 13.5258,  9.9795, 11.5919, 11.6702,
          13.2894, 12.8258, 11.0244, 12.5252, 12.9568, 13.5806, 12.9836, 14.2474,
          14.3265, 14.5889, 15.1061, 13.2841, 12.5275, 11.8119, 11.3352, 12.9585,
          12.5637, 12.3160, 12.3208, 12.8788, 15.5941, 16.0744, 16.0354, 16.0969,
          16.0824, 15.9775, 15.7608, 15.2490, 14.2353, 13.7246, 13.5280, 13.1182,
          13.5307, 12.4268, 12.2845, 10.7518, 11.0056, 10.0369, 10.6693, 10.2229,
          10.3267, 10.0757, 10.6073,  9.5040, 10.5420, 10.2903,  9.4346,  7.3526,
           8.6355, 10.2835,  9.6307,  8.5543,  5.3210,  7.9336,  8.2515,  8.7196,
           8.394

In [None]:
# Spot check: best-candidate token id (row 0) at position 34.
top_ids[0, 34]

tensor(11)

In [None]:
# Spot check: score of the second-best candidate (row 1) at position 33.
top_logits[1, 33]

tensor(10.2120, grad_fn=<SelectBackward0>)

In [None]:
# Decode two hand-picked id sequences to see how the tokenizer turns ids into text.
sample_id_sequences = (
    [14, 0, 0, 0, 7, 24, 0, 0, 6],
    [6, 11, 11, 11, 5, 0, 24, 6, 24],
)
for id_sequence in sample_id_sequences:
    print(tokenizer.decode(torch.tensor(id_sequence)))

DABT
THEBTB


In [None]:
# Decode row 0 of the top-k ids, i.e. the highest-scoring token at every position.
tokenizer.decode(top_ids[0])

'WE SHOULD GO MORE IN DABT WITH THIS TOPIC'

In [None]:
# Decode row 1 of the top-k ids, i.e. the second-highest-scoring token at every
# position, to see what the runner-up hypothesis looks like.
tokenizer.decode(top_ids[1])

'W U W WYWEI COOALDLGH OE O MOEE ANG THEBTBTE RHAEISS SDTRAOPBPLANK US WSWSNWS S W W W W W'