# Speech2text with Hugging Face 🤗 Transformers

In [None]:
! pip install -q transformers

In [None]:
import librosa
import torch
import numpy as np
import IPython.display as display

from transformers import Wav2Vec2ForCTC, HubertForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor
from typing import *

In [None]:
# Target sampling rate in Hz. Used as the default `sr` passed to librosa.load
# by load_audio, so every clip is loaded at this rate.
# NOTE(review): the model cards of the checkpoints used below specify 16 kHz
# input; confirm before changing this value.
SAMPLING_RATE = 16000

## Load pre-trained model and tokenizer

- https://huggingface.co/facebook/wav2vec2-base-960h
- https://huggingface.co/facebook/wav2vec2-large-960h
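- https://huggingface.co/facebook/wav2vec2-large-xlsr-53
- https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english (the XLSR checkpoint actually loaded below)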
- https://huggingface.co/facebook/hubert-large-ls960-ft

https://huggingface.co/docs/transformers/model_doc/wav2vec2

https://huggingface.co/docs/transformers/model_doc/hubert

In [None]:
# Hugging Face Hub identifiers of the checkpoints compared in this notebook.
BASE_MODEL_NAME = "facebook/wav2vec2-base-960h"
LARGE_MODEL_NAME = "facebook/wav2vec2-large-960h"
# Judging by its name, an English fine-tune of XLSR-53 published by a community
# author rather than the raw facebook/wav2vec2-large-xlsr-53 checkpoint.
XLSR_MODEL_NAME = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
HUBERT_MODEL_NAME = "facebook/hubert-large-ls960-ft"

In [None]:
# Wav2Vec2Tokenizer is deprecated in Transformers and triggers a
# "tokenizer class ... is not the same type" warning on checkpoints saved with
# Wav2Vec2CTCTokenizer. Wav2Vec2Processor (already used for HuBERT below)
# bundles the feature extractor and the CTC tokenizer, and exposes the same
# call / decode interface that speech2text relies on.
# The "*_tokenizer" variable names are kept so later cells keep working.
base_tokenizer = Wav2Vec2Processor.from_pretrained(BASE_MODEL_NAME)
base_model = Wav2Vec2ForCTC.from_pretrained(BASE_MODEL_NAME)

large_tokenizer = Wav2Vec2Processor.from_pretrained(LARGE_MODEL_NAME)
large_model = Wav2Vec2ForCTC.from_pretrained(LARGE_MODEL_NAME)

xlsr_tokenizer = Wav2Vec2Processor.from_pretrained(XLSR_MODEL_NAME)
xlsr_model = Wav2Vec2ForCTC.from_pretrained(XLSR_MODEL_NAME)

hubert_tokenizer = Wav2Vec2Processor.from_pretrained(HUBERT_MODEL_NAME)
hubert_model = HubertForCTC.from_pretrained(HUBERT_MODEL_NAME)

## Load an audio file

In [None]:
def load_audio(file_path: str, sampling_rate=SAMPLING_RATE) -> np.ndarray:
    """Read an audio file and return only its waveform.

    The file is decoded by ``librosa.load`` with ``sr=sampling_rate``. The
    sample rate that librosa returns alongside the samples is discarded.

    Args:
        file_path: Path of the audio file to read.
        sampling_rate: Value forwarded to ``librosa.load`` as ``sr``.

    Returns:
        The audio samples returned by ``librosa.load``.
    """
    waveform, _ = librosa.load(file_path, sr=sampling_rate)
    return waveform

In [None]:
load_audio("death_mispronounced.wav")

array([ 0.        ,  0.        ,  0.        , ..., -0.00027445,
       -0.00049258, -0.00097774], dtype=float32)

In [None]:
display.Audio("death_correct.wav", autoplay=False)

## Testing English models on ambiguous word pronunciations

Audio samples recorded by a non-native English speaker, including mispronounced words (e.g. _death_ instead of _depth_).

In [None]:
# Default model / processor pair used by speech2text when the caller passes none.
# Python evaluates default arguments when the `def` statement runs, so rebinding
# these names later has no effect unless the speech2text definition is re-run.
MODEL = hubert_model
TOKENIZER = hubert_tokenizer

def speech2text(speech: np.ndarray,
                tokenizer: Union[Wav2Vec2Tokenizer, Wav2Vec2Processor] = TOKENIZER,
                model: Union[Wav2Vec2ForCTC, HubertForCTC] = MODEL,
                sampling_rate: int = SAMPLING_RATE) -> str:
    """Transcribe a single waveform with a CTC model using greedy decoding.

    Args:
        speech: Audio samples of one utterance (e.g. the output of load_audio).
        tokenizer: Processor/tokenizer that turns the waveform into
            ``input_values`` and decodes predicted ids back to text.
        model: CTC model whose output exposes ``.logits``.
        sampling_rate: Sampling rate of ``speech`` in Hz, forwarded to the
            processor so it does not warn about a missing ``sampling_rate``.

    Returns:
        The decoded transcription of the first (only) item in the batch.
    """
    # NOTE(review): the deprecated Wav2Vec2Tokenizer is expected to accept the
    # extra keyword via **kwargs; confirm for the installed Transformers version.
    input_values = tokenizer(
        speech, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decoding: take the most likely token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])

In [None]:
def speech2textCMP(speech: np.ndarray) -> Dict[str, str]:
    """Transcribe the same waveform with every loaded English model.

    Args:
        speech: Audio samples of one utterance (e.g. the output of load_audio).

    Returns:
        A dict mapping a model label ("BASE_MODEL", "LARGE_MODEL",
        "XLSR_MODEL", "HUBERT_MODEL") to the transcription produced by
        speech2text with that model and its tokenizer.
    """
    return {
        "BASE_MODEL": speech2text(speech, base_tokenizer, base_model),
        "LARGE_MODEL": speech2text(speech, large_tokenizer, large_model),
        "XLSR_MODEL": speech2text(speech, xlsr_tokenizer, xlsr_model),
        "HUBERT_MODEL": speech2text(speech, hubert_tokenizer, hubert_model),
    }

### "We should go more in depth with this topic"

In [None]:
# Audio files for this sentence; by their names, one recording with the target
# word pronounced correctly and one with it mispronounced.
SAMPLE_CORRECT = "depth_correct.wav"
SAMPLE_MISPRONOUNCED = "depth_mispronounced.wav"

In [None]:
display.Audio(SAMPLE_CORRECT, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_CORRECT))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'WE SHOULD GO MORE IN DABT WITH THIS TOPIC',
 'HUBERT_MODEL': 'WE SHOULD GROW MORE INDEPT WITH THIS TOPIC',
 'LARGE_MODEL': 'WE SHOULD GO MORE IN DEPT WITH THIS TOPIC',
 'XLSR_MODEL': 'we should go more in that with this topic'}

In [None]:
display.Audio(SAMPLE_MISPRONOUNCED, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_MISPRONOUNCED))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'WE SHOULD GO MORE IN DEBT WITH THIS TOPIC',
 'HUBERT_MODEL': 'WE SHOULD GO MORE IN THAT WITH THIS TOPIC',
 'LARGE_MODEL': 'WE SHOULD GO MORE IN DABT WITH THIS TOPIC',
 'XLSR_MODEL': 'we should go more in debt with this topic'}

### "When we talk about the depth of a graph in graph theory we mean how deep it is"

In [None]:
# Rebinds the sample paths for this sentence. The cells below read whichever
# values were assigned last, so run this cell before them.
SAMPLE_CORRECT = "depth_4_correct.wav"
SAMPLE_MISPRONOUNCED = "depth_4_mispronounced.wav"

In [None]:
display.Audio(SAMPLE_CORRECT, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_CORRECT))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'WHEN WE TALK ABOUT THAT OF AGRAPH IN GRAPH THEORY WE MEAN HOW DEEP IT IS',
 'HUBERT_MODEL': 'WHEN WE TALK ABOUT THAT OTOGRAPHINGRAF THEORY WE MEAN HOW DEEP IT IS',
 'LARGE_MODEL': 'WHEN WE TALK ABOUT THAT OF A GRAP IN GRA THEORY WE MEAN HOW DEEP IT IS',
 'XLSR_MODEL': 'when we talk about depth of a graph in graph theory we mean houdeep it is'}

In [None]:
display.Audio(SAMPLE_MISPRONOUNCED, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_MISPRONOUNCED))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'WHEN WE TALK ABOUT THAT OF A GRAPHINGRA THEORY WE MEAN NOW DEPIT IS',
 'HUBERT_MODEL': 'WHEN WE TALK ABOUT THAT OF A GRAPINGRAF THEORY WE MEAN HOW DEEP IT IS',
 'LARGE_MODEL': 'WHEN WE TALK ABOUT THAT OF A GRAFFIN GRAFF THEORY WE MAY NOW DEPIRISE',
 'XLSR_MODEL': 'when we talk about dat of a graph in graph theory we may now deperis'}

### "How can we measure the depth of the ocean?"

In [None]:
# Rebinds the sample paths for this sentence. The cells below read whichever
# values were assigned last, so run this cell before them.
SAMPLE_CORRECT = "depth_2_correct.wav"
SAMPLE_MISPRONOUNCED = "depth_2_mispronounced_3.wav"

In [None]:
display.Audio(SAMPLE_CORRECT, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_CORRECT))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'HOW CAN WE MEASURE THE DEPTH OF THE OCEAN',
 'HUBERT_MODEL': 'HOW CAN WE MEASURE THE DEPTH OF THE OCEAN',
 'LARGE_MODEL': 'HOW CAN WE MEASURE THE DEPTH OF THE OCEAN',
 'XLSR_MODEL': 'how can we measure the depth of the ocean'}

In [None]:
display.Audio(SAMPLE_MISPRONOUNCED, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_MISPRONOUNCED))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'HOW CAN WE MEASURE THE DEATH OF THE OCEAN',
 'HUBERT_MODEL': 'HOW CAN WE MEASURE THE DEPTH OF THE OCEAN',
 'LARGE_MODEL': 'HOW CAN WE MEASURE THE DEATH OF THE OCEAN',
 'XLSR_MODEL': 'how can we measure the death of the ocean'}

### "Do you believe in life after death?"

In [None]:
# Rebinds the sample paths for this sentence. The cells below read whichever
# values were assigned last, so run this cell before them.
SAMPLE_CORRECT = "death_correct_3.wav"
SAMPLE_MISPRONOUNCED = "death_mispronounced_2.wav"

In [None]:
display.Audio(SAMPLE_CORRECT, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_CORRECT))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'DO YOU BELIEVE IN LIFE AFTER DEATH',
 'HUBERT_MODEL': 'DO YOU BELIEVE IN LIFE AFTER DEATH',
 'LARGE_MODEL': 'DO YOU BELIEVE IN LIFE AFTER DEATH',
 'XLSR_MODEL': 'do you believe in life after that'}

In [None]:
display.Audio(SAMPLE_MISPRONOUNCED, autoplay=False)

In [None]:
speech2textCMP(load_audio(SAMPLE_MISPRONOUNCED))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'BASE_MODEL': 'DO YOU BELIEVE IN LIFE AFTER THAT',
 'HUBERT_MODEL': 'DO BELIEVE IN LIFE AFTER DEATH',
 'LARGE_MODEL': 'DO YOU BELIEVE IN LIFE AFTER DEAT',
 'XLSR_MODEL': 'do you believe in life after theat'}

## Testing Italian transcriptions with a fine-tuned model

https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-italian

In [None]:
# Hugging Face Hub identifier of the Italian checkpoint.
MODEL_NAME_ITA = "jonatasgrosman/wav2vec2-large-xlsr-53-italian"

# Loaded through Wav2Vec2Processor rather than the deprecated Wav2Vec2Tokenizer,
# which warns that this checkpoint's tokenizer class is Wav2Vec2CTCTokenizer.
# The processor exposes the same call / decode interface speech2text uses.
TOKENIZER_ITA = Wav2Vec2Processor.from_pretrained(MODEL_NAME_ITA)
MODEL_ITA = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME_ITA)

def speech2textIT(speech: np.ndarray,
                  tokenizer: Union[Wav2Vec2Tokenizer, Wav2Vec2Processor] = TOKENIZER_ITA,
                  model: Wav2Vec2ForCTC = MODEL_ITA) -> str:
    """Transcribe a waveform with the Italian model.

    Thin wrapper around speech2text whose defaults are the Italian
    processor and model loaded above.

    Args:
        speech: Audio samples of one utterance (e.g. the output of load_audio).
        tokenizer: Processor/tokenizer forwarded to speech2text.
        model: CTC model forwarded to speech2text.

    Returns:
        The transcription returned by speech2text.
    """
    return speech2text(speech, tokenizer, model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


In [None]:
display.Audio("ita_test.wav", autoplay=False)

In [None]:
speech2textIT(load_audio("ita_test.wav"))

'questo è un test del modello in italiano per fare spicitu text'

In [None]:
speech2textIT(load_audio("ita_test_2.wav"))

"questo è un secondo test del modello per effettuare le trascrizioni in italiano scaricato dall sito web e vediamo come funziona con una frase un po' più e lunga"

### "Cosa può cambiare tra il primo e il secondo?"

In [None]:
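# Rebinds the sample paths for this sentence. The cells below read whichever
# values were assigned last, so run this cell before them.
SAMPLE_CORRECT = "ita_cambiare_correct.wav"
SAMPLE_MISPRONOUNCED = "ita_cambiare_mispronounced.wav"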

In [None]:
display.Audio(SAMPLE_CORRECT, autoplay=False)

In [None]:
speech2textIT(load_audio(SAMPLE_CORRECT))

'cosa può cambiare tra il primo e il secondo'

In [None]:
display.Audio(SAMPLE_MISPRONOUNCED, autoplay=False)

In [None]:
speech2textIT(load_audio(SAMPLE_MISPRONOUNCED))

'cosa può cambiare tra il primo e il secondo'