In [1]:
import collections
import contextlib
import sys
import wave

In [2]:
def read_wave(path):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate

def write_wave(path, audio, sample_rate):
    """Writes a .wav file.
    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)

class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration, is_speech = True):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration
        self.isSpeech = is_speech

def __frame_generator_old(frame_duration_ms, audio, sample_rate, vad=None):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        is_speech = vad.is_speech(audio[offset:offset + n], sample_rate) if vad is not None else False
        yield Frame(audio[offset:offset + n], timestamp, duration, is_speech)
        timestamp += duration
        offset += n
        
class Utterance(object):
    """Represents an utterance of speech audio data."""
    def __init__(self, audio, timestamp, duration, bytes):
        self.audio = audio
        self.timestamp = timestamp
        self.duration = duration
        self.bytes=bytes


In [26]:
import webrtcvad
import numpy as np
class AudioSegmenter(object):
    """Represents the WebRTC based Audio Segmentation tool"""
    def __init__(self, 
                 vadAggressiveness=2, 
                 frameDurationMs=30, 
                 numFramesInWindow=100, 
                 samplingRate=16000, 
                 numBytesPerSample=2):
        assert numBytesPerSample == 2  ## for now although the algo should work for byte encodings
        self.vad = webrtcvad.Vad(vadAggressiveness)
        self.sample_rate = samplingRate
        self.frame_duration_ms = frameDurationMs
        self.num_frames_in_window = numFramesInWindow
        self.SCALE_FACTOR = 1./float(1 << ((8 * numBytesPerSample)-1))
        self.num_bytes_per_sample = numBytesPerSample
        
    def frame_generator(self, audio):
        """Generates audio frames from PCM audio data.
        Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate.
        Yields Frames of the requested duration.
        """
        n = int(self.sample_rate * (self.frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / self.sample_rate) / 2.0
        while offset + n < len(audio):
            is_speech = self.vad.is_speech(audio[offset:offset + n], self.sample_rate)
            yield Frame(audio[offset:offset + n], timestamp, duration, is_speech)
            timestamp += duration
            offset += n
        
    def vad_collector(self, frames, triggerToggleFactor=0.9, utteranceRunoffDuration=5, minSilenceAtEnds=0.06):
        """Filters out non-voiced audio frames.
        Given a webrtcvad.Vad and a source of audio frames, yields only
        the voiced audio.
        Uses a padded, sliding window algorithm over the audio frames.
        When more than (triggerToggleFactor)X% (default X=90) of the frames in the window are voiced (as
        reported by the VAD), the collector triggers and begins yielding
        audio frames. Then the collector waits until X% of the frames in
        the window are unvoiced to detrigger.
        The window is padded at the front and back to provide a small
        amount of silence or the beginnings/endings of speech around the
        voiced frames.
        Arguments:
        sample_rate - The audio sample rate, in Hz.
        frame_duration_ms - The frame duration in milliseconds.
        padding_duration_ms - The amount to pad the window, in milliseconds.
        vad - An instance of webrtcvad.Vad.
        frames - a source of audio frames (sequence or generator).
        Returns: A generator that yields PCM audio data.
        """
        frame_duration_ms = self.frame_duration_ms
        num_padding_frames = self.num_frames_in_window
        sample_rate = self.sample_rate
        min_silence_frames_at_end = int(minSilenceAtEnds * 1000 / frame_duration_ms)
        print('Min Silence Frames at End: %d' % min_silence_frames_at_end)
        
        # We use a deque for our sliding window/ring buffer.
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
        # NOTTRIGGERED state.
        triggered = False
        start_time = -1
        end_time = -1

        ## smoothing on the frames
        numFrames = len(frames)
        for index, frame in enumerate(frames):
            if(index > 0) and (index < (numFrames-1)):
                if((frames[index].isSpeech != frames[index-1].isSpeech) and (frames[index-1].isSpeech == frames[index+1].isSpeech)):
                    frames[index].isSpeech = frames[index-1].isSpeech

        voiced_frames = []
        segid = 1
        num_silence_frames_at_end = 0
        for frame in frames:
            #is_speech = vad.is_speech(frame.bytes, sample_rate)
            is_speech = frame.isSpeech

            #sys.stdout.write('1' if is_speech else '0')
            #sys.stdout.write(' %.2f\n' % frame.timestamp)
            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                # If we're NOTTRIGGERED and more than 90% of the frames in
                # the ring buffer are voiced frames, then enter the
                # TRIGGERED state.
                if num_voiced > triggerToggleFactor * ring_buffer.maxlen:
                    triggered = True
                    #sys.stdout.write('+(%s)\n' % (ring_buffer[0][0].timestamp,))
                    start_time = ring_buffer[0][0].timestamp
                    # We want to yield all the audio we see from now until
                    # we are NOTTRIGGERED, but we have to start with the
                    # audio that's already in the ring buffer.
                    for f, s in ring_buffer:
                        voiced_frames.append(f)
                    ring_buffer.clear()
                    index = len(voiced_frames)-1
                    while(index>=0):
                        if(voiced_frames[index].isSpeech):
                            break
                        else:
                            num_silence_frames_at_end += 1

            else:
                # We're in the TRIGGERED state, so collect the audio data
                # and add it to the ring buffer.
                voiced_frames.append(frame)
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if not is_speech:
                    num_silence_frames_at_end += 1
                end_time = frame.timestamp + frame.duration
                # If more than 90% of the frames in the ring buffer are
                # unvoiced, then enter NOTTRIGGERED and yield whatever
                # audio we've collected.
                if ((num_unvoiced > (triggerToggleFactor * ring_buffer.maxlen)) or 
                (((end_time - start_time) > utteranceRunoffDuration) and (num_silence_frames_at_end >= min_silence_frames_at_end))):
                    #sys.stdout.write('-(%s)\n' % (frame.timestamp + frame.duration))
                    end_time = frame.timestamp + frame.duration
                    #start_time -= minSilenceAtEnds if start_time >= minSilenceAtEnds else 0.0
                    triggered = False
                    databytes = b''.join([f.bytes for f in voiced_frames])
                    audiosamples = np.frombuffer(databytes, dtype=np.int16).astype(np.float32)
                    audiosamples *= self.SCALE_FACTOR
                    print('Segment %d: start=%.2f end=%.2f bytes=%d duration=%.2f\n' % (segid, start_time, end_time, len(databytes), len(audiosamples)/sample_rate))
                    segid += 1
                    #databytes = None
                    yield(Utterance(audiosamples, start_time, (end_time - start_time), databytes))
                    start_time = -1
                    end_time = -1
                    ring_buffer.clear()
                    voiced_frames = []
                    num_silence_frames_at_end = 0
        #if triggered:
            #sys.stdout.write('-(%s)\n' % (frame.timestamp + frame.duration))
        #sys.stdout.write('\n')
        # If we have any leftover voiced audio when we run out of input,
        # yield it.
        end_time = frame.timestamp + frame.duration
        if voiced_frames:
            databytes = b''.join([f.bytes for f in voiced_frames])
            audiosamples = np.frombuffer(databytes, dtype=np.int16).astype(np.float32)
            audiosamples *= self.SCALE_FACTOR
            print('Segment %d: start=%.2f end=%.2f bytes=%d duration=%.2f\n' % (segid, start_time, end_time, len(databytes), len(audiosamples)/sample_rate))
            #databytes = None
            yield(Utterance(audiosamples, start_time, (end_time - start_time), databytes))

    def process(self, audio, triggerToggleFactor=0.9, utteranceRunoffDuration=5, minSilenceAtEnds=0.06):
        frames = self.frame_generator(audio)
        frames = list(frames)
        
        totalAudioDuration = len(audio) / (self.sample_rate * self.num_bytes_per_sample)
        segmentsList = []
        totalSpeechDuration = 0.0
        totalNumSegments = 0
        maxDuration = 0.0
        speech_segments = self.vad_collector(frames, triggerToggleFactor, utteranceRunoffDuration, minSilenceAtEnds)
        for segment in speech_segments:
            if segment.duration > maxDuration:
                maxDuration = segment.duration
            totalSpeechDuration += segment.duration
            totalNumSegments += 1
            segmentsList.append(segment)
        
        print('Segmenter found total of %d segments with duration %.2fsecs from audio of duration %.2fsecs with max=%.2f -- compression factor: %.2f%%\n' 
                  % (totalNumSegments, totalSpeechDuration, totalAudioDuration, maxDuration, (100.0*totalSpeechDuration/totalAudioDuration)))
        return segmentsList

In [27]:
from __future__ import annotations
#import os
#import sys
import torch
import warnings
import logging
from typing import Optional, Callable
#from datasets import load_from_disk, Dataset
from tqdm import tqdm
from transformers import (
    Wav2Vec2Processor, 
    AutoModelForCTC
)
from huggingsound.utils import get_chunks, get_waveforms, get_dataset_from_dict_list
from huggingsound.token_set import TokenSet
from huggingsound.normalizer import DefaultTextNormalizer
#from huggingsound.trainer import TrainingArguments, ModelArguments, finetune_ctc
from huggingsound.speech_recognition.decoder import Decoder, GreedyDecoder
#from huggingsound.metrics import cer, wer

# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger.setLevel(logging.INFO)

class SpeechRecognitionModel2():
    """
    Speech Recognition Model.

    Parameters
    ----------
    model_path : str
        The path to the model or the model identifier from huggingface.co/models.
    
    device: Optional[str] = "cpu"
        Device to use for inference/evaluation/training, default is "cpu". If you want to use a GPU for that, 
        you'll probably need to specify the device as "cuda"
    """

    def __init__(self, model_path: str, device: Optional[str] = "cpu"):
        
        self.model_path = model_path
        self.device = device
        
        logger.info("Loading model...")
        self._load_model()

    @property
    def is_finetuned(self):
        return self.processor is not None

    def _load_model(self):

        self.model = AutoModelForCTC.from_pretrained(self.model_path)
        self.model.to(self.device)

        try:
            self.processor = Wav2Vec2Processor.from_pretrained(self.model_path)
            self.token_set = TokenSet.from_processor(self.processor)
        except Exception:
            logger.warning("Not fine-tuned model! You'll need to fine-tune it before use this model for audio transcription")
            self.processor = None
            self.token_set = None

    def transcribeFiles(self, paths: list[str], batch_size: Optional[int] = 1, decoder: Optional[Decoder] = None) -> list[dict]:
        """ 
        Transcribe audio files.

        Parameters:
        ----------
            paths: list[str]
                List of paths to audio files to transcribe

            batch_size: Optional[int] = 1
                Batch size to use for inference

            decoder: Optional[Decoder] = None
                Decoder to use for transcription. If you don't specify this, the engine will use the GreedyDecoder.

        Returns:
        ----------
            list[dict]:
                A list of dictionaries containing the transcription for each audio file:

                [{
                    "transcription": str,
                    "start_timesteps": list[int],
                    "end_timesteps": list[int],
                    "probabilities": list[float]
                }, ...]
        """

        if not self.is_finetuned:
            raise ValueError("Not fine-tuned model! Please, fine-tune the model first.")
        
        if decoder is None:
            decoder = GreedyDecoder(self.token_set)

        sampling_rate = self.processor.feature_extractor.sampling_rate
        result = []

        for paths_batch in tqdm(list(get_chunks(paths, batch_size))):

            waveforms = get_waveforms(paths_batch, sampling_rate)

            inputs = self.processor(waveforms, sampling_rate=sampling_rate, return_tensors="pt", padding=True, do_normalize=True)

            with torch.no_grad():
                logits = self.model(inputs.input_values.to(self.device), attention_mask=inputs.attention_mask.to(self.device)).logits

            result += decoder(logits)

        return result
    
    def transcribeAudio(self, utterances: list[Utterance], batch_size: Optional[int] = 1, decoder: Optional[Decoder] = None) -> list[dict]:
        """ 
        Transcribe audio files.

        Parameters:
        ----------
            paths: list[Utterance]
                List of audio utterances to transcribe

            batch_size: Optional[int] = 1
                Batch size to use for inference

            decoder: Optional[Decoder] = None
                Decoder to use for transcription. If you don't specify this, the engine will use the GreedyDecoder.

        Returns:
        ----------
            list[dict]:
                A list of dictionaries containing the transcription for each audio file:

                [{
                    "transcription": str,
                    "start_timesteps": list[int],
                    "end_timesteps": list[int],
                    "probabilities": list[float]
                }, ...]
        """

        if not self.is_finetuned:
            raise ValueError("Not fine-tuned model! Please, fine-tune the model first.")

        if decoder is None:
            decoder = GreedyDecoder(self.token_set)

        sampling_rate = self.processor.feature_extractor.sampling_rate
        result = []

        for utts_batch in tqdm(list(get_chunks(utterances, batch_size))):

            #waveforms = get_waveforms(paths_batch, sampling_rate)
            waveforms = []
            for utt in utts_batch:
                waveforms.append(utt.audio)

            inputs = self.processor(waveforms, sampling_rate=sampling_rate, return_tensors="pt", padding=True, do_normalize=True)

            with torch.no_grad():
                logits = self.model(inputs.input_values.to(self.device), attention_mask=inputs.attention_mask.to(self.device)).logits

            batchResults = decoder(logits)
            for index, br in enumerate(batchResults):
                br['utterance_start']    = '%.2f' % utts_batch[index].timestamp
                br['utterance_duration'] = '%.2f' % utts_batch[index].duration
            result += batchResults

        return result

In [38]:
#audioFile = '/Users/asrivast/Data/tdaudio/1minagent.wav'
audioFile = '/Users/asrivast/Data/Spanish_Conversational_Speech_Corpus/WAV/A0001_S003_0_G0001_G0002.wav'
aggressiveness = 3
frameDurationMs = 30
numFramesInWindow = 20

audio, sample_rate = read_wave(audioFile)
print('sample rate = %d\n' % sample_rate)
print(len(audio)/(sample_rate * 2))

segmenter = AudioSegmenter(aggressiveness, frameDurationMs, numFramesInWindow, sample_rate, 2)
#vad = webrtcvad.Vad(aggresiveness)
#frames = frame_generator(frameDurationMs, audio, sample_rate, vad)
#frames = list(frames)
#speech_segments = vad_collector(sample_rate, frameDurationMs, (frameDurationMs*numFramesInWindow), vad, frames)
#segments = list(speech_segments)

segments = segmenter.process(audio, 0.75, 5, 0.03)


sample rate = 16000

1215.5285
Min Silence Frames at End: 1
Segment 1: start=4.71 end=8.49 bytes=120960 duration=3.78

Segment 2: start=8.73 end=12.18 bytes=110400 duration=3.45

Segment 3: start=12.75 end=15.69 bytes=94080 duration=2.94

Segment 4: start=15.69 end=20.70 bytes=160320 duration=5.01

Segment 5: start=20.70 end=24.93 bytes=135360 duration=4.23

Segment 6: start=24.93 end=26.67 bytes=55680 duration=1.74

Segment 7: start=27.60 end=32.61 bytes=160320 duration=5.01

Segment 8: start=32.61 end=37.62 bytes=160320 duration=5.01

Segment 9: start=37.80 end=42.81 bytes=160320 duration=5.01

Segment 10: start=42.81 end=43.86 bytes=33600 duration=1.05

Segment 11: start=44.25 end=53.04 bytes=281280 duration=8.79

Segment 12: start=53.31 end=58.32 bytes=160320 duration=5.01

Segment 13: start=58.32 end=63.33 bytes=160320 duration=5.01

Segment 14: start=63.33 end=68.70 bytes=171840 duration=5.37

Segment 15: start=68.82 end=73.83 bytes=160320 duration=5.01

Segment 16: start=73.83 e

In [39]:
from traceback import print_tb
import torch
import os.path
import argparse
#from huggingsound import SpeechRecognitionModel, KenshoLMDecoder

device = "cuda" if (torch.cuda.is_available()) else "cpu"
#SttModelFolder = "/Users/asrivast/Models/wav2vec2-large-xlsr-53-english/"
SttModelFolder = "/Users/asrivast/Models/wav2vec2-large-xlsr-53-spanish/"
model = SpeechRecognitionModel2(SttModelFolder, device=device)
print(model.processor.feature_extractor.sampling_rate)
useLM = False
decoder = None
if useLM == True:
    from huggingsound import KenshoLMDecoder
    LmModelFolder = SttModelFolder + "/language_model/"
    lm_path = LmModelFolder + "lm.binary"
    unigrams_path = LmModelFolder + "unigrams.txt"
    decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path, alpha=2, beta=1, beam_width=100)
    print("Finished loading Language Model")

transcripts = model.transcribeAudio(segments, 1, decoder)
#print(transcripts)

for transcript in transcripts:
    print('%s (%s-%s)\n' % (transcript['transcription'], transcript['utterance_start'], transcript['utterance_duration']))
    


04/18/2022 22:42:35 - INFO - __main__ - Loading model...
16000


  4%|██▍                                                            | 9/232 [00:34<15:56,  4.29s/it][W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
100%|█████████████████████████████████████████████████████████████| 232/232 [19:00<00:00,  4.92s/it]

voys date a coleccion for beiteene magic data (4.71-3.78)

boys data collection forbay-jin magic data (8.73-3.45)

pues la verdad es que ahora mismo (12.75-2.94)

eh todos los productos digitales que hay a mi me parece increíble  porque (15.69-5.01)

no hace tanto tiempo no teníamos ni la cuarta parte de (20.70-4.23)

de todo lo que tenemos hoy (24.93-1.74)

antiguamente un telefono era para hablar el  telefono fijo que estaba en la entrada de (27.60-5.01)

desde casa o en la sala y para conversaciones (32.61-5.01)

breves progamas como tes playaras con tus amigos ya cuelga quevo ven ena factura no (37.80-5.01)

creíble (42.81-1.05)

y ahora granadavamos todos un mundo gol telephono móvil a todos lados que sales de casa y teoridos al telefonio vás corriendo a caso a travez a buscarlo como si se tuviera olvidado (44.25-8.79)

medio cerebro sí  pero mira por ejemplo a mio parece super práctico  porque antes (53.31-5.01)

por ejemplo sólo podíamos sacar fotografías si llevabas la cámara s




## This block can save audio for each segment into files and run decoder on individual files to compare to the previous block 
from traceback import print_tb
import torch
import os.path
import argparse
from huggingsound import SpeechRecognitionModel, KenshoLMDecoder

outFolder = '/tmp/'
device = "cuda" if (torch.cuda.is_available()) else "cpu"
SttModelFolder = "/Users/asrivast/Models/wav2vec2-large-xlsr-53-english/"
model = SpeechRecognitionModel2(SttModelFolder, device=device)
print(model.processor.feature_extractor.sampling_rate)
useLM = False
decoder = None
if useLM == True:
  LmModelFolder = SttModelFolder + "/language_model/"
  lm_path = LmModelFolder + "lm.binary"
  unigrams_path = LmModelFolder + "unigrams.txt"
  decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path, alpha=2, beta=1, beam_width=100)
  print("Finished loading Language Model")

audioFilesList = []
for i, segment in enumerate(segments):
    path = outFolder
    path = path + ('chunk-%002d.wav' % (i,)) 
    print('\nWriting %s %.2f %.2f %d\n' % (path, segment.timestamp, segment.duration, len(segment.audio)))
    write_wave(path, segment.bytes, sample_rate)
    audioFilesList.append(path)

transcripts = model.transcribeFiles(audioFilesList, 1, decoder)
#print(transcripts)
#audioFilesList.clear()


for transcript in transcripts:
    print('%s\n' % (transcript['transcription']))
