In [None]:
import sys
!{sys.executable} -m pip install -r "../../examples/audio/requirements.txt"

In [None]:
import sys
!{sys.executable} -m pip install soundfile

In [None]:
import sys
!{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html


In [None]:
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge librosa



In [4]:
import os
from torch import argmax
import sounddevice
import torch
import pyannote.audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.common.exception import ProcessFlowException
from forte.data.data_pack import DataPack
from forte.data.readers import AudioReader
from forte.pipeline import Pipeline
from forte.processors.base.pack_processor import PackProcessor

from forte.data.ontology.top import Link
from ft.onto.base_ontology import AudioUtterance, Utterance


In [5]:
class SpeakerSegmentationProcessor(PackProcessor):
    """
    An audio processor for speaker segmentation.
    """

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self._model = pyannote.audio.Pipeline.from_pretrained(
            "pyannote/speaker-segmentation"
        )

    def _process(self, input_pack: DataPack):
        output = self._model(input_pack.pack_name)
        for turn, _, speaker in output.itertracks(yield_label=True):
            audio_utter: AudioUtterance = AudioUtterance(
                pack=input_pack,
                begin=int(turn.start * input_pack.sample_rate),
                end=int(turn.end * input_pack.sample_rate)
            )
            audio_utter.speaker = speaker


class AudioUtteranceASRProcessor(PackProcessor):
    """
    An audio processor for automatic speech recognition.
    """

    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        # Initialize tokenizer and model
        pretrained_model: str = "facebook/wav2vec2-base-960h"
        self._tokenizer = Wav2Vec2Processor.from_pretrained(pretrained_model)
        self._model = Wav2Vec2ForCTC.from_pretrained(pretrained_model)

    def _process(self, input_pack: DataPack):
        required_sample_rate: int = 16000
        if input_pack.sample_rate != required_sample_rate:
            raise ProcessFlowException(
                f"A sample rate of {required_sample_rate} Hz is requied by the"
                " pretrained model."
            )

        for audio_utter in input_pack.get(AudioUtterance):

            # tokenize
            input_values = self._tokenizer(
                audio_utter.audio, return_tensors="pt", padding="longest"
            ).input_values  # Batch size 1

            # take argmax and decode
            transcription = self._tokenizer.batch_decode(
                argmax(self._model(input_values).logits, dim=-1)
            )

            if not transcription[0]:
                continue

            input_pack.set_text(text=input_pack.text + transcription[0])

            # Create annotations on audio and text utterance
            text_utter: Utterance = Utterance(
                pack=input_pack,
                begin=len(input_pack.text) - len(transcription[0]),
                end=len(input_pack.text)
            )
            text_utter.speaker = audio_utter.speaker
            Link(pack=input_pack, parent=audio_utter, child=text_utter)


In [6]:
pwd

'/l/users/bhaskar.rao/work/projects/forte/docs/notebook_tutorial'

In [7]:
#audio_path='../../audio'
audio_path='/l/users/bhaskar.rao/work/projects/forte/examples/audio'

In [8]:
# Define and config the Pipeline
pipeline = Pipeline[DataPack]()
pipeline.set_reader(AudioReader(), config={"file_ext": ".wav"})
pipeline.add(SpeakerSegmentationProcessor())
pipeline.add(AudioUtteranceASRProcessor())
pipeline.initialize()



Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<forte.pipeline.Pipeline at 0x14b55b64beb0>

In [10]:
for pack in pipeline.process_dataset(audio_path):
    for asr_link in pack.get(Link):
        audio_utter = asr_link.get_parent()
        text_utter = asr_link.get_child()
        print(f"{text_utter.speaker}: {text_utter.text}")
        print()
        #sounddevice.play(audio_utter.audio, pack.sample_rate)
        sounddevice.wait()

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


SPEAKER_00: IMG CUFVERADGE CONTINUES NOW WITH OUR THER POLITICAL REPORTE MICHAEL DOUBT N

SPEAKER_01: HE JOINS US LIFE FROM THE ALLERT CENTER WITH WHAT VOTERS THINK OF TO NIGHT'S DEBATE MICHAEL

SPEAKER_02: EL WITH MAINLY REHASHED ARGUMENTS HERE ALMOST EVERY ON THE PANEL FELT THEY COULD HAVE USED A LITTLE LESS POLITICS AS USUAL BUT ON WHO WON THE DEBATE WHILE OUR PANEL WAS SPLIT ON PRETTY MUCH PARTY LINES

