In [2]:
!pip install faster_whisper
!pip install git+https://github.com/m-bain/whisperx.git
!pip install python-docx

# Class for audio transcription and diarization

In [2]:
import gc
import os

import whisperx
from docx import Document
from faster_whisper import WhisperModel


class Audio():
    """Transcribe and diarize one audio/video file and export the results as .docx.

    Two independent pipelines are loaded in the constructor:

    * ``transcriber`` (faster-whisper) produces timestamped text for ``transcribe()``.
    * ``diarizer1`` + ``diarizer2`` (whisperx ASR model and diarization pipeline)
      produce speaker-attributed text for ``diarize()``.

    Each pipeline writes into its own in-memory ``Document``; the ``download_*``
    methods save those documents to disk.
    """

    # Label used for segments that carry no speaker after diarization.
    _UNKNOWN_SPEAKER = "UNKNOWN"

    def __init__(self, filename, hf_token, device='cuda', compute_type=None):
        """
        :param filename: name of audio/video
        :param hf_token: HuggingFace token to access open-source models
        :param device: cuda or cpu
        :param compute_type: precision passed to ``whisperx.load_model``; when None,
            "float16" is used for cuda devices and "int8" otherwise
        """

        self.filename = filename
        # Remembered so diarize() can default to the device the models live on.
        self.device = device

        if compute_type is None:
            # float16 is a GPU precision; requesting it on a CPU device fails at
            # model load, so pick a CPU-friendly type there.
            compute_type = "float16" if str(device).startswith("cuda") else "int8"

        self.transcriber = WhisperModel("large-v2", device=device)
        self.transcribation_text = Document()

        self.diarizer1 = whisperx.load_model("large-v2", device, compute_type=compute_type)
        self.diarizer2 = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
        self.diarization_text = Document()

    def _default_docx_path(self):
        """Return ``self.filename`` with its final extension replaced by ``.docx``.

        Only the last extension is replaced, so dots elsewhere in the path
        ("./clips/a.mp4", "talk.v2.mp4") are preserved.
        """
        stem, _extension = os.path.splitext(self.filename)
        return stem + ".docx"

    @staticmethod
    def _merge_speaker_turns(segments):
        """Collapse consecutive same-speaker segments into (speaker, text) turns.

        :param segments: iterable of dicts with a "text" key and, optionally, a
            "speaker" key
        :return: list of ``(speaker, text)`` tuples in input order; segments with
            empty text are skipped, and segments without a speaker are labelled
            ``_UNKNOWN_SPEAKER``
        """
        turns = []
        for segment in segments:
            # Handled defensively: a segment may come back without a speaker.
            speaker = segment.get("speaker") or Audio._UNKNOWN_SPEAKER
            text = (segment.get("text") or "").strip()
            if not text:
                continue
            if turns and turns[-1][0] == speaker:
                turns[-1][1].append(text)
            else:
                turns.append((speaker, [text]))
        return [(speaker, " ".join(parts)) for speaker, parts in turns]

    def transcribe(self):
        """Transcribe the file, printing each timestamped segment and appending
        it as a paragraph to ``self.transcribation_text``."""
        segments, _info = self.transcriber.transcribe(self.filename)
        for segment in segments:
            line = "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
            self.transcribation_text.add_paragraph(line)
            print(line)

    def download_transcribation(self, output_path=None):
        """Save the transcription document to disk.

        :param output_path: destination .docx path; defaults to the input file
            name with its extension replaced by ".docx". Note that
            ``download_diarization`` uses the same default, so pass an explicit
            path to keep both documents.
        """
        if len(self.transcribation_text.paragraphs) == 0:
            print("The transcribation wasn't made yet. Use transcribe() first.")
            return
        self.transcribation_text.save(output_path or self._default_docx_path())

    def diarize(self, batch_size=16, device=None):
        """Transcribe, align and diarize the file, then append one paragraph per
        speaker turn ("<speaker>: <text>") to ``self.diarization_text``.

        :param batch_size: batch size passed to the whisperx ASR model
        :param device: device for the alignment model; defaults to the device
            given to the constructor
        """
        if device is None:
            device = self.device

        audio = whisperx.load_audio(self.filename)
        result = self.diarizer1.transcribe(audio, batch_size=batch_size)
        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
        # The alignment model is only needed for the call above; release it.
        del model_a
        gc.collect()

        diarize_segments = self.diarizer2(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)

        for speaker, text in self._merge_speaker_turns(result['segments']):
            self.diarization_text.add_paragraph(f'{speaker}: {text}')

    def download_diarization(self, output_path=None):
        """Save the diarization document to disk.

        :param output_path: destination .docx path; defaults to the input file
            name with its extension replaced by ".docx". Note that
            ``download_transcribation`` uses the same default, so pass an
            explicit path to keep both documents.
        """
        if len(self.diarization_text.paragraphs) == 0:
            print("The diarization wasn't made yet. Use diarize() first.")
            return
        self.diarization_text.save(output_path or self._default_docx_path())

  torchaudio.set_audio_backend("soundfile")


# Usage example

In [1]:
# Audio() requires a Hugging Face token (it is handed to the diarization pipeline).
# Read it from the HF_TOKEN environment variable instead of hardcoding it here.
my_audio = Audio('video_name.mp4', hf_token=os.environ["HF_TOKEN"])
my_audio.transcribe()
my_audio.download_transcribation()