### Libraries

In [1]:
# Notebook-wide settings; edit these before running the cells below.
num_speakers = 2      # number of clusters requested from the speaker clustering step
language = 'English'  # intended language of the recording
model_size = 'tiny'   # name of the Whisper checkpoint to load

In [None]:
# Install Whisper and pyannote-audio directly from their GitHub repositories.
# NOTE(review): neither install is pinned to a tag or commit, so a later re-run
# may pull different code than the run that produced the saved outputs.
%pip install -q git+https://github.com/openai/whisper.git
%pip install -q git+https://github.com/pyannote/pyannote-audio

In [None]:
import whisper
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Pretrained speaker-embedding model; it is called once per transcript segment
# further down to turn each audio clip into an embedding vector.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still loads on CPU-only kernels instead of failing here.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [None]:
# Location of the input recording. The later cells read it with the `wave`
# module, so anything that is not already a WAV file is converted with ffmpeg.
path = "/kaggle/input/medmind/audio.mp3"
if not path.lower().endswith('.wav'):
    # Write to, and then read from, the same explicit location so the result
    # does not depend on the kernel's current working directory.
    wav_path = '/kaggle/working/audio.wav'
    # -y overwrites a stale file from an earlier run.
    # -ac 1 downmixes to mono: the cropped waveform is handed to the
    # speaker-embedding model unchanged, and that model takes single-channel
    # audio (see the pyannote reference).
    exit_code = subprocess.call(['ffmpeg', '-y', '-i', path, '-ac', '1', wav_path])
    if exit_code != 0:
        # Stop here rather than letting a later cell fail on a missing or
        # half-written file.
        raise RuntimeError(
            f"ffmpeg exited with status {exit_code} while converting {path!r}")
    path = wav_path

### OpenAI's Whisper model to separate audio into segments and generate transcripts.

In [None]:
# Load the Whisper checkpoint selected by `model_size` in the configuration cell.
model = whisper.load_model(model_size)

In [8]:
# Transcribe the recording. The language chosen in the configuration cell is
# passed through so Whisper decodes in that language instead of auto-detecting
# it; previously the `language` setting had no effect.
result = model.transcribe(path, language=language)
# Each entry carries (at least) "start", "end" and "text", which the cells
# below rely on.
segments = result["segments"]

In [9]:
# Measure the recording length so segment end times can be clamped to it.
# A Wave_read object is its own context manager, so the file is closed on exit.
with wave.open(path, 'rb') as f:
    frames, rate = f.getnframes(), f.getframerate()
# Length in seconds = number of frames / frames per second.
duration = frames / float(rate)

### Generating speaker embeddings for each segment.

In [62]:
# Audio reader used to cut individual clips out of the recording.
audio = Audio()

def segment_embedding(segment):
    """Compute the speaker embedding for one transcript segment.

    Args:
        segment: Mapping with "start" and "end" timestamps, in seconds, as
            found in the Whisper `segments` list.

    Returns:
        The output of `embedding_model` for a batch holding this single clip.
        The caller stores it in one 192-wide row, so it is presumably a
        (1, 192) array; confirm against the pyannote documentation.
    """
    start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    # The sample rate returned alongside the waveform is not needed here.
    waveform, _sample_rate = audio.crop(path, clip)
    # Add a leading batch dimension before handing the clip to the model.
    return embedding_model(waveform.unsqueeze(0))

In [None]:
# Build one 192-wide embedding row per transcript segment.
segment_count = len(segments)
embeddings = np.zeros((segment_count, 192))
for row_index in range(segment_count):
    embeddings[row_index] = segment_embedding(segments[row_index])

# Replace NaN/inf entries with finite numbers before clustering.
embeddings = np.nan_to_num(embeddings)

### Agglomerative clustering on the embeddings to identify the speaker for each segment.

In [None]:
# Group the segment embeddings into `num_speakers` clusters; every cluster is
# treated as one speaker.
clustering = AgglomerativeClustering(num_speakers)
clustering.fit(embeddings)
labels = clustering.labels_
for segment, label in zip(segments, labels):
    # Shift the cluster id by one so the displayed speaker numbers start at 1.
    segment["speaker"] = 'SPEAKER ' + str(label + 1)

In [None]:
def time(secs):
    """Convert a number of seconds into a `datetime.timedelta`.

    The value is rounded to a whole second first, so the result prints as
    H:MM:SS without a fractional part.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

# Write the transcript to disk. A header line with the speaker label and the
# segment start time is emitted whenever the speaker changes; the segment
# texts that follow are joined with single spaces.
# The `with` block closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII text intact regardless of the platform default.
with open("transcript.txt", "w", encoding="utf-8") as f:
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
        text = segment["text"]
        # Drop the single leading space when present, but never cut off a
        # real first character when the text has no leading space.
        if text.startswith(' '):
            text = text[1:]
        f.write(text + ' ')