# Taken From:
https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279

In [None]:
!pip install resemblyzer
!pip install pydub

In [1]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
import librosa
from pydub import AudioSegment
from spectralcluster import SpectralClusterer, RefinementOptions
from resemblyzer.audio import sampling_rate

In [9]:
#give the file path to your audio file
audio_path = 'conversation2.wav'
wav_fpath = Path(audio_path)
cut_rate = 5
# wav = preprocess_wav(wav_fpath)
wav,_ = librosa.load(wav_fpath,sr=16000)

In [10]:
def mp3_to_wav(audio_file_path):
  audio_file = Path(audio_file_path)
  if audio_file.suffix != ".mp3":
    return audio_file_path
  sound = AudioSegment.from_mp3(audio_file_path)
  output_file = audio_file.parent.joinpath(audio_file.stem+".wav")
  output_file_path = str(output_file.absolute().resolve())
  sound.export(output_file_path, format="wav")
  return output_file_path

In [11]:
audio_file_path = mp3_to_wav(audio_path)
encoder = VoiceEncoder("cpu",verbose=False)
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,min_coverage=1, return_partials=True, rate=cut_rate)
refinement_options = RefinementOptions(gaussian_blur_sigma=1,p_percentile=0.90)

In [12]:
clusterer = SpectralClusterer(min_clusters=2, max_clusters=2,refinement_options=refinement_options)
labels = clusterer.predict(cont_embeds)

In [13]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [None]:
def create_labelling(labels,wav_splits):
    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    count = 0
    # for i,time in enumerate(labels):
    #     if i>0 and labels[i]!=labels[i-1]:
    #         if count <= (cut_rate - cut_rate*0.2):
    #             for index in range(count+1):
    #                 labels[i-index-1] = labels[i]
    #         count = 0
    #         continue
    #     count += 1
    labelling = []
    start_time = 0
    
    for i,time in enumerate(times):
        if i>0 and labels[i]!=labels[i-1]:
            if count < (cut_rate - cut_rate*0.2):
                continue
            temp = [str(labels[i-1]),start_time,time]
            labelling.append(tuple(temp))
            start_time = time
            count = 0
            
        else:
            count += 1 
        
        if i==len(times)-1:
            temp = [str(labels[i]),start_time,time]
            labelling.append(tuple(temp))

    return labelling

In [14]:
labelling = create_labelling(labels,wav_splits)
labelling

[('0', 0, 9.0),
 ('1', 9.0, 21.2),
 ('0', 21.2, 24.4),
 ('0', 24.4, 26.8),
 ('1', 26.8, 31.8),
 ('1', 31.8, 34.4),
 ('0', 34.4, 41.4),
 ('1', 41.4, 44.0)]