# **LOAD LIBRARIES**

In [1]:
!pip install torchaudio -q --no-deps
!pip install speechbrain -q
!pip install spectralcluster -q
!pip install pyannote.metrics -q
!pip install gdown -q

[K     |████████████████████████████████| 1.9MB 5.7MB/s 
[K     |████████████████████████████████| 358kB 4.5MB/s 
[K     |████████████████████████████████| 1.2MB 7.7MB/s 
[K     |████████████████████████████████| 102kB 7.4MB/s 
[K     |████████████████████████████████| 645kB 13.1MB/s 
[K     |████████████████████████████████| 552kB 18.6MB/s 
[K     |████████████████████████████████| 51kB 2.7MB/s 
[K     |████████████████████████████████| 61kB 6.0MB/s 
[K     |████████████████████████████████| 51kB 6.2MB/s 
[K     |████████████████████████████████| 133kB 21.8MB/s 
[?25h

In [1]:
!git clone https://github.com/shashikg/speaker_diarization_ee698.git
!cp -r speaker_diarization_ee698/core/. core

Downloading...
From: https://drive.google.com/uc?id=1XltkNas1LTOck62GIDYT0WcPDPGZEo7P
To: /content/utils.py
100% 15.4k/15.4k [00:00<00:00, 7.32MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FWIRF2Kq6JB0fFHwjkZmVfEuGq0-SSC5
To: /content/DEC.py
100% 10.1k/10.1k [00:00<00:00, 21.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hbepzjAxHTQS5QoAlKNP9zK0nMxdjauJ
To: /content/optimumSpeaker.py
100% 7.08k/7.08k [00:00<00:00, 14.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RQkezkiXa5DVFHE7g4UKE2v8vcmgJFpm
To: /content/baselineMethods.py
100% 4.39k/4.39k [00:00<00:00, 9.04MB/s]


In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
import torchaudio
import os
import torch
from torch.utils.data import Dataset, DataLoader
from speechbrain.pretrained import SpeakerRecognition
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans, SpectralClustering
from sklearn import decomposition
from tqdm.auto import tqdm

from core.utils import DiarizationDataSet, get_metrics, plot_annot, make_rttm
import core.baselineMethods as baselineMethods

import shutil
import pandas as pd

# Baseline over 'test' split

In [4]:
# Build the VoxConverse 'test' split. Constructing the dataset is what produces the
# download / "precomputed" messages recorded in this cell's output.
# NOTE(review): window_len / window_step are presumably milliseconds (1.5 s windows,
# 50 % overlap) — confirm against core/utils.py.
audio_dataset = DiarizationDataSet(
    dataset_name='voxconverse',
    window_len=1500,
    window_step=750,
    split='test',
    use_oracle_vad=False,
)

# Number of items in the split.
print("\nData size:", len(audio_dataset))

Dataset already downloaded!
Downloading precomputed VADs...


Downloading...
From: https://drive.google.com/uc?id=18oXqn9Zyt5tJpoEwKKztpTag-AJMQ2Sz
To: /content/tmp.zip
100%|██████████| 128k/128k [00:00<00:00, 17.7MB/s]


Download and Extraction Complete
Precomputed X-vectors exists!
Will use precomputed features...

Downloading precomputed features...


Downloading...
From: https://drive.google.com/uc?id=1-2-AZnabTtHxLmw2DBwj4PJDGwlvFa8J
To: /content/tmp.zip
69.4MB [00:00, 185MB/s]


Download and Extraction Complete

Data size: 50


## KMeans - Oracle Number of Speakers

In [5]:
# KMeans baseline with the oracle number of speakers (see section title).
# Re-seed NumPy's global RNG inside the cell so that re-running it is repeatable.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG (scikit-learn's behaviour when random_state is None) — confirm in
# core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(audio_dataset, method="KMeans")

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
ahnss                   3.29   582.22   564.90   97.03        1.84        0.32            14.95             2.57      2.37      0.41
akthc                   5.01   101.91   101.30   99.40        4.50        4.41             0.61             0.60      0.00      0.00
aufkn                  24.30   162.52   124.60   76.67        1.57        0.97             2.11             1.30     35.81     22.03
bkwns                   6.05    47.85    47.28   98.81        2.33        4.86             0.46             0.96      0.11      0.23
blwmj                   1.77   245.88   244.24   99.33        2.71  

In [6]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,17.938039,2.358409,2.029195,13.550435


## Spectral - Oracle Number of Speakers

In [7]:
# Spectral-clustering baseline with the oracle number of speakers (see section title).
# Re-seed NumPy's global RNG inside the cell so that re-running it is repeatable.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG (scikit-learn's behaviour when random_state is None) — confirm in
# core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(audio_dataset, method="Spectral")

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
ahnss                   3.43   582.22   564.08   96.88        1.84        0.32            14.95             2.57      3.19      0.55
akthc                   5.01   101.91   101.30   99.40        4.50        4.41             0.61             0.60      0.00      0.00
aufkn                   4.28   162.52   157.14   96.69        1.57        0.97             2.11             1.30      3.27      2.01
bkwns                   6.05    47.85    47.28   98.81        2.33        4.86             0.46             0.96      0.11      0.23
blwmj                   1.77   245.88   244.24   99.33        2.71  

In [8]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,11.064577,2.358409,2.029195,6.676973


## Spectral - EigenGap

In [9]:
# Spectral-clustering baseline where the number of speakers is estimated by the helper
# (eigen-gap criterion, per the section title) instead of being given.
# Re-seed NumPy's global RNG inside the cell, consistent with the other baselines.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG — confirm in core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationEigenGapNumSpkrs(audio_dataset)

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
ahnss                   5.30   582.22   553.21   95.02        1.84        0.32            14.95             2.57     14.06      2.41
akthc                  30.05   101.91    75.78   74.36        4.50        4.41             0.61             0.60     25.52     25.04
aufkn                  22.55   162.52   127.45   78.42        1.57        0.97             2.11             1.30     32.96     20.28
bkwns                  89.23    47.85     7.48   15.63        2.33        4.86             0.46             0.96     39.91     83.41
blwmj                   2.69   245.88   241.99   98.42        2.71  

In [10]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,17.760788,2.358409,2.029195,13.373184


# Baseline over 'full' split

In [11]:
# Build the VoxConverse 'full' split, replacing the 'test' split held in
# `audio_dataset`. Constructing the dataset is what produces the download /
# "precomputed" messages recorded in this cell's output.
# NOTE(review): window_len / window_step are presumably milliseconds (1.5 s windows,
# 50 % overlap) — confirm against core/utils.py.
audio_dataset = DiarizationDataSet(
    dataset_name='voxconverse',
    window_len=1500,
    window_step=750,
    split='full',
    use_oracle_vad=False,
)

# Number of items in the split.
print("\nData size:", len(audio_dataset))

Dataset already downloaded!
Downloading precomputed VADs...


Downloading...
From: https://drive.google.com/uc?id=18oXqn9Zyt5tJpoEwKKztpTag-AJMQ2Sz
To: /content/tmp.zip
100%|██████████| 128k/128k [00:00<00:00, 20.0MB/s]


Download and Extraction Complete
Precomputed X-vectors exists!
Will use precomputed features...

Downloading precomputed features...


Downloading...
From: https://drive.google.com/uc?id=1-2-AZnabTtHxLmw2DBwj4PJDGwlvFa8J
To: /content/tmp.zip
69.4MB [00:00, 154MB/s] 


Download and Extraction Complete

Data size: 216


## KMeans - Oracle Number of Speakers

In [12]:
# KMeans baseline with the oracle number of speakers (see section title).
# Re-seed NumPy's global RNG inside the cell so that re-running it is repeatable.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG (scikit-learn's behaviour when random_state is None) — confirm in
# core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(audio_dataset, method="KMeans")

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
abjxc                   2.17    62.10    61.71   99.37        0.95        1.54             0.39             0.63      0.00      0.00
afjiv                   9.48   116.64   115.96   99.42       10.38        8.90             0.68             0.58      0.00      0.00
ahnss                   3.29   582.22   564.90   97.03        1.84        0.32            14.95             2.57      2.37      0.41
aisvi                  42.74   430.98   261.55   60.69       14.78        3.43             1.35             0.31    168.08     39.00
akthc                   5.01   101.91   101.30   99.40        4.50  

In [13]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,17.150973,2.507069,2.417944,12.22596


## Spectral - Oracle Number of Speakers

In [14]:
# Spectral-clustering baseline with the oracle number of speakers (see section title).
# Re-seed NumPy's global RNG inside the cell so that re-running it is repeatable.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG (scikit-learn's behaviour when random_state is None) — confirm in
# core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(audio_dataset, method="Spectral")

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
abjxc                   2.17    62.10    61.71   99.37        0.95        1.54             0.39             0.63      0.00      0.00
afjiv                  16.82   116.64   107.40   92.08       10.38        8.90             0.68             0.58      8.56      7.34
ahnss                   3.47   582.22   563.83   96.84        1.84        0.32            14.95             2.57      3.44      0.59
aisvi                   3.93   430.98   428.81   99.50       14.78        3.43             1.35             0.31      0.82      0.19
akthc                   5.01   101.91   101.30   99.40        4.50  

In [15]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,10.254304,2.507069,2.417944,5.329291


## Spectral - EigenGap

In [16]:
# Spectral-clustering baseline where the number of speakers is estimated by the helper
# (eigen-gap criterion, per the section title) instead of being given.
# Re-seed NumPy's global RNG inside the cell, consistent with the other baselines.
# NOTE(review): this only takes effect if the helper's clusterer draws from NumPy's
# global RNG — confirm in core/baselineMethods.py; it is harmless otherwise.
np.random.seed(0)
hypothesis_dir = baselineMethods.diarizationEigenGapNumSpkrs(audio_dataset)

# Score the produced hypotheses against the dataset's reference labels and print the
# per-recording table.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
abjxc                  85.84    62.10     9.75   15.70        0.95        1.54             0.39             0.63     51.96     83.67
afjiv                   9.58   116.64   115.85   99.32       10.38        8.90             0.68             0.58      0.11      0.09
ahnss                   5.30   582.22   553.21   95.02        1.84        0.32            14.95             2.57     14.06      2.41
aisvi                  14.18   430.98   384.66   89.25       14.78        3.43             1.35             0.31     44.97     10.43
akthc                  30.05   101.91    75.78   74.36        4.50  

In [17]:
# Condensed summary: keep only the last row of the report (the aggregate "TOTAL" row,
# as shown in the output) and the percentage columns for the overall error rate and
# its three components.
df = metric.report()
dfT = df.iloc[-1:]
# Columns are addressed by their (name, unit) labels instead of by position, so the
# selection stays correct and readable even if the report's column order changes.
dfT[[
    ("diarization error rate", "%"),
    ("false alarm", "%"),
    ("missed detection", "%"),
    ("confusion", "%"),
]]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,16.982875,2.507069,2.417944,12.057862
