# **LOAD LIBRARIES**

In [1]:
!pip install torchaudio -q --no-deps
!pip install speechbrain -q
!pip install spectralcluster -q
!pip install pyannote.metrics -q
!pip install gdown -q

[K     |████████████████████████████████| 1.9MB 32.7MB/s 
[K     |████████████████████████████████| 358kB 28.7MB/s 
[K     |████████████████████████████████| 1.2MB 43.4MB/s 
[K     |████████████████████████████████| 102kB 12.0MB/s 
[K     |████████████████████████████████| 645kB 58.4MB/s 
[K     |████████████████████████████████| 552kB 59.1MB/s 
[K     |████████████████████████████████| 51kB 6.4MB/s 
[K     |████████████████████████████████| 51kB 7.6MB/s 
[K     |████████████████████████████████| 61kB 8.7MB/s 
[K     |████████████████████████████████| 133kB 42.3MB/s 
[?25h

In [1]:
!git clone https://github.com/shashikg/speaker_diarization_ee698.git
!cp -r speaker_diarization_ee698/core/. core

Downloading...
From: https://drive.google.com/uc?id=1XltkNas1LTOck62GIDYT0WcPDPGZEo7P
To: /content/utils.py
100% 15.4k/15.4k [00:00<00:00, 22.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FWIRF2Kq6JB0fFHwjkZmVfEuGq0-SSC5
To: /content/DEC.py
100% 10.1k/10.1k [00:00<00:00, 14.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hbepzjAxHTQS5QoAlKNP9zK0nMxdjauJ
To: /content/optimumSpeaker.py
100% 7.08k/7.08k [00:00<00:00, 10.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RQkezkiXa5DVFHE7g4UKE2v8vcmgJFpm
To: /content/baselineMethods.py
100% 4.39k/4.39k [00:00<00:00, 3.92MB/s]


In [2]:
# NOTE(review): `%pylab inline` star-imports numpy and matplotlib.pyplot into
# the interactive namespace (see the message it prints), which can shadow
# builtins such as `sum` and `all`. The visible cells import `np` and `plt`
# explicitly, so `%matplotlib inline` is probably sufficient — confirm that no
# later cell relies on the bare pylab names before switching.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
import torchaudio
import os
import torch
from torch.utils.data import Dataset, DataLoader
from speechbrain.pretrained import SpeakerRecognition
import warnings
# Ignore every warning category for the remainder of the session, presumably
# to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below — consider narrowing it with `category=` / `module=`.
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans, SpectralClustering
from sklearn import decomposition
from tqdm.auto import tqdm

from core.utils import DiarizationDataSet, make_rttm, get_metrics, plot_annot
import core.baselineMethods as baselineMethods

import shutil
import pandas as pd

In [4]:
# Build the diarization dataset wrapper for the AMI corpus. As the recorded
# output shows, construction downloads the audio, the precomputed VADs and the
# precomputed x-vector features.
# NOTE(review): window_len / window_step are presumably in milliseconds —
# confirm against core.utils.DiarizationDataSet.
audio_dataset = DiarizationDataSet(
    dataset_name='ami',
    window_len=1500,
    window_step=750,
    split='full',
    use_oracle_vad=False,
)

# Number of items exposed by the dataset.
print("\nData size:", len(audio_dataset))

Downloading audio dataset...


Downloading...
From: https://drive.google.com/uc?id=1c0l9amE_0eVD1soSXvxUvJzuxzFxkn2u
To: /content/tmp.zip
668MB [00:05, 133MB/s]


Download and Extraction Complete
Downloading precomputed VADs...


Downloading...
From: https://drive.google.com/uc?id=1Hzhks79Mq9py0yPfxI_e73Nx5M-XbJp6
To: /content/tmp.zip
100%|██████████| 44.0k/44.0k [00:00<00:00, 8.88MB/s]


Download and Extraction Complete
Precomputed X-vectors exists!
Will use precomputed features...

Downloading precomputed features...


Downloading...
From: https://drive.google.com/uc?id=1HjW9caW9f3Bqp2hvz97tvLPEz_fFv01X
To: /content/tmp.zip
31.0MB [00:00, 36.0MB/s]


Download and Extraction Complete

Data size: 16


## KMeans - Oracle Number of Speakers

In [5]:
# Baseline: K-Means clustering with the oracle number of speakers (per the
# function name and the section title). The value it returns is then scored
# against the reference labels in `audio_dataset.label_dir`.
# NOTE(review): `hypothesis_dir` presumably holds the predicted RTTM files —
# confirm in core.baselineMethods.
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(
    audio_dataset,
    method="KMeans",
)
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)  # plain-text error breakdown per recording

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


        diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                             %                         %                       %                                 %                   %
item                                                                                                                                  
EN2002a                  18.99  1234.81  1117.81   90.52      117.47        9.51            77.60             6.28     39.40      3.19
EN2002b                  21.34   989.65   881.36   89.06      102.90       10.40            64.87             6.55     43.42      4.39
EN2002c                  15.73  1839.89  1699.15   92.35      148.76        8.09           104.32             5.67     36.42      1.98
EN2002d                  20.59  1214.34  1074.12   88.45      109.77        9.04            81.75             6.73     58.47      4.81
ES2004a                  36.91   607.99   487.94   80.

In [6]:
# Condense the report to its aggregate row and the four headline percentages.
# Row and columns are selected by label instead of by position (previously
# `df[-1:]` and `keys()[0/5/7/9]`), so the cell states what it shows and keeps
# working if the report's column order ever changes.
df = metric.report()
summary_cols = [
    ('diarization error rate', '%'),
    ('false alarm', '%'),
    ('missed detection', '%'),
    ('confusion', '%'),
]
dfT = df.loc[['TOTAL']]  # aggregate row; list selector keeps it a DataFrame
dfT[summary_cols]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,19.587051,9.771509,5.467412,4.34813


## Spectral Clustering - Oracle Number of Speakers

In [7]:
# Baseline: spectral clustering with the oracle number of speakers (per the
# function name and the section title). The value it returns is then scored
# against the reference labels in `audio_dataset.label_dir`.
# NOTE(review): `hypothesis_dir` presumably holds the predicted RTTM files —
# confirm in core.baselineMethods.
hypothesis_dir = baselineMethods.diarizationOracleNumSpkrs(
    audio_dataset,
    method="Spectral",
)
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)  # plain-text error breakdown per recording

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


        diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                             %                         %                       %                                 %                   %
item                                                                                                                                  
EN2002a                  18.71  1234.81  1121.22   90.80      117.47        9.51            77.60             6.28     35.99      2.91
EN2002b                  20.91   989.65   885.60   89.49      102.90       10.40            64.87             6.55     39.18      3.96
EN2002c                  15.95  1839.89  1695.13   92.13      148.76        8.09           104.32             5.67     40.44      2.20
EN2002d                  19.07  1214.34  1092.56   89.97      109.77        9.04            81.75             6.73     40.03      3.30
ES2004a                  36.69   607.99   489.27   80.

In [8]:
# Condense the report to its aggregate row and the four headline percentages.
# Row and columns are selected by label instead of by position (previously
# `df[-1:]` and `keys()[0/5/7/9]`), so the cell states what it shows and keeps
# working if the report's column order ever changes.
df = metric.report()
summary_cols = [
    ('diarization error rate', '%'),
    ('false alarm', '%'),
    ('missed detection', '%'),
    ('confusion', '%'),
]
dfT = df.loc[['TOTAL']]  # aggregate row; list selector keeps it a DataFrame
dfT[summary_cols]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,17.6524,9.771509,5.467412,2.413479


## Spectral Clustering - Number of Speakers Estimated via Eigengap

In [9]:
# Baseline: spectral clustering where the number of speakers comes from the
# eigengap instead of the oracle (per the function name and section title).
# The value it returns is then scored against the reference labels in
# `audio_dataset.label_dir`.
# NOTE(review): `hypothesis_dir` presumably holds the predicted RTTM files —
# confirm in core.baselineMethods.
hypothesis_dir = baselineMethods.diarizationEigenGapNumSpkrs(
    audio_dataset,
)
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)  # plain-text error breakdown per recording

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


        diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                             %                         %                       %                                 %                   %
item                                                                                                                                  
EN2002a                  22.43  1234.81  1075.28   87.08      117.47        9.51            77.60             6.28     81.93      6.64
EN2002b                  24.20   989.65   853.05   86.20      102.90       10.40            64.87             6.55     71.73      7.25
EN2002c                  16.74  1839.89  1680.67   91.35      148.76        8.09           104.32             5.67     54.90      2.98
EN2002d                  21.54  1214.34  1062.60   87.50      109.77        9.04            81.75             6.73     69.99      5.76
ES2004a                  32.38   607.99   515.52   84.

In [10]:
# Condense the report to its aggregate row and the four headline percentages.
# Row and columns are selected by label instead of by position (previously
# `df[-1:]` and `keys()[0/5/7/9]`), so the cell states what it shows and keeps
# working if the report's column order ever changes.
df = metric.report()
summary_cols = [
    ('diarization error rate', '%'),
    ('false alarm', '%'),
    ('missed detection', '%'),
    ('confusion', '%'),
]
dfT = df.loc[['TOTAL']]  # aggregate row; list selector keeps it a DataFrame
dfT[summary_cols]

Unnamed: 0_level_0,diarization error rate,false alarm,missed detection,confusion
Unnamed: 0_level_1,%,%,%,%
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
TOTAL,21.99261,9.771509,5.467412,6.753689
