In [9]:
import torch
from pyannote.audio import Pipeline
from huggingface_hub import HfApi
from pyannote.database.util import load_rttm
from pyannote.metrics.detection import DetectionErrorRate
from pyannote.core import Segment, notebook, SlidingWindowFeature
from pyannote.audio.utils.signal import Binarize
from pyannote.database import get_protocol, FileFinder
from pyannote.pipeline import Optimizer
import warnings
# Silence every warning for the rest of the session (no category or module
# restriction is given, so the "ignore" action applies to all of them).
warnings.filterwarnings(action="ignore")

In [2]:
# Load the pretrained pipeline published under the
# "pyannote/voice-activity-detection" identifier.
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection"
)

In [4]:
# Set up the dataset protocol used for tuning and evaluation.
# Each file is passed through the preprocessors below; the "audio" entry is
# a FileFinder (presumably it resolves the path of the audio file -- confirm
# against the local pyannote.database configuration).
preprocessors = {
    "audio": FileFinder(),
}
protocol = get_protocol(
    "AMI.SpeakerDiarization.only_words",
    preprocessors=preprocessors,
)

In [10]:
# Tune the pipeline hyper-parameters on the development subset.
#
# Both duration parameters are frozen at 0.0 (presumably leaving only
# "onset" and "offset" to be searched), and the search is warm-started from
# the values in `initial_params`.
pipeline.freeze({"min_duration_on": 0.0, "min_duration_off": 0.0})

initial_params = {
    "onset": 0.6,
    "offset": 0.4,
    "min_duration_on": 0.0,
    "min_duration_off": 0.0,
}

optimizer = Optimizer(pipeline)
optimizer.tune(
    list(protocol.development()),
    warm_start=initial_params,
    n_iterations=20,
    show_progress=True,
)

# Keep the best parameter set found so the evaluation cells can reuse it.
optimized_params = optimizer.best_params
print(optimized_params)


Current trial:   0%|                                                                                                                            | 0/18 [00:00<?, ?it/s][A
Current trial:   6%|██████▍                                                                                                             | 1/18 [01:29<25:16, 89.19s/it][A
Current trial:  11%|████████████▊                                                                                                      | 2/18 [04:16<36:00, 135.02s/it][A
Current trial:  17%|███████████████████▏                                                                                               | 3/18 [06:42<35:02, 140.17s/it][A
Current trial:  22%|█████████████████████████                                                                                        | 4/18 [18:49<1:26:47, 371.98s/it][A
Current trial:  28%|███████████████████████████████▍                                                                                 | 5/18 [52:

{'onset': 0.7479685379491965, 'offset': 0.5852537564722043, 'min_duration_on': 0.0, 'min_duration_off': 0.0}


In [14]:
# Apply the pipeline, instantiated with the tuned parameters, to every file
# of the test subset and collect the outputs in protocol order.
test_prediction = []
optimized_pipeline = pipeline.instantiate(optimized_params)

index = 0  # remains 0 if the test subset yields no file
for index, test_file in enumerate(protocol.test(), start=1):
    dia = optimized_pipeline(test_file)
    test_prediction.append(dia)
    print(f"Finish example {index}.")

Finish example 1.
Finish example 2.
Finish example 3.
Finish example 4.
Finish example 5.
Finish example 6.
Finish example 7.
Finish example 8.
Finish example 9.
Finish example 10.
Finish example 11.
Finish example 12.
Finish example 13.
Finish example 14.
Finish example 15.
Finish example 16.


In [15]:
# Calculate the detection error rate over the test subset.
#
# Predictions are paired with reference files purely by position, so the
# test files are materialised once and their count is checked first:
# a bare zip() would silently stop at the shorter input and report an error
# rate computed on only part of the test set (e.g. after an interrupted
# inference cell).
metric = DetectionErrorRate()
test_files = list(protocol.test())
if len(test_files) != len(test_prediction):
    raise ValueError(
        f"Got {len(test_prediction)} predictions for {len(test_files)} test files; "
        "re-run the inference cell before computing the metric."
    )

for hypothesis, reference in zip(test_prediction, test_files):
    # Each call accumulates this file's errors inside `metric`; scoring is
    # restricted to the regions given by the file's 'annotated' entry (uem).
    # The per-file return value is not needed, and it is deliberately not
    # bound to `_`, which IPython uses for the last cell output.
    metric(reference['annotation'], hypothesis, uem=reference['annotated'])

# abs(metric) yields the value aggregated over all accumulated files.
detection_error_rate = abs(metric)
print(f'Detection error rate = {detection_error_rate * 100:.1f}%')

Detection error rate = 6.8%
