In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
! pip install pyannote.audio
! git clone https://github.com/pyannote/AMI-diarization-setup.git

In [3]:
! mkdir /kaggle/working/voxconverse
! mkdir /kaggle/working/voxconverse/wav
! mkdir /kaggle/working/voxconverse/wav/train
! mkdir /kaggle/working/voxconverse/wav/dev
! mkdir /kaggle/working/voxconverse/wav/test
! mkdir /kaggle/working/voxconverse/rttm
! mkdir /kaggle/working/voxconverse/rttm/train
! mkdir /kaggle/working/voxconverse/rttm/dev
! mkdir /kaggle/working/voxconverse/rttm/test
! mkdir /kaggle/working/voxconverse/uem
! mkdir /kaggle/working/voxconverse/uem/train
! mkdir /kaggle/working/voxconverse/uem/dev
! mkdir /kaggle/working/voxconverse/uem/test

In [9]:
import os 
dir_name = "/kaggle/input/voxconverse-rttm/voxconverse-master/dev/"

filenames_train_dev =  [file.split('.')[0] for file in os.listdir(dir_name)]
filenames_train = filenames_train_dev[:int(0.7*len(filenames_train_dev))]
filenames_dev = filenames_train_dev[len(filenames_train):]
filenames_test = [file.split('.')[0] for file in os.listdir("/kaggle/input/voxconverse-rttm/voxconverse-master/test")]


with open(f"/kaggle/working/voxconverse/train.speakers.txt", 'w') as f:
        f.write('\n'.join(filenames_train))

with open(f"/kaggle/working/voxconverse/dev.speakers.txt", 'w') as f:
    f.write('\n'.join(filenames_dev))

with open(f"/kaggle/working/voxconverse/test.speakers.txt", 'w') as f:
    f.write('\n'.join(filenames_test))

In [10]:
! cp /kaggle/input/voxconverse-rttm/voxconverse-master/test/* /kaggle/working/voxconverse/rttm/test
! cp /kaggle/input/voxconverse-dataset/voxconverse_test_wav/voxconverse_test_wav/* /kaggle/working/voxconverse/wav/test

In [11]:
! cp /kaggle/input/voxconverse-rttm/voxconverse-master/dev/* /kaggle/working/voxconverse/rttm/dev
! cp /kaggle/input/voxconverse-dataset/voxconverse_dev_wav/audio/* /kaggle/working/voxconverse/wav/dev

In [12]:
import shutil

def move_files(source_dir, destination_dir, filenames_train):
    for file_name in filenames_train:
        source_file_path = os.path.join(source_dir, file_name)
        destination_file_path = os.path.join(destination_dir, file_name)
        shutil.move(source_file_path, destination_file_path)

    
    
source_dir = "/kaggle/working/voxconverse/rttm/dev"
destination_dir = "/kaggle/working/voxconverse/rttm/train"
move_files(source_dir, destination_dir, [file + '.rttm' for file in filenames_train])

               
source_dir = "/kaggle/working/voxconverse/wav/dev"
destination_dir = "/kaggle/working/voxconverse/wav/train"
move_files(source_dir, destination_dir, [file + '.wav' for file in filenames_train])

In [13]:
import os

def process_rttm_file(input_file, name):
    filename = input_file.split('/')[-1].split('.')[0]
    result = [filename, "1",  "0.000", ""]
    sum_1 = 0
    sum_2 = 0
    with open(input_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            sum_1 += float(parts[3])
            sum_2 += float(parts[4])
    
    result[-1]=str(sum_2)
    
    output_file = f"/kaggle/working/voxconverse/uem/{name}/{filename}.uem"
    with open(output_file, 'w') as f:
        f.write(' '.join(result))
    

for filename in [file + '.rttm' for file in filenames_train]:
    file_path = os.path.join("/kaggle/working/voxconverse/rttm/train/", filename)
    process_rttm_file(file_path, "train")

for filename in [file + '.rttm' for file in filenames_dev]:
    file_path = os.path.join("/kaggle/working/voxconverse/rttm/dev/", filename)
    process_rttm_file(file_path, "dev")

for filename in [file + '.rttm' for file in filenames_test]:
    file_path = os.path.join("/kaggle/working/voxconverse/rttm/test/", filename)
    process_rttm_file(file_path, "test")


In [31]:
from tqdm.autonotebook import tqdm

import torch
from pyannote.audio import Pipeline
from pyannote.database import registry, FileFinder
from pyannote.metrics.diarization import DiarizationErrorRate

In [56]:
config = """Databases:
    # tell pyannote.database where to find AMI wav files.
    # {uri} is a placeholder for the session name (eg. ES2004c).
    # you might need to update this line to fit your own setup.
    VoxCeleb: 
    - /kaggle/working/voxconverse/wav/train/{uri}.wav 
    - /kaggle/working/voxconverse/wav/dev/{uri}.wav 
    - /kaggle/working/voxconverse/wav/test/{uri}.wav 

Protocols: 
    VoxCeleb: 
        SpeakerDiarization: 
            only_words:  
                train: 
                    uri: /kaggle/working/voxconverse/train.speakers.txt 
                    annotation: /kaggle/working/voxconverse/rttm/train/{uri}.rttm 
                    annotated:  /kaggle/working/voxconverse/uem/train/{uri}.uem 
                dev:  
                    uri: /kaggle/working/voxconverse/dev.speakers.txt 
                    annotation: /kaggle/working/voxconverse/rttm/dev/{uri}.rttm 
                    annotated:  /kaggle/working/voxconverse/uem/dev/{uri}.uem       
                test: 
                    uri: /kaggle/working/voxconverse/test.speakers.txt 
                    annotation: /kaggle/working/voxconverse/rttm/test/{uri}.rttm 
                    annotated: /kaggle/working/voxconverse/uem/test/{uri}.uem 
"""

file_path = "/kaggle/working/AMI-diarization-setup/pyannote/database3.yml" 

with open(file_path, 'w') as file:
    file.write(config)

In [33]:
registry.load_database("/kaggle/working/AMI-diarization-setup/pyannote/database3.yml")
preprocessors = {"audio": FileFinder()}
dataset = registry.get_protocol('VoxCeleb.SpeakerDiarization.only_words', preprocessors=preprocessors)

'VoxCeleb.SpeakerDiarization.only_words' found in /kaggle/working/AMI-diarization-setup/pyannote/database3.yml does not define the 'scope' of speaker labels (file, database, or global). Setting it to 'file'.


In [34]:
from pyannote.audio import Model
from pyannote.audio.tasks import Segmentation
from types import MethodType
from torch.optim import Adam
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichProgressBar,
)

model = Model.from_pretrained("pyannote/segmentation-3.0", use_auth_token=True).cuda()

task = Segmentation(
    dataset, 
    duration=model.specifications.duration, 
    max_num_speakers=len(model.specifications.classes), 
    batch_size=32,
    num_workers=8, 
    loss="bce", 
    vad_loss="bce"
)

model.task = task
model.prepare_data()
model.setup()

Protocol VoxCeleb.SpeakerDiarization.only_words does not precompute the output of torchaudio.info(): adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. See pyannote.database documentation on how to do that yourself.


In [35]:
from types import MethodType
from torch.optim import Adam
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichProgressBar,
)

def configure_optimizers(self):
    return Adam(self.parameters(), lr=1e-4)

model.configure_optimizers = MethodType(configure_optimizers, model)
monitor, direction = task.val_monitor

checkpoint = ModelCheckpoint(
    monitor=monitor,
    mode=direction,
    save_top_k=1,
    every_n_epochs=1,
    save_last=False,
    save_weights_only=False,
    filename="{epoch}",
    verbose=False,
)

early_stopping = EarlyStopping(
    monitor=monitor,
    mode=direction,
    min_delta=0.0,
    patience=10,
    strict=True,
    verbose=False,
)

callbacks = [RichProgressBar(), checkpoint, early_stopping]


from pytorch_lightning import Trainer
trainer = Trainer(
    accelerator="gpu", 
#     callbacks=callbacks, 
    max_epochs=20,
    gradient_clip_val=0.5
)

trainer.fit(model)

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

TypeError: An invalid dataloader was returned from `PyanNet.val_dataloader()`. Found None.