## Speaker Diarization
Need to separate audio by speaker.
- https://www.youtube.com/watch?v=YRvf00NooN8

Pyannote Docs
- https://huggingface.co/pyannote/speaker-diarization-3.1
- https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/intro.ipynb

In [1]:
# Install necessary libraries
# !pip install pyannote.audio pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
# Import necessary libraries
import torch
from pyannote.audio import Pipeline
import wave
import contextlib
import time
from huggingface_hub import notebook_login
import os
from pydub import AudioSegment
from collections import defaultdict

In [3]:
# Define a simple timer function
def timer(label):
    """Return a context manager that prints how long its ``with`` body took.

    Args:
        label: Text printed in front of the elapsed time.

    Returns:
        A context manager. Entering it yields the manager itself, which
        exposes ``start`` (clock reading taken on entry) and, once the block
        has exited, ``elapsed`` (seconds spent inside the block).
    """
    class Timer:
        def __enter__(self):
            # perf_counter() is monotonic and high resolution, so the measured
            # interval cannot be distorted by system clock adjustments the way
            # a time.time() difference can.
            self.start = time.perf_counter()
            return self

        def __exit__(self, *args):
            # Returns None, so exceptions raised in the body still propagate.
            self.elapsed = time.perf_counter() - self.start
            print(f"{label}: {self.elapsed:.2f} seconds")

    return Timer()

In [4]:
# Login to HF
# notebook_login() is imported from huggingface_hub. Presumably the token it
# stores is what `use_auth_token=True` relies on when the pipeline is loaded
# later in this notebook -- TODO confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Load pretrained pipeline
from pyannote.audio import Pipeline

# use_auth_token=True asks the Hugging Face client to reuse the locally stored
# access token instead of passing one explicitly.
pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', use_auth_token=True)

# from_pretrained() prints a hint and returns None (rather than raising) when
# the gated model cannot be downloaded. Fail here with a clear message instead
# of hitting an AttributeError on the first use of `pipeline`.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Make sure you are "
        "logged in to Hugging Face and have accepted the user conditions of "
        "the gated pyannote models on the Hub."
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Define the audio file path
# WAV recording that the rest of the notebook diarizes and splits by speaker.
# NOTE(review): /content looks like the Colab working directory -- adjust the
# path when running elsewhere.
audio_file = "/content/elon_ted.wav"

In [7]:
# Fail fast with a readable message when the input recording is missing,
# before any model work is started.
if not os.path.exists(audio_file):
    raise FileNotFoundError(
        f"The audio file {audio_file} does not exist. "
        "Please make sure it's in the correct location."
    )

In [8]:
# Send pipeline to GPU (when available)
# Prefer CUDA whenever PyTorch can see a GPU; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipeline.to(device)

print(f"PyTorch is using: {device}")
# GPU details are only reported when the CUDA device was selected above.
if device.type == "cuda":
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

PyTorch is using: cuda
GPU Device: Tesla T4
Number of GPUs: 1


In [9]:
# Get the duration of the audio file
# Read only the WAV header fields; the reader is closed when the block exits.
with wave.open(audio_file, 'r') as wav_reader:
    frames = wav_reader.getnframes()
    rate = wav_reader.getframerate()

# Length in seconds = total frames / frames per second.
duration = frames / float(rate)

In [10]:
# Run the diarization
with timer("Speaker Diarization"):
    diarization = pipeline(audio_file, num_speakers=None, min_speakers=1, max_speakers=5)

# Print the results and, in the same pass, add up each speaker's talk time
speaker_duration = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
    speaker_duration[speaker] = speaker_duration.get(speaker, 0.0) + turn.duration

# Calculate and print speaker statistics
# Each share is measured against the length of the whole recording
# (`duration`, computed from the WAV header earlier in the notebook). The loop
# variable must not reuse that name, otherwise every share is x / x = 100%.
print("\nSpeaker Statistics:")
for speaker, speaking_time in speaker_duration.items():
    # Guard against a zero-length recording instead of dividing by zero.
    percentage = (speaking_time / duration) * 100 if duration else 0.0
    print(f"{speaker}: {speaking_time:.2f} seconds ({percentage:.2f}% of total)")

# Extract audio for each speaker
def extract_speaker_audio(diarization, audio_file, output_dir="/content"):
    """Write one WAV file per speaker containing all of that speaker's turns.

    Args:
        diarization: Diarization result. It must provide
            ``itertracks(yield_label=True)`` yielding
            ``(segment, track, speaker)`` tuples whose ``segment.start`` and
            ``segment.end`` are converted here from seconds to milliseconds.
        audio_file: Path of the source WAV file.
        output_dir: Directory that receives the ``<speaker>_audio.wav`` files.
            It is created when missing. Defaults to ``/content``, the location
            that used to be hard-coded.

    Returns:
        list: Paths of the exported files, in the order the speakers were
        first encountered.
    """
    # Load the audio file
    source_audio = AudioSegment.from_wav(audio_file)

    # One growing clip per speaker, started as an empty segment on first use
    speaker_audio = defaultdict(AudioSegment.empty)

    # Iterate through the diarization results
    for segment, _, speaker in diarization.itertracks(yield_label=True):
        # The audio object is sliced by millisecond offsets
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)

        # Append this turn to the corresponding speaker's clip
        speaker_audio[speaker] += source_audio[start_ms:end_ms]

    os.makedirs(output_dir, exist_ok=True)

    # Export audio for each speaker; `clip` keeps the loaded recording
    # (`source_audio`) from being shadowed inside the loop.
    exported_files = []
    for speaker, clip in speaker_audio.items():
        output_file = os.path.join(output_dir, f"{speaker}_audio.wav")
        clip.export(output_file, format="wav")
        print(f"Exported {output_file}")
        exported_files.append(output_file)

    return exported_files

# Run the extraction
# Uses the diarization result and the same source recording; the function
# prints one "Exported ..." line per speaker file it writes.
print("\nExtracting audio for each speaker...")
extract_speaker_audio(diarization, audio_file)

print("Audio extraction complete!")

It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


Speaker Diarization: 216.93 seconds
start=0.0s stop=21.1s speaker_SPEAKER_00
start=21.7s stop=22.1s speaker_SPEAKER_00
start=22.9s stop=24.8s speaker_SPEAKER_00
start=25.0s stop=37.8s speaker_SPEAKER_00
start=30.2s stop=30.5s speaker_SPEAKER_01
start=38.3s stop=38.9s speaker_SPEAKER_00
start=39.2s stop=39.9s speaker_SPEAKER_00
start=40.2s stop=43.9s speaker_SPEAKER_00
start=43.9s stop=52.1s speaker_SPEAKER_02
start=43.9s stop=44.1s speaker_SPEAKER_00
start=52.4s stop=52.8s speaker_SPEAKER_02
start=53.0s stop=55.5s speaker_SPEAKER_02
start=56.7s stop=57.2s speaker_SPEAKER_02
start=58.3s stop=59.9s speaker_SPEAKER_02
start=60.5s stop=60.9s speaker_SPEAKER_02
start=61.2s stop=63.2s speaker_SPEAKER_02
start=65.0s stop=67.6s speaker_SPEAKER_02
start=68.1s stop=72.9s speaker_SPEAKER_02
start=73.7s stop=74.5s speaker_SPEAKER_02
start=74.6s stop=75.0s speaker_SPEAKER_01
start=75.0s stop=76.2s speaker_SPEAKER_02
start=77.1s stop=79.0s speaker_SPEAKER_02
start=79.4s stop=82.0s speaker_SPEAKER_02