In [1]:
#  Install PyTorch with GPU support
!pip install torch torchaudio --quiet

#  Install PyAnnote for speaker diarization
!pip install pyannote.audio --quiet

#  Install Whisper for speech-to-text
!pip install openai-whisper --quiet

#  Install FFmpeg for audio processing
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [2]:
import os
import warnings

import torch
import whisper
from pyannote.audio import Pipeline

# Hugging Face token.
# Read from the environment so the secret never lives in the notebook source or
# its saved outputs. Set HUGGINGFACE_TOKEN (or HF_TOKEN) before starting the kernel.
# NOTE(review): a literal token was previously committed on this line; it should
# be treated as compromised and revoked at https://hf.co/settings/tokens.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
if not HUGGINGFACE_TOKEN:
    # Emitted before the blanket filter below, otherwise it would be silenced.
    warnings.warn(
        "No Hugging Face token found in HUGGINGFACE_TOKEN or HF_TOKEN; "
        "loading pipelines that require authentication will fail."
    )

# Silence library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")


In [3]:
# Path of the source MP3 recording that the notebook diarizes and transcribes.
audio_file = "/kaggle/input/diarization-audio-check/audio.mp3"

In [4]:
# PyAnnote speaker diarization pipeline, downloaded with the Hugging Face token
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
)

# pyannote.audio 3.x signals a failed download (missing/invalid token, or the
# model's user conditions not accepted) by printing a notice and returning None.
# Fail here with a clear message instead of an AttributeError on `.to()` below.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that "
        "HUGGINGFACE_TOKEN is a valid access token and that the model's user "
        "conditions have been accepted on Hugging Face."
    )

# Move pipeline to GPU if available, otherwise keep it on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

# Load Whisper model for transcription ("base" checkpoint)
whisper_model = whisper.load_model("base")

In [6]:
!ffmpeg -i /kaggle/input/diarization-audio-check/audio.mp3 audio3.wav
audio_file = "audio3.wav"

#  Run speaker diarization
print(" Processing audio for diarization...")
diarization = pipeline(audio_file)

#  Run Whisper transcription
print(" Transcribing audio...")
transcription = whisper_model.transcribe(audio_file)


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [7]:
# Map PyAnnote speaker labels to friendly names ("Speaker 1", "Speaker 2", ...),
# numbered in the order each speaker is first matched to a transcript segment.
speaker_map = {}
speaker_count = 1

# Materialise the diarization turns once: they do not change between Whisper
# segments, so there is no need to re-walk itertracks() inside the loop.
speaker_turns = [
    (turn.start, turn.end, speaker)
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]


def _dominant_speaker(seg_start, seg_end, turns):
    """Return the speaker whose turns overlap [seg_start, seg_end] the longest.

    Args:
        seg_start: Segment start time.
        seg_end: Segment end time.
        turns: Iterable of (turn_start, turn_end, speaker_label) tuples.

    Returns:
        The label with the largest summed overlap, or None when no turn
        overlaps the segment at all.
    """
    speaker_overlap = {}
    for turn_start, turn_end, speaker in turns:
        overlap = min(seg_end, turn_end) - max(seg_start, turn_start)
        if overlap > 0:
            speaker_overlap[speaker] = speaker_overlap.get(speaker, 0) + overlap
    if not speaker_overlap:
        return None
    return max(speaker_overlap, key=speaker_overlap.get)


# Create and write to output text file.
# Explicit UTF-8 so non-ASCII transcript text is written the same way on every
# platform instead of depending on the locale's default encoding.
output_file = "transcript.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for segment in transcription['segments']:
        seg_start = segment['start']
        seg_end = segment['end']
        text = segment['text']

        # Find matching speaker
        matched_speaker = _dominant_speaker(seg_start, seg_end, speaker_turns)
        if matched_speaker is None:
            # No diarization turn overlaps this segment
            speaker_name = "Speaker ?"
        else:
            if matched_speaker not in speaker_map:
                speaker_map[matched_speaker] = f"Speaker {speaker_count}"
                speaker_count += 1
            speaker_name = speaker_map[matched_speaker]

        # Write to file and echo the same line to the cell output
        line = f"{speaker_name}: {text.strip()} (start={seg_start:.1f}s, end={seg_end:.1f}s)\n"
        f.write(line)
        print(line.strip())

print(f"\n Transcript saved as: {output_file}")


Speaker 1: I can surely ask it and I will let you know about this. (start=0.0s, end=3.4s)
Speaker 1: Okay. (start=3.4s, end=4.4s)
Speaker ?: Perfect. (start=4.4s, end=5.4s)
Speaker 1: Yeah, it could be done but that's not a big problem. (start=5.4s, end=10.4s)
Speaker 1: So this is so yeah, I totally understand why are you saying like you just need to go (start=10.4s, end=16.3s)
Speaker 1: down so it's better like if this specific part is from one one specific source you can (start=16.3s, end=22.2s)
Speaker 1: simply see okay this what's the source you can go to that page directly. (start=22.2s, end=26.3s)
Speaker 1: So I understand yeah that this is a good thing as well. (start=26.3s, end=32.1s)
Speaker 1: For sure. (start=32.1s, end=33.1s)
Speaker 1: Yeah, so this is for medical. (start=33.1s, end=36.0s)
Speaker 1: This is a public record. (start=36.0s, end=39.5s)
Speaker 1: And now okay, no. (start=39.5s, end=42.9s)
Speaker 1: So what I was thinking first of all I was thinking there