In [6]:
#  Install PyTorch with GPU support
!pip install torch torchaudio --quiet

#  Install PyAnnote for speaker diarization
!pip install pyannote.audio --quiet

#  Install Whisper for speech-to-text
!pip install openai-whisper --quiet

#  Install FFmpeg for audio processing
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [12]:
import os
import warnings

import torch
import whisper
from pyannote.audio import Pipeline

def load_hf_token(env=None):
    """Return the Hugging Face access token found in the environment, or None.

    Args:
        env: Mapping to read from; defaults to ``os.environ``.

    Returns:
        The first non-blank value of ``HUGGINGFACE_TOKEN`` or ``HF_TOKEN``
        (in that order) with surrounding whitespace removed, or ``None`` when
        neither variable holds a usable value.
    """
    if env is None:
        env = os.environ
    for key in ("HUGGINGFACE_TOKEN", "HF_TOKEN"):
        value = (env.get(key) or "").strip()
        if value:
            return value
    return None


# Hugging Face token. Secrets must never be written into the notebook itself:
# export HUGGINGFACE_TOKEN (or HF_TOKEN) before starting the kernel, or copy it
# from your platform's secret store into os.environ in an earlier cell.
# Any token that was previously committed in this cell should be revoked.
HUGGINGFACE_TOKEN = load_hf_token()
if HUGGINGFACE_TOKEN is None:
    print("No Hugging Face token found in HUGGINGFACE_TOKEN / HF_TOKEN; "
          "gated models may fail to download.")

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings("ignore")


In [23]:
# Path of the source recording (an MP3 under /kaggle/input).
# NOTE: the processing cell further down rebinds `audio_file` to the WAV file
# it produces, so this value only holds until that cell has run.
audio_file = "/kaggle/input/audio-test/input1.mp3"

In [24]:
# PyAnnote speaker diarization pipeline, downloaded from the Hugging Face Hub
# using the token configured above.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
)

# from_pretrained can hand back None (after printing a hint) instead of raising
# when the download is refused; fail here with a clear message rather than with
# an AttributeError on the next line.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that "
        "HUGGINGFACE_TOKEN is valid and that the model's user conditions "
        "have been accepted on the Hugging Face Hub."
    )

# Move pipeline to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

# Load Whisper model for transcription
whisper_model = whisper.load_model("base")

In [25]:
import json

!ffmpeg -i /kaggle/input/audio-test/input1.mp3 audio5.wav
audio_file = "audio5.wav"

# Run speaker diarization
print(" Processing audio for diarization...")
diarization = pipeline(audio_file)

# Run Whisper transcription
print(" Transcribing audio...")
transcription = whisper_model.transcribe(audio_file)

def assign_speakers(segments, turns, unknown_label="Speaker ?"):
    """Label each transcript segment with the speaker who overlaps it most.

    Args:
        segments: Iterable of mappings with "start", "end" (numbers) and
            "text" (string) keys.
        turns: Iterable of ``(start, end, speaker)`` tuples describing the
            diarization turns.
        unknown_label: Name used when no turn overlaps a segment.

    Returns:
        ``(entries, speaker_map)``: ``entries`` is a list of dicts with
        "speaker", "text", "start" and "end" keys (times rounded to one
        decimal); ``speaker_map`` maps each raw diarization label to its
        friendly "Speaker N" name, numbered in order of first appearance.
    """
    turns = list(turns)  # iterated once per segment, so it must be re-iterable
    speaker_map = {}
    entries = []

    for segment in segments:
        seg_start = segment["start"]
        seg_end = segment["end"]

        # Total overlap time between this segment and each speaker's turns.
        overlap_by_speaker = {}
        for turn_start, turn_end, speaker in turns:
            overlap = min(seg_end, turn_end) - max(seg_start, turn_start)
            if overlap > 0:
                overlap_by_speaker[speaker] = overlap_by_speaker.get(speaker, 0) + overlap

        if overlap_by_speaker:
            matched_speaker = max(overlap_by_speaker, key=overlap_by_speaker.get)
            # A label seen for the first time gets the next "Speaker N" name;
            # setdefault leaves an already-assigned name untouched.
            speaker_name = speaker_map.setdefault(
                matched_speaker, f"Speaker {len(speaker_map) + 1}"
            )
        else:
            speaker_name = unknown_label

        entries.append({
            "speaker": speaker_name,
            "text": segment["text"].strip(),
            "start": round(seg_start, 1),
            "end": round(seg_end, 1)
        })

    return entries, speaker_map


# Enumerate the diarization tracks once instead of once per transcript segment.
diarization_turns = [
    (turn.start, turn.end, speaker)
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

# json_output: list of JSON-ready entries; speaker_map: raw label -> friendly name.
json_output, speaker_map = assign_speakers(transcription['segments'], diarization_turns)

# Next unused speaker number, kept in case a later cell still reads it.
speaker_count = len(speaker_map) + 1

# Serialise the transcript once, then persist it and echo the same text.
output_json_file = "transcript.json"
transcript_text = json.dumps(json_output, indent=4)

with open(output_json_file, "w") as handle:
    handle.write(transcript_text)

print(f"\n Transcript saved as: {output_json_file}")

# Show the JSON in the cell output as well
print(transcript_text)


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab