#### Note:

It is recommended that the script be run in a Python 3 environment in Google Colab with GPU acceleration (T4) enabled. The script may take a long time to run on a CPU. Additionally, several version mismatch errors were noticed when running the script in a local environment (tested on a Windows machine). The script was tested to work fine in Google Colab.

Additionally, Google Colab has no list dependency for Pyannote. Hence, the following line should only be run if the script is being executed in a local environment:

```python
%pip install pyannote
```

In [None]:
#For Google Colab, install dependencies via the following commands:
%pip install pydub
%pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip #installs pyannote

In [None]:
import os
import torch
from pyannote.audio import Pipeline
from pydub import AudioSegment
from IPython.display import Audio, display
import json

class SpeakerDiarization:
    """Merge a directory of WAV clips into one recording and diarize it.

    The workflow is: ``merge_mp3_files_to_wav`` concatenates the clips and
    remembers where each clip ends inside the merged audio, then
    ``diatrize_speakers`` runs the pyannote pipeline on the merged file and
    maps every speaker turn back to the clip it came from.

    Attributes:
        endtime: Position (seconds) in the merged audio where the most
            recently appended clip's slot ends.
        timestamps: One boundary per merged clip, in merge order; entry
            ``k`` is the position (seconds) in the merged audio where clip
            ``k`` and its trailing silence end.
        audionames: File names of the merged clips, parallel to
            ``timestamps``.
        pipeline: The loaded pyannote speaker-diarization pipeline.
    """

    def __init__(self, auth_token=None):
        """Load the pretrained diarization pipeline.

        Args:
            auth_token: Hugging Face access token. When omitted, the
                ``HF_TOKEN`` environment variable is used; if that is unset
                too, ``None`` is passed through to the loader.

        Raises:
            RuntimeError: If the pipeline could not be loaded.
        """
        # SECURITY: an access token used to be hard-coded on this line. Any
        # token that was ever committed to source should be treated as leaked
        # and revoked; credentials are now supplied by the caller or the
        # environment instead.
        if auth_token is None:
            auth_token = os.environ.get("HF_TOKEN")

        self.endtime = 0
        self.timestamps = []
        self.audionames = []
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=auth_token)

        # NOTE(review): pyannote appears to return None (rather than raise)
        # when the gated model cannot be downloaded - confirm against the
        # installed version. Failing here gives a clearer error than a later
        # AttributeError/TypeError.
        if self.pipeline is None:
            raise RuntimeError(
                "Could not load 'pyannote/speaker-diarization-3.1'. Provide a "
                "valid Hugging Face token via auth_token or the HF_TOKEN "
                "environment variable and accept the model's user conditions.")

        if torch.cuda.is_available():
            self.pipeline.to(torch.device("cuda"))

    def merge_mp3_files_to_wav(self, mp3_dir, output_wav_file):
        """Concatenate every ``.wav`` file in ``mp3_dir`` into one WAV file.

        Despite the name, only files ending in ``.wav`` are read. The merged
        audio starts with 150 ms of silence and each clip is followed by
        300 ms of silence. Clip boundaries are recorded in
        ``self.timestamps`` / ``self.audionames`` for later lookup.

        Args:
            mp3_dir: Directory to scan (non-recursively) for ``.wav`` files.
            output_wav_file: Path of the merged WAV file to write. Missing
                parent directories are created.
        """
        # Each call builds a fresh merged file, so the bookkeeping must start
        # fresh too; otherwise boundaries from an earlier merge would linger.
        self.endtime = 0
        self.timestamps = []
        self.audionames = []

        output_abs = os.path.abspath(output_wav_file)
        merged_audio = AudioSegment.silent(duration=150)

        # Sorted so that sample ids are reproducible; os.listdir makes no
        # ordering guarantee.
        for file in sorted(os.listdir(mp3_dir)):
            if not file.endswith(".wav"):
                continue
            clip_path = os.path.join(mp3_dir, file)
            # Never feed a previous run's merged output back into the merge.
            if os.path.abspath(clip_path) == output_abs:
                continue

            merged_audio += AudioSegment.from_wav(clip_path)
            merged_audio += AudioSegment.silent(duration=300)

            # Take the boundary from the merged audio itself so it includes
            # the lead-in and inter-clip silences. Summing clip durations
            # alone drifts further from the real position with every clip.
            self.endtime = merged_audio.duration_seconds
            self.timestamps.append(self.endtime)
            self.audionames.append(file)

        output_dir = os.path.dirname(output_wav_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        merged_audio.export(output_wav_file, format="wav")

    def diatrize_speakers(self, audio_file):
        """Run diarization on ``audio_file`` and group turns by speaker.

        Args:
            audio_file: Path of the audio file handed to the pipeline,
                normally the file written by ``merge_mp3_files_to_wav``.

        Returns:
            A dict keyed by ``"speaker_<label>"``. Each value holds
            ``"gender"`` and ``"age-group"`` (both ``None``, to be annotated
            later) and ``"samples"``, a list of dicts with the turn's
            ``"start"``/``"end"``, the 1-based ``"sample_id"`` of the source
            clip and its ``"sample_file"`` name. When no clips have been
            merged, ``"sample_id"`` is 0 and ``"sample_file"`` is ``None``.
        """
        diarization = self.pipeline(audio_file)
        clip_count = len(self.timestamps)
        speaker_info = {}

        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_id = f"speaker_{speaker}"
            if speaker_id not in speaker_info:
                speaker_info[speaker_id] = {
                    "gender": None,
                    "age-group": None,
                    "samples": []
                }

            # The source clip is the first one whose boundary lies after the
            # end of the turn; turns past the last boundary fall back to the
            # last clip.
            index = next(
                (position + 1
                 for position, boundary in enumerate(self.timestamps)
                 if turn.end < boundary),
                clip_count)

            # With no merged clips index is 0; indexing audionames[-1] on an
            # empty list would raise IndexError.
            sample_file = self.audionames[index - 1] if index else None

            speaker_info[speaker_id]["samples"].append({
                "start": turn.start,
                "end": turn.end,
                "sample_id": index,
                "sample_file": sample_file
            })
        return speaker_info

# Script entry point: merge the clips in the working directory, diarize the
# merged recording, ask the user to annotate each detected speaker, and write
# the results to speaker_info.jsonl and summary.json.
if __name__ == "__main__":
    speaker_diarization = SpeakerDiarization()
    mp3_directory = "./"
    output_wav_file = "./output/merged_audio_wav.wav"

    # The export cannot write into a directory that does not exist, so make
    # sure the output folder is there before merging.
    os.makedirs(os.path.dirname(output_wav_file), exist_ok=True)

    speaker_diarization.merge_mp3_files_to_wav(mp3_directory, output_wav_file)
    speaker_info = speaker_diarization.diatrize_speakers(output_wav_file)

    # Prompt the user to annotate gender and age group for every speaker.
    for speaker_id, info in speaker_info.items():
        sample = info["samples"][0]  # Take the first sample as representative
        start, end, filename = sample["start"], sample["end"], sample["sample_file"]

        # Play the source clip the representative turn was mapped to. The
        # whole clip is played; start/end below refer to the merged audio.
        audio_path = os.path.join(mp3_directory, filename)
        display(Audio(audio_path, autoplay=True))
        print(f"-->> Representative Sample for {speaker_id} in '{filename}' from {start:.1f}s to {end:.1f}s in the merged dataset.")

        # Gender annotation; surrounding whitespace is dropped so that "m "
        # and "M" are stored identically.
        gender = input("Enter the gender (M/F): ").strip().upper()
        info["gender"] = gender

        # Age-group annotation, normalized the same way.
        age_group = input("Enter the expected age group (C [<10], T [11-18], A [19-39], R [40 - 59], E [>60]): ").strip().upper()
        info["age-group"] = age_group

    # Save speaker info as JSON Lines: one {speaker_id: info} object per line.
    with open("speaker_info.jsonl", "w", encoding="utf-8") as f:
        for speaker_id, info in speaker_info.items():
            json.dump({speaker_id: info}, f)
            f.write("\n")

    # Generate summary
    total_samples = sum(len(info["samples"]) for info in speaker_info.values())
    total_speakers = len(speaker_info)
    speakers_summary = [{
        "speaker": speaker_id,
        "gender": info["gender"],
        "age-group": info["age-group"]
    } for speaker_id, info in speaker_info.items()]

    summary = {
        "total_samples": total_samples,
        "total_speakers": total_speakers,
        "speakers": speakers_summary
    }

    # Save summary to JSON file
    with open("summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)

    print("Summary saved to summary.json")
