## The whole process of diarization, speaker selection and gender prediction applied to a playlist

In [None]:
!pip install pyannote.audio

In [2]:
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm
import torch
from pyannote.audio import Pipeline
import json

import scipy
import csv
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from tensorflow.keras.models import load_model

import glob
import shutil
from pydub import AudioSegment
import librosa
from useful_funtions import extract_feature, preprocess_audio, predict_gender

# Diarization

## Utilization of Multiple Dictionaries

In this analysis, several dictionaries are employed to organize and process the audio data effectively:

- **`Diarizations{}`**: 
  - Associates each `file_name` with its corresponding diarization, which is an `Annotation` object.

- **`unique_speakers{}`**: 
  - For each `file_name`, associates a set of unique speakers identified in the audio file.

- **`durations_conferences{}`**: 
  - Maps each `file_name` to a `Duration{}` dictionary. 
  - For each speaker in the audio file, `Duration{}` associates their total spoken duration.
  - **Purpose**: This is particularly useful for determining the principal speaker in each conference.

- **`longest_segments_conferences{}`**: 
  - For each `file_name`, associates a `longest_segments{}` dictionary.
  - For each speaker in the audio file, `longest_segments{}` associates a `[duration, segment]` pair: the duration of the speaker's longest spoken segment and the segment itself.
  - **Purpose**: This is useful for extracting a reasonable subsegment for every speaker to predict their gender.


## Paths

In [None]:
# Path to the directory containing audio files
# (listed with os.listdir by the processing loops below)
audio_folder = '/kaggle/input/playlist2'

# Path to the gender prediction model
model_path ="/kaggle/input/genderrec/model.h5"

# Load the Keras model once; it is passed to predict_gender() in the
# gender prediction section.
model = load_model(model_path)

## Auxiliary functions

In [11]:
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_path, wav_path):
    """Convert an MP3 file to a 16 kHz, 16-bit PCM WAV file using ffmpeg.

    The conversion is skipped when ``wav_path`` already exists.

    Args:
        mp3_path: Path of the source MP3 file.
        wav_path: Path of the WAV file to create.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # Imported here because the notebook's import cell does not import
    # subprocess; without it the call below raised NameError.
    import subprocess

    if os.path.exists(wav_path):
        return

    cmd = ['ffmpeg', '-i', mp3_path, '-acodec', 'pcm_s16le', '-ar', '16000', wav_path]
    # check=True turns a failed conversion into an exception instead of
    # silently continuing without a WAV file.
    subprocess.run(cmd, check=True)
        
        
def delete_file(file_path):
    """Remove ``file_path`` from disk; do nothing when the path does not exist."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
                  

def extract_subsegment(source_path, start_time, end_time, output_path):
    """Cut a time range out of an MP3 file and save it as a new MP3.

    Args:
        source_path: Path of the MP3 file to read.
        start_time: Start of the excerpt, used as the lower slice bound on the
            loaded ``AudioSegment`` (pydub slices are in milliseconds).
        end_time: End of the excerpt, used as the upper slice bound.
        output_path: Path of the MP3 file to write.
    """
    clip = AudioSegment.from_mp3(source_path)[start_time:end_time]
    clip.export(output_path, format="mp3")

### Diarization and extraction of different speakers in the audio files

### Saving excerpts for each speaker in each audio file

In [None]:
# Load the pretrained diarization pipeline. The Hugging Face access token is
# read from the HF_TOKEN environment variable instead of being written in the
# notebook.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN"))

# Send the pipeline to the GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)


# Dictionaries storing the diarization results, keyed by file name
Diarizations = {}
unique_speakers = {}
durations_conferences = {}
longest_segments_conferences = {}


# Iterate over the MP3 files in the directory with a progress bar
for file in tqdm(os.listdir(audio_folder), desc="Processing audio files"):
    # Only MP3 files are diarized; anything else in the folder is ignored
    if not file.endswith('.mp3'):
        continue

    file_name = os.path.basename(file)
    mp3_path = os.path.join(audio_folder, file)
    # Temporary WAV copy, written to the working directory
    wav_path = file_name[:-3] + "wav"
    convert_mp3_to_wav(mp3_path, wav_path)
    try:
        diarization = pipeline(wav_path)
    finally:
        # Remove the temporary WAV even when diarization fails
        delete_file(wav_path)
    Diarizations[file_name] = diarization

    # Single pass over the speech turns, accumulating per speaker:
    #   Duration[speaker]        -> total spoken duration
    #   Longest_segment[speaker] -> [longest duration, corresponding segment]
    Duration = {}
    Longest_segment = {}
    for segment, _, speaker in diarization.itertracks(yield_label=True):
        duration = segment.duration
        Duration[speaker] = Duration.get(speaker, 0) + duration
        longest = Longest_segment.setdefault(speaker, [0, None])
        if duration > longest[0]:
            longest[0] = duration
            longest[1] = segment

    # Every speaker label seen in the turns above
    unique_speaker = set(Duration)

    unique_speakers[file_name] = unique_speaker
    durations_conferences[file_name] = Duration
    longest_segments_conferences[file_name] = Longest_segment

In [None]:
audio_folder = "/kaggle/input/playlist2"

# Speakers whose longest segment is not strictly longer than this (seconds)
# get no excerpt file.
MIN_EXCERPT_SECONDS = 3

# SegmentsPerFile[file_name][speaker] -> path of the excerpt saved for that speaker
SegmentsPerFile = {}
for file in tqdm(os.listdir(audio_folder), desc="Saving audios for each speaker in each audio file"):
    file_name = os.path.basename(file)
    # Only the MP3 files were diarized; skip any other entry of the folder
    # instead of failing with a KeyError.
    if file_name not in unique_speakers:
        continue

    unique_speaker = unique_speakers[file_name]
    Longest_segment = longest_segments_conferences[file_name]
    print(Longest_segment)

    # Read the audio from the same folder that was diarized
    source_path = os.path.join(audio_folder, file_name)
    SegmentsPerSpeaker = {}

    for speaker in unique_speaker:
        longest_duration, segment = Longest_segment[speaker]
        if longest_duration > MIN_EXCERPT_SECONDS:
            print(speaker, file_name, longest_duration, segment.start, segment.end)
            # The segment bounds are multiplied by 1000 because
            # extract_subsegment slices the audio in milliseconds.
            start_time = segment.start * 1000
            end_time = segment.end * 1000
            excerpt_path = file_name[:-4] + speaker + '.mp3'
            extract_subsegment(source_path, start_time, end_time, excerpt_path)
            SegmentsPerSpeaker[speaker] = excerpt_path

    SegmentsPerFile[file_name] = SegmentsPerSpeaker

# Gender prediction

In [None]:
# One record per diarized conference: principal speaker, their predicted
# gender, and the other speakers ("interruptors") with their predicted gender.
conferences_info = []

for file in os.listdir(audio_folder):
    file_name = os.path.basename(file)
    # Only the MP3 files were diarized; skip any other entry of the folder
    # instead of failing with a KeyError.
    if file_name not in unique_speakers:
        continue

    unique_speaker = unique_speakers[file_name]
    longest_segment = longest_segments_conferences[file_name]
    speakers_duration = durations_conferences[file_name]

    # No speaker detected: there is no principal speaker to pick, and max()
    # would raise ValueError on the empty dictionary.
    if not speakers_duration:
        print(f"No speaker detected in {file_name}, skipping")
        continue

    # Principal speaker is the one who talked the most
    principal_speaker = max(speakers_duration, key=speakers_duration.get)

    # Predict gender for principal speaker. As for the interruptors below, a
    # FileNotFoundError is taken to mean that no excerpt exists for this
    # speaker; the gender is then recorded as None.
    try:
        principal_speaker_gender = predict_gender(principal_speaker, file_name, model)
    except FileNotFoundError:
        principal_speaker_gender = None

    # Collect data for each conference
    conference_data = {
        "conference": file_name,
        "principal_speaker": principal_speaker,
        "principal_speaker_gender": principal_speaker_gender,
        "number_of_interruptions": len(unique_speaker) - 1,
        "interruptors": []
    }

    # Process interruptors (every speaker other than the principal one)
    for speaker in unique_speaker:
        if speaker == principal_speaker:
            continue
        try:
            gender = predict_gender(speaker, file_name, model)
        except FileNotFoundError:
            conference_data["interruptors"].append({
                "speaker": speaker,
                "error": "Interruption was too short"
            })
        else:
            conference_data["interruptors"].append({
                "speaker": speaker,
                "gender": gender
            })

    conferences_info.append(conference_data)

