### Import libraries and create custom functions

In [None]:
import whisper
import json
import os
from IPython.display import Audio, display, HTML
from pydub import AudioSegment
from pytube import YouTube
from moviepy.editor import *
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import pandas as pd
import random
from simple_diarizer.diarizer import Diarizer
from simple_diarizer.utils import (check_wav_16khz_mono, convert_wavfile,
                                   waveplot, combined_waveplot, waveplot_perspeaker)
import tempfile
from pprint import pprint
import soundfile as sf
from tqdm.autonotebook import tqdm


def split_into_sentences(segments, terminators=('.', '?', '!')):
    """Group transcription segments into sentences.

    A sentence is closed whenever a segment's text, ignoring trailing
    whitespace, ends with one of ``terminators``. Segments left over after
    the last terminator form a final, unterminated sentence.

    Args:
        segments: Iterable of mappings with ``'text'``, ``'start'`` and
            ``'end'`` keys.
        terminators: Sentence-ending strings. Defaults to ``.``, ``?`` and
            ``!``; pass ``'.'`` to split on full stops only.

    Returns:
        A list of sentences, each a list of ``(text, start, end)`` tuples in
        their original order. The text is stored unmodified.
    """
    # str.endswith only accepts a str or a tuple of str.
    terminators = tuple(terminators)

    sentences = []
    current_sentence = []
    for segment in segments:
        text = segment['text']
        current_sentence.append((text, segment['start'], segment['end']))
        # Ignore trailing whitespace so "word. " still closes the sentence.
        if text.rstrip().endswith(terminators):
            sentences.append(current_sentence)
            current_sentence = []
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

def segment_audio(filename, sentences):
    """Cut the audio in ``filename`` into one clip per sentence.

    Each sentence is a sequence of ``(text, start, end)`` tuples with times
    in seconds. A clip runs from the start of the sentence's first entry to
    the end of its last entry.

    Returns:
        A list of sliced ``AudioSegment`` objects, one per sentence.
    """
    recording = AudioSegment.from_file(filename)

    def to_ms(seconds):
        # AudioSegment slicing is done in milliseconds.
        return int(seconds * 1000)

    return [
        recording[to_ms(sentence[0][1]):to_ms(sentence[-1][2])]
        for sentence in sentences
    ]

def save_audio_segment(segment, filename):
    """Write ``segment`` to ``filename`` in WAV format via its ``export`` method."""
    wav_format = "wav"
    segment.export(filename, format=wav_format)

# Speech-feature names, in the order they are paired with the values that
# follow the first element unpacked from ``mysp.mysptotal``.
_VOICE_FEATURE_NAMES = (
    "number_of_syllables", "number_of_pauses", "rate_of_speech",
    "articulation_rate", "speaking_duration", "original_duration", "balance",
    "f0_mean", "f0_std", "f0_median", "f0_min", "f0_max",
    "f0_quantile25", "f0_quan75",
)


def _extract_voice_features(mysp, name, directory):
    """Return gender, emotion and speech features for one wav file.

    ``name`` is the file name without the ``.wav`` suffix. Any failure in the
    analysis library is reported and every field falls back to "Unknown", so
    each result dict always carries the full set of keys.
    """
    try:
        gender, emotion = mysp.myspgend(name, directory)
        _dataset, *feature_values = mysp.mysptotal(name, directory)
    except Exception as err:  # best effort: one bad clip must not abort the run
        print(f"Voice analysis failed for '{name}': {err}")
        gender = emotion = "Unknown"
        # One placeholder per feature name; zip() would silently drop the
        # trailing keys if this list were shorter than the name list.
        feature_values = ["Unknown"] * len(_VOICE_FEATURE_NAMES)

    return {
        "gender": gender,
        "emotion": emotion,
        **dict(zip(_VOICE_FEATURE_NAMES, feature_values)),
    }


def analyze_audio(filename, analyze_by_sentence=False):
    """Transcribe a wav file and extract voice features from it.

    The file is transcribed with the Whisper "base" model and the transcript
    is printed. Features are then computed either for the whole file or, when
    ``analyze_by_sentence`` is true, separately for each sentence of the
    transcript.

    Args:
        filename: Path to the wav file to analyze.
        analyze_by_sentence: Analyze each transcript sentence on its own
            instead of the whole recording.

    Returns:
        A list of dicts. Per-sentence mode yields one dict per sentence with
        a ``"sentence"`` key. Whole-file mode yields a single dict with
        ``"filename"`` and ``"text"`` keys. Every dict also holds
        ``"gender"``, ``"emotion"`` and the speech-feature keys; values are
        "Unknown" when the analysis failed.
    """
    # The package name contains hyphens, so a regular import statement
    # cannot be used.
    mysp = __import__("my-voice-analysis")

    model = whisper.load_model("base")
    result = model.transcribe(filename)
    text = result["text"]
    print('Transcribed text: ', text)
    print('')

    base_name = os.path.basename(filename)
    # Drop only a trailing ".wav"; str.replace would also remove a ".wav"
    # that occurs in the middle of the name.
    stem = base_name[:-len('.wav')] if base_name.endswith('.wav') else base_name
    directory = os.path.dirname(filename)

    if not analyze_by_sentence:
        return [{
            "filename": filename,
            "text": text,
            **_extract_voice_features(mysp, stem, directory),
        }]

    sentences = split_into_sentences(result["segments"])
    audio_segments = segment_audio(filename, sentences)

    analyses = []
    for i, (segment, sentence) in enumerate(zip(audio_segments, sentences)):
        temp_stem = f"temp_segment_{i}"
        temp_filename = os.path.join(directory, temp_stem + ".wav")
        save_audio_segment(segment, temp_filename)
        try:
            features = _extract_voice_features(mysp, temp_stem, directory)
        finally:
            # Remove the temporary clip even if something unexpected escapes.
            os.remove(temp_filename)

        analyses.append({
            "sentence": ' '.join([word[0] for word in sentence]),
            **features,
        })

    return analyses

def sanitize_filename(filename):
    """Make ``filename`` filesystem-friendly.

    Spaces become underscores; every remaining character that is not an
    ASCII letter, a digit, ``-``, ``_`` or ``.`` is dropped.
    """
    underscored = filename.replace(" ", "_")
    return re.sub(r'[^a-zA-Z0-9\-_\.]', '', underscored)

def download_youtube_audio(url, output_path="downloaded_audio"):
    """Download a YouTube video's audio track and convert it to WAV.

    The audio-only stream is downloaded into ``output_path`` and re-encoded
    as 44.1 kHz, 16-bit PCM. The WAV file is named after the sanitized video
    title. The original download is deleted unless it was already a
    ``.wav`` file.

    Args:
        url: URL of the YouTube video.
        output_path: Directory for the files; created if missing.

    Returns:
        Path of the resulting WAV file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)

    # Download the audio-only stream
    yt = YouTube(url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {url}")
    audio_file = audio_stream.download(output_path=output_path)

    # Sanitize filename
    sanitized_filename = sanitize_filename(yt.title) + '.wav'
    wav_filepath = os.path.join(output_path, sanitized_filename)

    audio_clip = AudioFileClip(audio_file)
    try:
        audio_clip.write_audiofile(wav_filepath, fps=44100, nbytes=2, codec='pcm_s16le')
    finally:
        # Release the reader so the source file can be deleted below
        # (an open handle blocks os.remove on Windows).
        audio_clip.close()

    # Remove the original download (if it's not a wav file)
    if not audio_file.endswith('.wav'):
        os.remove(audio_file)

    print(f"Audio downloaded and converted to WAV: {wav_filepath}")
    return wav_filepath


def shorten_sentence(sentence):
    """Abbreviate a long sentence for use as a plot label.

    Sentences of more than 20 whitespace-separated words are reduced to
    their first five and last five words joined by `` ... ``. Shorter
    sentences are returned unchanged.
    """
    tokens = sentence.split()
    if len(tokens) <= 20:
        return sentence
    head = ' '.join(tokens[:5])
    tail = ' '.join(tokens[-5:])
    return head + ' ... ' + tail

    
def freq_plot(analysis_results):
    """Plot the median F0 of every sentence with its interquartile span.

    Each row of ``analysis_results`` becomes one horizontal entry: a dot at
    ``f0_median`` and a faint line from ``f0_quantile25`` to ``f0_quan75``.
    Entries are colored by ``emotion`` with a randomly drawn color per
    emotion. The y axis is labeled with the shortened sentence text.
    """
    frame = pd.DataFrame(analysis_results)

    # Non-numeric entries (e.g. "Unknown") are coerced to NaN.
    f0_columns = ['f0_mean', 'f0_std', 'f0_median', 'f0_min', 'f0_max', 'f0_quantile25', 'f0_quan75']
    frame[f0_columns] = frame[f0_columns].apply(pd.to_numeric, errors='coerce')

    # One random hex color per distinct emotion.
    hex_digits = '0123456789ABCDEF'
    emotion_colors = {}
    for emotion in frame['emotion'].unique():
        emotion_colors[emotion] = "#" + ''.join([random.choice(hex_digits) for _ in range(6)])

    # Figure height grows with the number of sentences.
    plt.figure(figsize=(15, max(6, len(frame) / 2)))

    for position, record in frame.iterrows():
        shade = emotion_colors[record['emotion']]
        plt.plot(record['f0_median'], position, 'o', color=shade)
        plt.hlines(position, record['f0_quantile25'], record['f0_quan75'], color=shade, alpha=0.3)

    # Long sentences are abbreviated so the tick labels stay readable.
    frame['sentence_short'] = frame['sentence'].apply(shorten_sentence)

    plt.yticks(ticks=frame.index, labels=frame['sentence_short'])
    plt.title("Median Fundamental Frequency (F0) with Confidence Intervals for Each Sentence")
    plt.ylabel("Sentences")
    plt.xlabel("Frequency (Hz)")

    # Legend mapping each emotion to its color, placed outside the axes.
    legend_handles = []
    for emotion, shade in emotion_colors.items():
        legend_handles.append(Line2D([0], [0], color=shade, lw=4, label=emotion))
    plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

def heat_plot(analysis_results):
    """Draw a heatmap of the numeric speech features for each sentence.

    Rows are sentences (labeled with their shortened text) and columns are
    the numeric features. Sentences with any missing or non-numeric feature
    value are left out. If none remain, a message is printed and nothing is
    drawn.

    Args:
        analysis_results: List of per-sentence result dicts containing a
            ``"sentence"`` key and the feature keys listed below.
    """
    df = pd.DataFrame(analysis_results)

    # Non-numeric entries (e.g. "Unknown") are coerced to NaN.
    numeric_cols = ['number_of_syllables', 'number_of_pauses', 'rate_of_speech',
                    'articulation_rate', 'speaking_duration', 'f0_mean', 'f0_std',
                    'f0_median', 'f0_min', 'f0_max', 'f0_quantile25', 'f0_quan75']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    df['sentence_short'] = df['sentence'].apply(shorten_sentence)

    # Keep only rows where every feature is available.
    heatmap_data = df[numeric_cols].dropna()
    if heatmap_data.empty:
        print("No sentences with complete numeric features to plot.")
        return

    # Take labels from the surviving rows only; labeling with every sentence
    # would misalign them whenever dropna() removed a row.
    row_labels = df.loc[heatmap_data.index, 'sentence_short'].tolist()

    # Create the heatmap
    plt.figure(figsize=(len(heatmap_data.columns), 15))  # Adjust figure size as needed
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='g', yticklabels=row_labels)  # 'g' format to avoid scientific notation
    plt.title("Heatmap of Fundamental Frequencies and Speech Features")
    plt.ylabel("Sentences")
    plt.xlabel("Features")
    plt.xticks(rotation=90)
    plt.show()
    
    

In [None]:
import os
from pytube import YouTube
from moviepy.editor import AudioFileClip

def sanitize_filename(filename):
    """Keep only alphanumeric characters, spaces, underscores and hyphens.

    All other characters are dropped and trailing whitespace is stripped.
    """
    allowed_extras = (' ', '_', '-')
    kept = [ch for ch in filename if ch.isalnum() or ch in allowed_extras]
    return "".join(kept).rstrip()

def download_youtube_content(url, download_type="audio", output_path="downloads"):
    """Download a YouTube video's audio (as WAV) or the video itself (as MP4).

    Args:
        url: URL of the YouTube video.
        download_type: ``"audio"`` downloads the audio-only stream and
            converts it to 44.1 kHz, 16-bit PCM WAV. ``"video"`` downloads
            the highest-resolution progressive MP4 stream.
        output_path: Directory for the files; created if missing.

    Returns:
        Path of the resulting file, named after the sanitized video title.

    Raises:
        ValueError: If ``download_type`` is not ``"audio"`` or ``"video"``,
            or if the video offers no matching stream.
    """
    # Reject unsupported modes up front, before any filesystem or network
    # work, instead of silently returning None at the end.
    if download_type not in ("audio", "video"):
        raise ValueError(
            f"Unsupported download_type {download_type!r}; expected 'audio' or 'video'"
        )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)

    # Download YouTube content
    yt = YouTube(url)

    if download_type == "audio":
        # Download only audio
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise ValueError(f"No audio-only stream available for {url}")
        audio_file = audio_stream.download(output_path=output_path)

        # Convert to WAV (if necessary)
        sanitized_filename = sanitize_filename(yt.title) + '.wav'
        wav_filepath = os.path.join(output_path, sanitized_filename)

        audio_clip = AudioFileClip(audio_file)
        try:
            audio_clip.write_audiofile(wav_filepath, fps=44100, nbytes=2, codec='pcm_s16le')
        finally:
            # Release the reader so the source file can be deleted below
            # (an open handle blocks os.remove on Windows).
            audio_clip.close()

        # Remove the original download (if it's not a wav file)
        if not audio_file.endswith('.wav'):
            os.remove(audio_file)

        print(f"Audio downloaded and converted to WAV: {wav_filepath}")
        return wav_filepath

    # download_type == "video": download video with audio
    video_stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    if video_stream is None:
        raise ValueError(f"No progressive MP4 stream available for {url}")
    video_file = video_stream.download(output_path=output_path)
    sanitized_filename = sanitize_filename(yt.title) + '.mp4'
    sanitized_filepath = os.path.join(output_path, sanitized_filename)
    # os.replace overwrites an earlier download on every platform, whereas
    # os.rename raises on Windows when the target already exists.
    os.replace(video_file, sanitized_filepath)

    print(f"Video downloaded: {sanitized_filepath}")
    return sanitized_filepath

# Example: download one video as MP4 into the default output folder.
download_youtube_content(
    url="https://www.youtube.com/watch?v=rKX4oOXQ2j0",
    download_type="video",
)


## Prepare your audio

You can use your own audio. However, your audio files must be in WAV (`*.wav`) format, recorded at a 44 kHz sampling rate with 16-bit resolution.

You also have the option to download the audio from YouTube. This can be a recording of someone either speaking or singing.

Please select audio that is less than 10 minutes long.

### Download YouTube audio

In [None]:
from pydub import AudioSegment

def trim_wav_file(input_file_path, start_time_s, end_time_s=None):
    """
    Save a trimmed copy of a .wav file next to the original.

    The copy starts at start_time_s and, when end_time_s is given, stops
    there; otherwise it runs to the end of the audio. The new file name is
    the input name with the trim times appended before the extension, and
    it is printed once the file has been written.

    Args:
    input_file_path (str): Path to the input .wav file.
    start_time_s (int): Start time in seconds to begin trimming.
    end_time_s (int, optional): End time in seconds to stop trimming. If None, the audio is kept through to its end.
    """
    source_audio = AudioSegment.from_wav(input_file_path)

    # AudioSegment slices are expressed in milliseconds.
    start_ms = start_time_s * 1000

    if end_time_s is None:
        clip = source_audio[start_ms:]
        suffix = ""
    else:
        clip = source_audio[start_ms:end_time_s * 1000]
        suffix = f"_to_{end_time_s}s"

    # Insert the timestamps between the original stem and extension.
    stem, extension = os.path.splitext(input_file_path)
    output_path = f"{stem}_{start_time_s}s{suffix}{extension}"

    clip.export(output_path, format="wav")

    print(f"Trimmed audio saved to {output_path}")

In [None]:
# Videos whose audio should be downloaded; replace or extend this list.
youtube_urls = [
    "https://youtu.be/InR69mIZMzA",
    "https://www.youtube.com/watch?v=uAPUkgeiFVY",
    "https://www.youtube.com/watch?v=Ez-L0qW9iGQ",
]
for url in youtube_urls:
    download_youtube_audio(url)

### Separate vocals from instruments

If you are using audio from a song, you can separate the vocals from the instruments using Spleeter.

In [None]:
# Folder that receives the separated output.
output_folder = './downloaded_audio/'
# Song to separate.
input_audio = './downloaded_audio/The_Heart_Part_5.wav'
# IPython shell escape: {output_folder} and {input_audio} are filled in from
# the Python variables above before the command runs.
!spleeter separate -o {output_folder} {input_audio}

### You can trim the wav file here:

In [None]:
# Drop the first 5 seconds and keep the rest of the track.
trim_wav_file(
    input_file_path='./downloaded_audio/The_Heart_Part_5.wav',
    start_time_s=5,
)

### Playback audio
If the audio file is too long, this audio plug-in will not work.

In [None]:
# Path of the wav file used by the playback and analysis cells below.
# With backslashes, prefix the string with r (raw string) so they are not
# treated as escape characters; with forward slashes no prefix is needed.
file_path = r".\downloaded_audio\Justin_Trudeau_says_he_does_not_remember_how_many_times_he_wore_blackface.wav"

In [None]:
# Show an inline audio player for the selected file.
Audio(filename=file_path)

## Analysis Time

In [None]:
# True analyzes the transcript sentence by sentence; False analyzes the
# whole audio file in one pass.
analyze_by_sentence = True
analysis_results = analyze_audio(
    file_path,
    analyze_by_sentence=analyze_by_sentence,
)
analysis_results

In [None]:
# Median F0 per sentence, colored by emotion.
freq_plot(analysis_results=analysis_results)

In [None]:
# Heatmap of the numeric speech features per sentence.
heat_plot(analysis_results=analysis_results)

In [None]:
# Build the speaker diarizer.
# NOTE(review): 'xvec' / 'sc' presumably select x-vector embeddings and
# spectral clustering — confirm against the simple_diarizer documentation.
diarization = Diarizer(embed_model='xvec', cluster_method='sc')
# Diarize the selected file, assuming exactly two speakers.
segments = diarization.diarize(file_path, num_speakers=2)
# Display the resulting segments as the cell output.
segments   

In [None]:
# Load the audio with soundfile, then plot it together with the diarization
# segments computed in the previous cell.
# NOTE(review): fs is presumably the sample rate returned by sf.read — confirm.
signal, fs = sf.read(file_path)
combined_waveplot(signal, fs, segments)
plt.show()

### Speaker Similarity

#### Enroll stage

The enroll workflow requires two parameters: a unique numeric id that must be 9 characters long, and the path to a wav or flac file of the user's voice. Below is the required syntax and format for this stage.


In [None]:
# IPython shell escape: enroll the voice in the given recording under id 123456789.
# The audio path is machine-specific; point it at your own wav/flac file.
!python -m speaker_verification enroll --id 123456789 --audio-path C:/Users/ahnji/OneDrive/Documents/Prototypes/PietZwart/downloaded_audio/Prime_Minister_Trudeaus_message_on_Remembrance_Day.wav

### Validate stage

The validate workflow retrieves a user enrollment based on the given id parameter and then uses the --audio-path input to accept an audio file as speaker input to verify against that user enrollment.

In [None]:
# IPython shell escape: validate a recording against the enrollment stored under id 123456789.
# NOTE(review): --audio-path ends at the directory rather than an audio file;
# it looks like the file name still needs to be appended — confirm before running.
!python -m speaker_verification validate --id 123456789 --audio-path C:/Users/ahnji/OneDrive/Documents/Prototypes/PietZwart/downloaded_audio/