### Import Libraries

In [1]:
import os
import librosa
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
import librosa
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

### Constants

In [2]:
# Root folder of the audio dataset; it is walked recursively when the metadata is extracted.
audio_dir = './AUDIO/'

### Loading and Analyzing Audio Metadata

In [3]:
def extract_audio_metadata(directory):
    """
    Extracts metadata from audio files in a directory.

    The directory is walked recursively. Every file whose name ends in
    ``.wav`` (compared case-insensitively) is loaded with ``librosa.load``
    using ``sr=None`` and ``mono=False`` so that the file's own sample rate
    and channel layout are reported rather than librosa's defaults. The label
    of a file is the name of the folder that directly contains it.

    Args:
        directory (str): Path to directory containing audio files.
    Returns:
        pd.DataFrame: DataFrame containing metadata of audio files, one row
            per file, with the columns ``file_path``, ``filename``, ``label``,
            ``sample_rate``, ``num_channels`` and ``duration``. The columns
            are present even when no audio file is found, so downstream
            column lookups do not fail on an empty dataset.
    """
    columns = ['file_path', 'filename', 'label', 'sample_rate', 'num_channels', 'duration']
    metadata = []
    for dirpath, _, filenames in os.walk(directory):
        # The label only depends on the folder, so compute it once per folder.
        # normpath + basename copes with both '/' and the platform separator
        # and with a trailing separator on the root directory.
        label = os.path.basename(os.path.normpath(dirpath))
        for filename in filenames:
            if not filename.lower().endswith('.wav'):
                continue
            file_path = os.path.join(dirpath, filename)
            signal, sr = librosa.load(file_path, sr=None, mono=False)
            duration = librosa.get_duration(y=signal, sr=sr)
            # A 1-D signal is a single channel; otherwise the first axis is
            # taken as the channel count.
            num_channels = signal.shape[0] if signal.ndim > 1 else 1
            metadata.append({
                'file_path': file_path,
                'filename': filename,
                'label': label,
                'sample_rate': sr,
                'num_channels': num_channels,
                'duration': duration
            })
    return pd.DataFrame(metadata, columns=columns)

# Build the per-file metadata table for the whole dataset and preview the first rows.
metadata_df = extract_audio_metadata(directory=audio_dir)
display(metadata_df.head(n=5))
# Persist the table as the annotations file (Excel workbook, row index omitted).
metadata_df.to_excel(excel_writer='annotations_file.xlsx', index=False)

Unnamed: 0,file_path,filename,label,sample_rate,num_channels,duration
0,./AUDIO/FAKE\biden-to-linus.wav,biden-to-linus.wav,FAKE,44100,2,600.0
1,./AUDIO/FAKE\biden-to-margot.wav,biden-to-margot.wav,FAKE,44100,2,600.0
2,./AUDIO/FAKE\biden-to-musk.wav,biden-to-musk.wav,FAKE,44100,2,600.0
3,./AUDIO/FAKE\biden-to-Obama.wav,biden-to-Obama.wav,FAKE,40000,2,600.0
4,./AUDIO/FAKE\biden-to-ryan.wav,biden-to-ryan.wav,FAKE,44100,2,600.0


In [21]:
def plot_metadata_distribution(metadata_df):
    """
    Shows an overview of the audio metadata as a 3x2 grid of Plotly charts.

    Panels (row, column):
        (1, 1) number of files per sample rate
        (1, 2) number of files per channel count
        (2, 1) number of files per duration, rounded to whole seconds
        (2, 2) intentionally left empty
        (3, 1) total duration per class label, as a bar chart
        (3, 2) total duration per class label, as a pie chart

    Args:
        metadata_df (pd.DataFrame): Metadata table with at least the columns
            'label', 'sample_rate', 'num_channels' and 'duration'.
    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Total duration per class. Only 'duration' is aggregated: the table also
    # holds string columns (file_path, filename) that have no meaningful sum.
    class_distr = metadata_df.groupby('label')[['duration']].sum()

    # Each distribution is computed once and reused for both x and y.
    sample_rate_counts = metadata_df['sample_rate'].value_counts()
    duration_counts = metadata_df['duration'].round().value_counts().sort_index()

    # Always show mono (1) and stereo (2), even with a zero count, but keep
    # any other channel count that actually occurs in the data.
    channels_counts = metadata_df['num_channels'].value_counts()
    channel_values = sorted(set(channels_counts.index) | {1, 2})
    channels_counts = channels_counts.reindex(channel_values, fill_value=0)

    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            "Sample Rate Distribution (Bar Chart)", "Number of Channels Distribution (Bar Chart)",
            "Duration Distribution (Bar Chart)", "",
            "Class Distribution (Bar Chart)", "Class Distribution (Pie Chart)"
        ),
        specs=[
            [{"type": "bar"}, {"type": "bar"}],
            [{"type": "bar"}, {}],
            [{"type": "bar"}, {"type": "pie"}],
        ],
        column_widths=[0.5, 0.5]
    )

    # Plot sample rate distribution
    fig.add_trace(
        go.Bar(x=sample_rate_counts.index, y=sample_rate_counts, name='Sample Rate'),
        row=1, col=1
    )

    # Plot number of channels distribution
    fig.add_trace(
        go.Bar(x=channels_counts.index, y=channels_counts, name='Number of Channels'),
        row=1, col=2
    )

    # Plot duration distribution
    fig.add_trace(
        go.Bar(x=duration_counts.index, y=duration_counts, name='Duration'),
        row=2, col=1
    )

    # Plot class distribution - bar chart.
    # The Plotly Express figure is only used to build the traces, which are
    # then moved into the shared subplot grid.
    bar_fig = px.bar(class_distr, x=class_distr.index, y='duration', labels={'duration': 'Length (seconds)'})
    for trace in bar_fig['data']:
        fig.add_trace(trace, row=3, col=1)

    # Plot class distribution - pie chart
    pie_fig = px.pie(class_distr, values='duration', names=class_distr.index)
    for trace in pie_fig['data']:
        fig.add_trace(trace, row=3, col=2)

    # Update layout for the entire figure
    fig.update_layout(
        height=1200,
        width=1200,
        title_text="Audio Metadata Distribution",
        showlegend=False
    )

    fig.update_xaxes(title_text="Sample Rate", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)

    fig.update_xaxes(title_text="Number of Channels", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=2)

    fig.update_xaxes(title_text="Duration (seconds)", row=2, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=1)

    fig.update_xaxes(title_text="Class", row=3, col=1)
    fig.update_yaxes(title_text="Length (seconds)", row=3, col=1)

    fig.show()

# Render the metadata overview figure for the extracted metadata table.
plot_metadata_distribution(metadata_df)

We observe the following: <br>
- Different audio files have different sample rates. For consistency and to facilitate batch processing,<br>
it’s important to resample all audio files to a common sample rate.
- Audio files have varying durations. For model training, it’s necessary to have fixed-size input samples. <br>
This can be achieved by truncating longer audio files and padding shorter ones.
- Audio files may have different numbers of channels (mono vs. stereo). <br> In our case, however, the files have 2 channels, i.e. they are stereo, with a left (L) and a right (R) channel. <br>
When preparing audio for machine learning models, stereo is usually converted to mono, which simplifies the data and reduces the computational load.