# Exploratory Data Analysis for Voice Classification

This notebook performs exploratory data analysis (EDA) on the voice dataset: it loads the raw `.wav` recordings, examines the distribution of clip durations, and plots a sample waveform. The findings are meant to inform preprocessing and model-training decisions.

In [None]:
import os
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' theme to the figures drawn in this notebook.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias for it.
sns.set_theme(style='whitegrid')

# Directory holding the raw audio files (relative path, resolved against the
# notebook's current working directory)
raw_data_path = '../data/raw/'

# Function to load audio files and extract basic information
def load_audio_data(path):
    """Build a table of clip durations for the WAV files in a directory.

    Parameters
    ----------
    path : str
        Directory to scan. Only its direct entries are considered
        (no recursion into sub-directories).

    Returns
    -------
    pandas.DataFrame
        One row per WAV file, ordered by filename, with the columns
        ``filename`` (entry name inside ``path``) and ``duration``
        (clip length in seconds). When no WAV file is found the frame is
        empty but still carries both columns, so downstream column access
        such as ``df['duration']`` keeps working.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``path`` cannot be listed.
    """
    columns = ['filename', 'duration']
    records = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and anything picked via .iloc[0]) reproducible across runs.
    for filename in sorted(os.listdir(path)):
        # Case-insensitive match so '.WAV' files are not silently skipped.
        if not filename.lower().endswith('.wav'):
            continue
        file_path = os.path.join(path, filename)
        # sr=None keeps the file's native sampling rate: the duration is the
        # same quantity, and the default resample to 22050 Hz is avoided.
        audio, sr = librosa.load(file_path, sr=None)
        records.append({'filename': filename, 'duration': librosa.get_duration(y=audio, sr=sr)})
    # Explicit columns guarantee the schema even when `records` is empty.
    return pd.DataFrame(records, columns=columns)

# Load the audio data
audio_data = load_audio_data(raw_data_path)

# Fail fast with a clear message when nothing was loaded; otherwise the later
# cells that index into audio_data fail with a much less helpful error.
if audio_data.empty:
    raise FileNotFoundError(f"No .wav files found in {raw_data_path!r}")

# Display the first few rows of the audio data
audio_data.head()

In [None]:
# Histogram of clip durations (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(audio_data['duration'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Audio Durations')
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Example of a closer look at a single recording: its waveform.
# Other per-file views (spectrograms, etc.) can be added in the same way.

# Use the first file listed in audio_data as the sample
sample_file = audio_data['filename'].iloc[0]
sample_path = os.path.join(raw_data_path, sample_file)
# librosa.load is called with its defaults here, so the x-axis below counts
# samples of the signal as librosa returns it.
sample_audio, sr = librosa.load(sample_path)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(sample_audio)
ax.set_title(f'Waveform of {sample_file}')
ax.set_xlabel('Samples')
ax.set_ylabel('Amplitude')
plt.show()