# Imports

In [29]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import noisereduce as nr

from IPython.display import Audio

# Config

In [30]:
# Project-local configuration: AUDIO_PATH is the base location that
# load_audio joins clip file names onto.
from config import run_config, AUDIO_PATH

# Apply the project's configuration before any other cell runs.
# NOTE(review): run_config's side effects are defined in config.py and are
# not visible from this notebook — confirm what it sets up.
run_config()

# Helper Functions

In [31]:
def load_audio(path, sr=16000):
    """Read one audio clip located under ``AUDIO_PATH``.

    Args:
        path: File name (or relative path) of the clip, joined onto
            ``AUDIO_PATH``.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        The samples returned by ``librosa.load``, or ``None`` if anything
        goes wrong while loading (the error is printed, not raised).
    """
    waveform = None
    try:
        # librosa.load returns (samples, sample_rate); only the samples are kept.
        loaded = librosa.load(AUDIO_PATH / path, sr=sr)
        waveform = loaded[0]
    except Exception as e:
        # Best-effort: report the failure and signal it with None so callers
        # can drop the clip instead of aborting.
        print(f"Error loading {path}: {e}")
    return waveform

In [32]:
def preprocess_audio(y, sr=16000, top_db=20):
    """Trim silence, normalize amplitude, and denoise a waveform.

    Args:
        y: Audio samples as returned by ``load_audio``, or ``None`` when
            loading failed.
        sr: Sample rate of ``y`` in Hz, forwarded to the noise reducer.
        top_db: Threshold (in dB) passed to ``librosa.effects.split`` to
            decide which regions count as silence.

    Returns:
        The processed samples, or ``None`` when there is nothing to process
        (``y`` is ``None``, empty, or has no non-silent interval).
    """
    # Propagate a failed load, and treat an empty clip the same way: the
    # steps below cannot operate on zero samples.
    if y is None or np.size(y) == 0:
        return None

    # Remove silence: keep only the intervals librosa reports as non-silent.
    intervals = librosa.effects.split(y, top_db=top_db)
    if len(intervals) == 0:
        # np.concatenate raises ValueError on an empty list of arrays.
        return None
    y_trimmed = np.concatenate([y[start:end] for start, end in intervals])

    # Normalize amplitude so clips recorded at different volumes or with
    # different microphones end up on a comparable scale.
    y_norm = librosa.util.normalize(y_trimmed)

    # Noise reduction with noisereduce's default settings.
    y_denoised = nr.reduce_noise(y=y_norm, sr=sr)

    return y_denoised

# Preprocessing

In [33]:
# Only the first 500 rows are used, so let the parser stop there (nrows)
# instead of reading the whole TSV and slicing it afterwards.
# NOTE(review): column dtypes are now inferred from these 500 rows only —
# confirm nothing downstream relied on dtypes inferred from the full file.
df = pd.read_csv("data/filtered_data_labeled.tsv", sep='\t', nrows=500)
df.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3


In [34]:
# Run one clip through the pipeline to compare it before and after.
sample_input = load_audio("common_voice_en_158.mp3")
sample_output = preprocess_audio(sample_input)

# Play the raw clip first, then the preprocessed one; skip whichever
# could not be produced.
for clip in (sample_input, sample_output):
    if clip is not None:
        display(Audio(clip, rate=16000))

In [None]:
# Load and preprocess every clip listed in the metadata; clips that fail
# to load (or preprocess to nothing) come back as None.
df['audio_data'] = df['path'].apply(
    lambda clip_path: preprocess_audio(load_audio(clip_path))
)

# Keep only the rows that produced audio and renumber them from zero.
df = df[df['audio_data'].notna()].reset_index(drop=True)

KeyError: 'audio'