In [4]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('data/Animal_Sound.csv')

# Print the DataFrame
print(df.columns)

df['path'] = df['name'].apply(lambda x: f"data/sounds/{x}")
df['name'] = df['name'].str.lower().str.split("_").str[0]

all_animals = df['name'].unique()
print(all_animals)

df.to_csv("data/Animal_Sound_processed.csv", index=False)

Index(['name', 'path', 'channels', 'sample_width', 'frame_rate', 'nframes',
       'duration', 'size'],
      dtype='object')
['lion' 'bear' 'cat' 'chicken' 'cow' 'dog' 'dolphin' 'donkey' 'elephant'
 'frog' 'horse' 'monkey' 'sheep']


In [2]:
import librosa
from IPython.display import Audio
import numpy as np
import soundfile as sf

def apply_time_stretch(y, rate_range=(0.8, 1.2)):
    rate = np.random.uniform(*rate_range)
    return librosa.effects.time_stretch(y, rate=rate)

def apply_pitch_shift(y, sr, n_steps_range=(-3, 3)):
    n_steps = np.random.uniform(*n_steps_range)
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

def apply_time_delay(y, sr, max_delay_sec=0.5):
    max_delay = int(sr * max_delay_sec)
    delay = np.random.randint(0, max_delay)
    return np.pad(y, (delay, 0))[:len(y)]  # Crop to original length

In [3]:
import os

os.makedirs("data/augmented", exist_ok=True)
augmented_data = []

for path, name in zip(df['path'], df['name']):
    # load the sound file 
    y, sr = librosa.load(path, sr=None)
    
    # apply time-stretching
    y_stretched = apply_time_stretch(y)

    # apply pitch-shifting
    y_stretched_pitch_shifted = apply_pitch_shift(y_stretched, sr)

    # Create output file name
    base_name = os.path.splitext(os.path.basename(path))[0]
    processed_file_name = f"data/augmented/{base_name}_modified.wav"

    # write the augmented sound file
    sf.write(processed_file_name, y_stretched_pitch_shifted, sr)

    # add the new file to the DataFrame
    augmented_data.append({'path': processed_file_name, 'name': name})

df_augmented = pd.DataFrame(augmented_data)
df = pd.concat([df, df_augmented], ignore_index=True)

df.to_csv("data/Animal_Sound_modified.csv", index=False)





In [3]:
import pandas as pd

# Load the full augmented dataset
df = pd.read_csv("data/Animal_Sound_modified.csv")

# Group by 'name' and sample 10 examples per class (with a fixed seed for reproducibility)
df_reduced = df.groupby('name', group_keys=False).apply(lambda x: x.sample(n=10, random_state=42))

# Save the reduced dataset to a new CSV file
df_reduced.to_csv("data/Animal_Sound_reduced.csv", index=False)

print("Saved reduced dataset with 10 samples per class to 'data/Animal_Sound_reduced.csv'")


Saved reduced dataset with 10 samples per class to 'data/Animal_Sound_reduced.csv'


  df_reduced = df.groupby('name', group_keys=False).apply(lambda x: x.sample(n=10, random_state=42))


In [4]:
import librosa
from IPython.display import Audio
import numpy as np

def play(file_path):
    x, Fs = librosa.load(file_path, sr=None)
    print('Class: {}'.format(file_path))
    return Audio(x, rate=Fs)


In [5]:
audio_path = df['path'].iloc[600]
play(audio_path)

Class: data/sounds/Sheep_1.wav


In [6]:
def apply_time_delay(y, sr, max_delay_sec=0.5):
    max_delay = int(sr * max_delay_sec)
    delay = np.random.randint(0, max_delay)
    return np.pad(y, (delay, 0))[:len(y)]  # Crop to original length

In [7]:
y, sr = librosa.load(audio_path, sr=None)
y_delayed = apply_time_delay(y, sr)
Audio(y_delayed, rate=sr)

In [8]:
def apply_time_stretch(y, rate_range=(0.8, 1.2)):
    rate = np.random.uniform(*rate_range)
    return librosa.effects.time_stretch(y, rate=rate)

In [9]:
y, sr = librosa.load(audio_path, sr=None)
y_fast = librosa.effects.time_stretch(y, rate=2.0)
y_stretch = apply_time_stretch(y)
Audio(y_stretch, rate=sr)

In [10]:
def apply_pitch_shift(y, sr, n_steps_range=(-3, 3)):
    n_steps = np.random.uniform(*n_steps_range)
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)


In [11]:
y, sr = librosa.load(audio_path, sr=None)
y_pitch_shift = apply_pitch_shift(y, sr)
Audio(y_pitch_shift, rate=sr)

In [12]:
y, sr = librosa.load(audio_path, sr=None)
y_fast = librosa.effects.time_stretch(y, rate=2.0)
y_pitch_shift = apply_pitch_shift(y_fast, sr)
Audio(y_pitch_shift, rate=sr)

In [13]:
import librosa
import soundfile as sf

y, sr = librosa.load(audio_path, sr=22050)
sf.write('data/normalized_output.wav', y_pitch_shift, sr)


In [14]:
y_new, sr = librosa.load('normalized_output.wav', sr=None)
Audio(y_new, rate=sr)


  y_new, sr = librosa.load('normalized_output.wav', sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'normalized_output.wav'

In [None]:
import librosa
import numpy as np

def extract_log_mel_spectrogram(
    filepath,
    sr=22050,
    n_fft=1024,
    hop_length=512,
    n_mels=60
):
    # Load and resample audio
    y, _ = librosa.load(filepath, sr=sr)

    # Normalize to [-1, 1]
    y = y / np.max(np.abs(y))

    # Compute mel spectrogram
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=2.0  # power=2.0 for energy spectrogram
    )

    # Convert to log scale (dB)
    log_mel = librosa.power_to_db(mel, ref=np.max)

    return log_mel


In [None]:
log_mel = extract_log_mel_spectrogram(audio_path)
log_mel

array([[-35.46962 , -32.530785, -23.58237 , ..., -26.693075, -31.507694,
        -25.603748],
       [-36.44289 , -37.436466, -31.154095, ..., -29.122112, -33.66753 ,
        -34.479843],
       [-22.913347, -22.691414, -33.960197, ..., -42.450428, -39.44455 ,
        -46.86478 ],
       ...,
       [-62.37937 , -56.630657, -56.15444 , ..., -50.871246, -48.691647,
        -51.138477],
       [-61.562973, -57.41426 , -57.654106, ..., -48.362488, -50.369686,
        -55.128693],
       [-61.626476, -58.169632, -57.142212, ..., -51.180733, -55.419994,
        -59.2877  ]], shape=(60, 27), dtype=float32)