In [5]:
import os
import librosa
import numpy as np
import pandas as pd

def augment_audio(y, sr, noise_factor, stretch_factor, n_steps):
    """
    Creates and returns 3 augmentated versions of an inputted audio file: 
    1. One with added noise (where random noise is scaled by a relatively low noise factor)
    2. One with time stretching 
    3. One with a pitch shift 
    Args:
        y: audio time series of original file
        sr: sampling rate of original audio file
        noise_factor: scales the amplitude of the random noise that is added to of the original time series
        stretch_factor: factor for time stretching (stretch_factor < 1 for a sped up version, stretch_factor > 1 for a slowed down version)
        n_steps: steps for pitch shifting (n_steps < 0 for a lower pitched version, n_steps > 0 for a higher pitched version)
    """
    # Noisy version
    y_noise = y + noise_factor * np.random.randn(len(y))
    
    # Stretched version
    y_stretch = librosa.effects.time_stretch(y, rate=stretch_factor)
    
    # Shifted version
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    
    return y_noise, y_stretch, y_shift

def extract_features_sequentially(y, sr):
    """
    Extract the MFCCs, chromagrams, and spectral contrasts of each song sequentially based on the sample rates defined below
    and store the resulting sequence in a list
    """
    # Define the hop length that will be used to sample the songs (512 equates to a standard sampling rate of 43 frames per second
    # so 43 x 30 seconds = 1290 samples per song)
    hop_length = 512
    
    # Extract MFCCs and store in list
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20, hop_length=hop_length)
    
    # Extract Chromagrams and store in list
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
    
    # Extract Spectral Contrasts and store in list
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop_length)
    
    # Return the sequentially extracted features in their list form 
    return mfcc, chromagram, spectral_contrast

In [6]:
# Define the path to the audio files folder
audio_files_path = '/Users/nikanhassanipak/Library/Mobile Documents/com~apple~CloudDocs/Georgia Tech/Spring 2024/CS 4641/Project/All_Audio_Files'

# Initialize the DataFrame columns as the features (labeled sequentially based on the sample number) and genre. 
# Recall there are 43 x 30 = 1290 samples per song, 20 MFCC bands, 12 chroma bands, and 7 spectral bands 
columns = ['filename', 'genre'] + \
          [f'mfcc{i}_sample{j}' for i in range(1, 21) for j in range(1, 1291)] + \
          [f'chroma{i}_sample{j}' for i in range(1, 13) for j in range(1, 1291)] + \
          [f'spectral{i}_sample{j}' for i in range(1, 8) for j in range(1, 1291)]

# Initialize the DataFrame
df = pd.DataFrame(columns=columns)

# Loop through each song in the audio files folder, extracting the sequential features for the original and augmented versions of the song 
for file in os.listdir(audio_files_path):
    if file.endswith('.wav'):
        file_path = os.path.join(audio_files_path, file)
        y, sr = librosa.load(file_path)

        # Extract the genre from the filename
        genre = file.split('_')[0]  

        # Augment the audio file
        y_noise, y_stretch, y_shift = augment_audio(y, sr, noise_factor=0.005, stretch_factor=0.8, n_steps=-1)

        # Create a list of all 4 versions of the song, matched with their label indicating which version they are
        versions = [(y, 'original'), (y_noise, 'noisy'), (y_stretch, 'stretched'), (y_shift, 'shifted')]

        # Extract the sequential features for each version of the song
        for audio_data, label in versions:
            mfcc, chromagram, spectral_contrast = extract_features_sequentially(audio_data, sr)

            # Flatten each feature array and prepare row data (indexing is used to make sure all songs have the same number of samples)
            row_data = [f"{file[:-4]}_{label}", genre]  
            row_data.extend(mfcc.flatten().tolist()[:25800])
            row_data.extend(chromagram.flatten().tolist()[:15480])
            row_data.extend(spectral_contrast.flatten().tolist()[:9030])

            # Add the features to the DataFrame
            df.loc[len(df)] = row_data

In [7]:
df

Unnamed: 0,filename,genre,mfcc1_sample1,mfcc1_sample2,mfcc1_sample3,mfcc1_sample4,mfcc1_sample5,mfcc1_sample6,mfcc1_sample7,mfcc1_sample8,...,spectral7_sample1281,spectral7_sample1282,spectral7_sample1283,spectral7_sample1284,spectral7_sample1285,spectral7_sample1286,spectral7_sample1287,spectral7_sample1288,spectral7_sample1289,spectral7_sample1290
0,reggae.00080_original,reggae.00080.wav,-116.029190,-102.943291,-113.375038,-116.010246,-116.837837,-109.115578,-81.046753,-79.636337,...,17.660311,14.260842,13.578406,13.342985,13.181918,13.680677,10.615456,12.795183,13.579218,16.357143
1,reggae.00080_noisy,reggae.00080.wav,-113.005824,-97.363946,-107.916686,-113.372865,-112.549058,-102.758178,-79.538245,-78.345472,...,15.968209,12.996874,14.891778,14.375874,14.456955,13.826061,10.911447,15.000694,14.945936,15.280132
2,reggae.00080_stretched,reggae.00080.wav,-116.029129,-104.924713,-127.435776,-142.686371,-137.441650,-136.532852,-140.032822,-133.096619,...,14.961805,16.728583,16.753903,15.716349,16.652128,17.367087,17.603094,16.458706,16.682939,17.559754
3,reggae.00080_shifted,reggae.00080.wav,-121.144615,-109.377792,-119.898056,-124.594307,-121.784241,-121.727959,-102.408516,-89.395569,...,66.720759,67.212013,74.524392,65.813920,63.840065,63.367759,67.302857,65.384033,63.951833,62.752506
4,jazz.00016_original,jazz.00016.wav,-245.418533,-232.972672,-257.811829,-272.275238,-275.270294,-276.789795,-278.366119,-282.312775,...,33.083776,30.184180,30.723996,30.543312,29.909328,32.181333,30.304078,31.077431,31.117528,30.894600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,jazz.00033_shifted,jazz.00033.wav,-169.769333,-151.757507,-170.000381,-180.288437,-197.664337,-214.293320,-227.959000,-229.809143,...,60.609204,57.997637,58.743953,57.362698,59.215101,60.568943,61.276837,61.193182,60.482025,59.863790
3996,reggae.00099_original,reggae.00099.wav,-129.979935,-55.368382,-0.305676,-3.460715,-63.880615,-145.220367,-122.687813,-87.512535,...,23.047471,22.926745,23.578320,23.241844,23.738907,35.080055,36.452826,38.002728,36.971474,36.147947
3997,reggae.00099_noisy,reggae.00099.wav,-123.874179,-51.208319,0.867764,-2.563490,-61.390709,-129.218776,-109.530000,-77.794523,...,18.761271,20.272070,20.681569,21.446070,18.770627,15.048487,15.784019,14.968864,15.414351,15.024857
3998,reggae.00099_stretched,reggae.00099.wav,-128.180542,-68.776047,-42.349430,-33.482040,-39.989414,-72.657745,-129.232544,-157.831070,...,22.357944,25.454540,24.546384,27.252233,25.771214,25.992904,22.727350,22.228506,19.142808,17.518300


In [8]:
# Write data to a csv
filename = 'Sequential_format.csv'
df.to_csv(filename, index=False) 