In [1]:
# Import dependencies
# Third-party packages that must be installed first: pydub, pandas, numpy, librosa, tqdm
import os 
from pydub import AudioSegment
import pandas as pd
import glob
import numpy as np
import re
import librosa
from tqdm import tqdm

In [2]:
def extract_feature(file_name, **kwargs):
    """
    Extract a 1-D feature vector from the audio file `file_name`.

    Every requested feature is computed with librosa, averaged across
    frames (the feature matrix is transposed and reduced with
    `np.mean(..., axis=0)`), and the resulting vectors are concatenated in
    this fixed order: mfcc, chroma, mel, contrast, tonnetz.

        Features supported (enable each with a truthy keyword argument):
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`

    Args:
        file_name: path of the audio file, passed straight to `librosa.load`.
        **kwargs: feature switches listed above; unknown keys are ignored.

    Returns:
        np.ndarray: the concatenated feature vector. It is an empty array
        when no feature is requested.
    """
    want_mfcc = kwargs.get("mfcc")
    want_chroma = kwargs.get("chroma")
    want_mel = kwargs.get("mel")
    want_contrast = kwargs.get("contrast")
    want_tonnetz = kwargs.get("tonnetz")

    X, sample_rate = librosa.load(file_name)

    # The magnitude spectrogram is only needed by chroma and contrast, so it
    # is computed once and only when one of them is requested.
    if want_chroma or want_contrast:
        stft = np.abs(librosa.stft(y=X))

    result = np.array([])
    if want_mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if want_chroma:
        chroma_mean = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_mean))
    if want_mel:
        # The signal must be passed as `y=`: recent librosa releases declare
        # it keyword-only, so a positional argument raises TypeError.
        mel_mean = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_mean))
    if want_contrast:
        contrast_mean = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast_mean))
    if want_tonnetz:
        harmonic = librosa.effects.harmonic(y=X)
        tonnetz_mean = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_mean))
    return result

In [16]:
# Root directory: the raw clips are read from <dirname>/<folder>/<folder>/ and
# the filtered CSV files and extracted .npy features are written below it.
dirname = "data"

# Gender values that are kept; rows with any other (or missing) value are dropped.
GENDER_LABELS = ("male", "female")


def replace_mp3_suffix(path, new_suffix):
    """Return `path` (as a string) with a trailing ".mp3" replaced by `new_suffix`.

    Only a literal ".mp3" at the very end is replaced. The pattern is escaped
    and anchored so that a name such as "samp3.mp3" is not mangled. A path
    that does not end in ".mp3" is returned unchanged.
    """
    return re.sub(r"\.mp3$", new_suffix, str(path))


def keep_labelled_genders(frame):
    """Return a boolean Series marking rows whose "gender" is in GENDER_LABELS."""
    return frame["gender"].isin(GENDER_LABELS)


def derive_audio_paths(filename, root):
    """Map a CSV "filename" entry to the paths used during feature extraction.

    The part of `filename` before the first "/" is taken as the folder name.

    Returns:
        tuple: (mp3_path, wav_path, npy_stem) where
            mp3_path is <root>/<folder>/<folder>/<clip>.mp3 (the source clip),
            wav_path is <root>/<folder>/<clip>.wav (a temporary file),
            npy_stem is <root>/<folder>/<clip> (np.save appends ".npy").
    """
    filename = str(filename)
    folder = filename.split("/")[0]
    clip = os.path.basename(filename)
    mp3_path = os.path.join(root, folder, folder, clip)
    wav_path = os.path.join(root, folder, replace_mp3_suffix(clip, ".wav"))
    npy_stem = os.path.join(root, folder, replace_mp3_suffix(clip, ""))
    return mp3_path, wav_path, npy_stem


def convert_to_feature_file(mp3_path, wav_path, npy_stem):
    """Decode one mp3 clip, extract its mel feature vector and save it with np.save.

    The clip is first exported to a temporary wav file that is handed to
    `extract_feature`. The wav file is removed afterwards, even if the
    extraction fails.
    """
    sound = AudioSegment.from_mp3(mp3_path)
    try:
        sound.export(wav_path, format="wav")
        feature = extract_feature(wav_path, mel=True)
        np.save(npy_stem, feature)
    finally:
        # Never leave the temporary wav behind, whatever happened above.
        if os.path.exists(wav_path):
            os.remove(wav_path)


os.makedirs(dirname, exist_ok=True)

# Every CSV file in the working directory describes one audio folder.
csv_files = glob.glob("*.csv")

for csv_file in csv_files:
    print("[+] Processing", csv_file)
    df = pd.read_csv(csv_file)
    df["new_file_name"] = df["filename"].apply(lambda name: replace_mp3_suffix(name, ".npy"))

    # Keep only the rows carrying a usable gender label.
    labelled = keep_labelled_genders(df)
    print("Previously:", len(df), "rows")
    new_df = df.loc[labelled, ["new_file_name", "gender"]]
    print("Now:", len(new_df), "rows")

    new_csv_file = os.path.join(dirname, csv_file)
    new_df.to_csv(new_csv_file, index=False)

    # splitext copes with CSV names that contain more than one dot.
    folder_name = os.path.splitext(csv_file)[0]

    # Only clips that survive the gender filter are listed in the new CSV,
    # so features are extracted for those clips only.
    labelled_filenames = df.loc[labelled, "filename"].tolist()

    for audio_filename in tqdm(labelled_filenames, desc=f"Extracting features of {folder_name}"):
        mp3_path, wav_path, npy_stem = derive_audio_paths(audio_filename, dirname)
        convert_to_feature_file(mp3_path, wav_path, npy_stem)

 






[+] Processing cv-valid-train.csv
Previously: 73278 rows
Now: 73278 rows


Extracting features of cv-valid-train: 100%|██████████| 73278/73278 [4:39:17<00:00,  4.37it/s]


In [None]:
# csv_files = glob.glob("*.csv")
# print(csv_files)
# # all_audio_filenames = set(new_df["new_file_name"])

# # audio_files = glob.glob(f"{folder_name}/*")

# # for i, audio_file in tqdm(list(enumerate(audio_files)), f"Extracting features of {folder_name}"):
# #         splited = os.path.split(audio_file)
# #         # audio_filename = os.path.join(os.path.split(splited[0])[-1], splited[-1])
# #         audio_filename = f"{os.path.split(splited[0])[-1]}/{splited[-1]}"
# # # for j, csv_file in enumerate(csv_files):
# # #     folder_name, _ = csv_file.split(".")

# # #     print(folder_name)
