# Extract acoustic features

In [1]:
from google.colab import drive
import sys
import os

# Make the Google Drive contents available under /content/drive.
drive.mount('/content/drive')

# Put the project's utility folders on the import path (the rp_extract
# package imported further down is resolved through these entries).
sys.path.extend([
    os.path.abspath('/content/drive/MyDrive/Università/NAML_project/Utilities'),
    os.path.abspath('/content/drive/MyDrive/Università/NAML_project/Utilities/rp_extract'),
])

Mounted at /content/drive


## Utility functions

In [2]:
import os, shutil
import librosa
import numpy as np
from rp_extract.rp_extract import rp_extract
from rp_extract.audiofile_read import *

def compute_mfcc_features(audio_file):
    """
    Summarise an audio file as a flat vector of MFCC statistics.

    The file is loaded with librosa, 13 MFCCs are computed, and the MFCC
    matrix is reduced along axis 1 (across all frames) with four statistics.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        numpy.ndarray: The per-coefficient means, variances, maxima and
        minima, concatenated in that order.
    """
    signal, sample_rate = librosa.load(audio_file)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)

    # One reduction per statistic; the tuple order fixes the output layout.
    summaries = [
        reduce(coefficients, axis=1)
        for reduce in (np.mean, np.var, np.max, np.min)
    ]

    return np.concatenate(summaries)


def compute_psycho_acoustic_descriptors(audiofile):
    """
    Compute psychoacoustic descriptors from an audio file.

    This function reads an audio file, mixes its channels down to a single
    signal, and computes various psychoacoustic descriptors using the
    rp_extract function from the rp_extract module.

    Args:
        audiofile (str): Path to the audio file.

    Returns:
        dict: Dictionary containing the computed psychoacoustic descriptors
        (the result of rp_extract, returned unchanged). The notebook reads
        the keys 'ssd', 'rh', 'mvd', 'tssd' and 'trh' from it.

    Example:
        audio_file = "example.wav"
        features = compute_psycho_acoustic_descriptors(audio_file)
        print("Psychoacoustic descriptors:", features)
    """
    # Read audio file to obtain sample rate, sample width, and waveform data
    # (the sample width is not needed here).
    samplerate, _, wavedata = audiofile_read(audiofile)

    # Mix down to mono by averaging over the channel axis. A signal that is
    # delivered as a flat 1-D array has no channel axis, so it is first viewed
    # as a single-channel column; otherwise np.mean(..., axis=1) would raise.
    wavedata = np.asarray(wavedata)
    if wavedata.ndim == 1:
        wavedata = wavedata.reshape(-1, 1)
    wavedata_mono = np.mean(wavedata, axis=1)

    # Define a list of features to extract
    fext = ['ssd', 'rh', 'mvd', 'tssd', 'trh']

    # Call rp_extract function to extract various psychoacoustic descriptors
    features = rp_extract(wavedata_mono,
                          samplerate,
                          extract_ssd=('ssd' in fext),  # Extract Statistical Spectrum Descriptor
                          extract_tssd=('tssd' in fext),  # Extract temporal Statistical Spectrum Descriptor
                          extract_rh=('rh' in fext),  # Extract Rhythm Histogram features
                          extract_trh=('trh' in fext),  # Extract temporal Rhythm Histogram features
                          extract_mvd=('mvd' in fext),  # Extract Modulation Frequency Variance Descriptor
                          spectral_masking=True,
                          transform_db=True,
                          transform_phon=True,
                          transform_sone=True,
                          fluctuation_strength_weighting=True,
                          skip_leadin_fadeout=1,
                          step_width=1)

    return features


# Utility function to move subfolders to parent folder
def move_subfolders_to_parent(folder_path):
  """Moves every immediate subdirectory of a folder one level up.

  Each directory directly inside ``folder_path`` is moved, with its whole
  contents, into the directory that contains ``folder_path``. Plain files
  inside ``folder_path`` are left where they are. Nothing happens when
  ``folder_path`` does not exist or is not a directory.

  Args:
      folder_path (str): The folder whose subdirectories are moved up.
          Trailing separators and relative paths are accepted.

  Raises:
      shutil.Error: If the parent already contains an entry with the same
          name as one of the subdirectories (raised by shutil.move).
  """
  # Normalise first: os.path.dirname('/a/b/') is '/a/b' and
  # os.path.dirname('b') is '', neither of which is the parent directory.
  source_dir = os.path.abspath(folder_path)
  if not os.path.isdir(source_dir):
    return
  destination_dir = os.path.dirname(source_dir)

  # Take a snapshot of the subdirectories before moving anything, so the
  # directory is not modified while it is being listed.
  with os.scandir(source_dir) as entries:
    subdir_paths = [entry.path for entry in entries if entry.is_dir()]

  for subdir_path in subdir_paths:
    shutil.move(subdir_path, destination_dir)


# Utility function to delete directories
def delete_directory(directory):
  """Removes a directory tree and reports the outcome on stdout.

  A failure is not propagated to the caller: any exception raised while
  deleting is caught and printed instead.

  Args:
      directory (str): The path to the directory to be deleted.
  """
  print('Deleting directory ' + directory)
  try:
    shutil.rmtree(directory)
  except Exception as e:
    outcome = f"An error occurred while deleting the directory '{directory}': {e}"
  else:
    outcome = f"Directory '{directory}' and all its contents have been successfully deleted."
  print(outcome)

## Processing

In [None]:
import csv

datasets = [
    'gtzan',
    'homburg',
    'ismir2004'
    ]
datasets_drive_path = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/'

for dataset in datasets:
  # upload and unzip the dataset from google drive
  dataset_folder = datasets_drive_path + dataset + '.zip'
  !unzip "$dataset_folder"

  move_subfolders_to_parent('/content/content')

  # Create csv file to save feature value
  csv_path = datasets_drive_path + dataset + '_acoustic_features.csv'
  header = ['label', 'mfcc', 'ssd', 'rh', 'mvd', 'tssd', 'trh']
  with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

  # Iterate throught the genres in the dataset
  genres = os.listdir('/content/' + dataset)
  for genre in genres:

    # if not genre in ['reggae', 'metal', 'disco', 'pop', 'country', 'hiphop', 'classical', 'jazz']:
      # Iterate throught the images for each genre
      audios = os.listdir('/content/' + dataset + '/' + genre)
      for audio in audios:

        audio_path = '/content/' + dataset + '/' + genre + '/' + audio

        # Initilize feature list with target label
        features = [genre]

        # Compute mfcc
        mfcc = compute_mfcc_features(audio_path)

        # Convert from numpy array to list
        mfcc = mfcc.tolist()

        # Add mfcc to feature list
        features.append(mfcc)

        #  Compute psycho acoustic descriptors
        pad = compute_psycho_acoustic_descriptors(audio_path)

        # Add psycho acoustic descriptors to features
        for ft in header[2:]:
          to_append = pad[ft]
          # Convert numpay array to list
          to_append = to_append.tolist()
          features.append(to_append)

        # Save features
        with open(csv_path, 'a', newline='') as csvfile:
          writer = csv.writer(csvfile)
          writer.writerow(features)

        # Log
        print('Processed file ' + audio_path)

  delete_directory('/content/' + dataset)

Archive:  /content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan.zip
  inflating: content/gtzan/reggae/reggae.00017-2.wav  
  inflating: content/gtzan/reggae/reggae.00067-0.wav  
  inflating: content/gtzan/reggae/reggae.00045-1.wav  
  inflating: content/gtzan/reggae/reggae.00083-2.wav  
  inflating: content/gtzan/reggae/reggae.00093-1.wav  
  inflating: content/gtzan/reggae/reggae.00006-2.wav  
  inflating: content/gtzan/reggae/reggae.00032-0.wav  
  inflating: content/gtzan/reggae/reggae.00014-0.wav  
  inflating: content/gtzan/reggae/reggae.00079-0.wav  
  inflating: content/gtzan/reggae/reggae.00067-2.wav  
  inflating: content/gtzan/reggae/reggae.00011-2.wav  
  inflating: content/gtzan/reggae/reggae.00048-0.wav  
  inflating: content/gtzan/reggae/reggae.00037-2.wav  
  inflating: content/gtzan/reggae/reggae.00099-0.wav  
  inflating: content/gtzan/reggae/reggae.00054-2.wav  
  inflating: content/gtzan/reggae/reggae.00041-2.wav  
  inflating: content/gtzan/reggae

  result[:,2] = stats.skew(matrix, axis=1)
  result[:,3] = stats.kurtosis(matrix, axis=1, fisher=False) # Matlab calculates Pearson's Kurtosis


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: content/ismir2004/pop/9-tied-15.wav  
  inflating: content/ismir2004/pop/9-tied-18.wav  
  inflating: content/ismir2004/pop/6-order-8.wav  
  inflating: content/ismir2004/pop/1-nocturne-6.wav  
  inflating: content/ismir2004/pop/1-nocturne-2.wav  
  inflating: content/ismir2004/pop/9-tied-4.wav  
  inflating: content/ismir2004/pop/2-artificial-13.wav  
  inflating: content/ismir2004/pop/7-dawn-17.wav  
  inflating: content/ismir2004/pop/1-nocturne-30.wav  
  inflating: content/ismir2004/pop/2-artificial-0.wav  
  inflating: content/ismir2004/pop/7-dawn-6.wav  
  inflating: content/ismir2004/pop/6-order-11.wav  
  inflating: content/ismir2004/pop/1-nocturne-24.wav  
  inflating: content/ismir2004/pop/6-order-3.wav  
  inflating: content/ismir2004/pop/6-order-21.wav  
  inflating: content/ismir2004/pop/2-artificial-19.wav  
  inflating: content/ismir2004/pop/9-tied-3.wav  
  inflating: content/ismir2004/pop/2-a

In [6]:
# Manual cleanup of the extracted gtzan folder; this is the same call the
# processing loop issues once it has finished a dataset.
delete_directory('/content/gtzan')

Deleting directory /content/gtzan
Directory '/content/gtzan' and all its contents have been successfully deleted.


In [3]:
datasets_drive_path = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/'

# upload and unzip the dataset from google drive
dataset_folder = datasets_drive_path + 'gtzan' + '.zip'
!unzip "$dataset_folder"

move_subfolders_to_parent('/content/content')

Archive:  /content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan.zip
  inflating: content/gtzan/reggae/reggae.00017-2.wav  
  inflating: content/gtzan/reggae/reggae.00067-0.wav  
  inflating: content/gtzan/reggae/reggae.00045-1.wav  
  inflating: content/gtzan/reggae/reggae.00083-2.wav  
  inflating: content/gtzan/reggae/reggae.00093-1.wav  
  inflating: content/gtzan/reggae/reggae.00006-2.wav  
  inflating: content/gtzan/reggae/reggae.00032-0.wav  
  inflating: content/gtzan/reggae/reggae.00014-0.wav  
  inflating: content/gtzan/reggae/reggae.00079-0.wav  
  inflating: content/gtzan/reggae/reggae.00067-2.wav  
  inflating: content/gtzan/reggae/reggae.00011-2.wav  
  inflating: content/gtzan/reggae/reggae.00048-0.wav  
  inflating: content/gtzan/reggae/reggae.00037-2.wav  
  inflating: content/gtzan/reggae/reggae.00099-0.wav  
  inflating: content/gtzan/reggae/reggae.00054-2.wav  
  inflating: content/gtzan/reggae/reggae.00041-2.wav  
  inflating: content/gtzan/reggae

In [5]:
audio_path = '/content/gtzan/reggae/reggae.00003-0.wav'

# MFCC statistics for one sample file, converted to a plain Python list
mfcc = compute_mfcc_features(audio_path).tolist()
print(mfcc)
print(type(mfcc))

# Psychoacoustic descriptors for the same file; inspect the 'ssd' entry as a list
pad = compute_psycho_acoustic_descriptors(audio_path)
ssd = pad['ssd'].tolist()
print(ssd)
print(type(ssd))

[-192.96142578125, 106.25189971923828, -28.324861526489258, 38.25286865234375, 10.153759956359863, 24.045177459716797, -0.18931978940963745, 13.914320945739746, -5.911829948425293, 17.662519454956055, -10.806893348693848, 10.096566200256348, -4.937828540802002, 6216.83740234375, 988.3209228515625, 782.6829223632812, 342.3885803222656, 278.1431884765625, 211.68978881835938, 261.9875183105469, 222.89207458496094, 108.88451385498047, 94.91883087158203, 96.82382202148438, 109.71653747558594, 84.38623809814453, -33.127628326416016, 183.297607421875, 52.838653564453125, 79.84310913085938, 56.144649505615234, 65.03419494628906, 34.950279235839844, 40.723167419433594, 26.37328338623047, 36.927513122558594, 11.610837936401367, 36.76343536376953, 19.511859893798828, -379.4405822753906, 28.999408721923828, -106.61056518554688, -8.517305374145508, -31.638446807861328, -13.895339012145996, -42.210906982421875, -24.70712661743164, -33.55411911010742, -6.438881874084473, -36.89912033081055, -15.75782

In [None]:
# Show the 'ssd' descriptor exactly as stored in `pad` (no .tolist() conversion).
# Relies on `pad` having been computed by an earlier cell.
print(pad['ssd'])

[2.19066952 3.29649834 2.87837307 2.39819286 1.91497758 1.83829028 1.91015602 1.84270235 1.6268688  1.56638234 ... 7.58325057 6.01123816 4.83907228 3.90655555 3.05836533 2.75819305 2.75691751
 2.3596199  1.13666964 0.05663844]
