In [1]:
import os
import subprocess

import librosa
import numpy as np

In [None]:
# Replace input_path, output_wav_path and output_features_path with paths on your local storage when calling the function below.

def convert_and_extract_features(input_path, output_wav_path, output_features_path):
    """Convert an audio file to WAV and write summary audio features to a text file.

    The input is transcoded with ffmpeg to a mono, 22,050 Hz WAV file, which is
    then loaded with librosa to compute three summary features:

    * the per-coefficient mean of 13 MFCCs,
    * the mean of the non-zero pitch candidates reported by ``librosa.piptrack``
      (0.0 when no candidate is found),
    * the mean RMS energy, used here as a "volume" measure.

    Args:
        input_path: Path of the source media file.
        output_wav_path: Path where the converted WAV file is written; an
            existing file at this path is overwritten.
        output_features_path: Path of the text file that receives the features.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Convert to WAV. Passing an argument list (no shell) keeps paths that
    # contain quotes, spaces or shell metacharacters from breaking or injecting
    # into the command; check=True surfaces a failed conversion immediately
    # instead of letting the later load step fail or read a stale file.
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",            # overwrite an existing output so re-runs do not stall on a prompt
        "-i", input_path,
        "-ac", "1",      # mono
        "-ar", "22050",  # 22,050 Hz sample rate
        "-vn",           # drop any video stream
        output_wav_path,
    ]
    subprocess.run(ffmpeg_cmd, check=True)

    # Load the converted audio.
    y, sr = librosa.load(output_wav_path)

    # MFCCs: 13 coefficients per frame, averaged over the frame axis.
    # `y` must be passed by keyword; recent librosa releases reject it positionally.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_feature = np.mean(mfccs, axis=1)

    # Pitch: piptrack leaves every bin that is not a detected peak at 0, so
    # averaging the whole array would be dominated by those zeros. Average only
    # the non-zero candidates, and fall back to 0.0 when there are none.
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    detected_pitches = pitches[pitches > 0]
    pitch_feature = np.mean(detected_pitches) if detected_pitches.size else 0.0

    # Volume: mean RMS energy.
    volume_feature = np.mean(librosa.feature.rms(y=y))

    # Save features to a file.
    with open(output_features_path, 'w') as f:
        f.write(f'MFCCs: {mfccs_feature}\n')
        f.write(f'Pitch: {pitch_feature}\n')
        f.write(f'Volume: {volume_feature}\n')