# Preprocess

This notebook gives a guideline on how to turn audio into MFCC vectors. Note: this notebook only shows the code used for the CREMA-D dataset and DOES NOT USE THE CORRECT VARIABLES (read the Markdown and the comments for each step).

## Library

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf

## Step by Step

### Load to DataFrame

The following cell defines a function that takes two folder paths as input: one for the train data and one for the test data. The function only works on the CREMA-D dataset because it relies on that dataset's file naming, and it expects the train/test split to have been made manually.
The function returns two DataFrames containing the file path and the target for each file in train and test.

In [None]:
# CREMA-D emotion code (third '_'-separated field of the file name) -> label name
_CREMA_D_EMOTIONS = {
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
    'SAD': 'sad',
}


def _emotion_from_filename(file_name):
    """Return the emotion label encoded in a CREMA-D file name, or 'unknown'."""
    fields = file_name.split('_')
    # Names with fewer than three fields (e.g. stray system files) cannot carry
    # an emotion code; label them 'unknown' instead of raising IndexError.
    if len(fields) < 3:
        return 'unknown'
    return _CREMA_D_EMOTIONS.get(fields[2], 'unknown')


def _folder_to_dataframe(folder_path):
    """Build a File_Path/Target DataFrame from every entry in one folder."""
    # os.listdir returns entries in arbitrary order; sort so that the row order
    # is reproducible between runs.
    file_names = sorted(os.listdir(folder_path))
    return pd.DataFrame({
        "File_Path": [os.path.join(folder_path, name) for name in file_names],
        "Target": [_emotion_from_filename(name) for name in file_names],
    })


def load_to_dataframe(train_folder_path, test_folder_path):
    """
    Loads train and test data from the specified folders, and returns them as pandas DataFrame objects.

    Every entry of each folder becomes one row. The ``File_Path`` column holds the
    folder path joined with the file name, and the ``Target`` column holds the
    emotion name derived from the third ``'_'``-separated field of the file name
    (``ANG``, ``DIS``, ``FEA``, ``HAP``, ``NEU``, ``SAD``). File names that do not
    carry one of these codes get the target ``'unknown'``. Rows are ordered by
    file name.

    :param train_folder_path: The path to the folder containing the train data files.
    :type train_folder_path: str
    :param test_folder_path: The path to the folder containing the test data files.
    :type test_folder_path: str
    :return: Two pandas DataFrame objects, containing the train and test data, respectively.
    :rtype: pandas.core.frame.DataFrame, pandas.core.frame.DataFrame
    :raises OSError: If either folder cannot be listed (propagated from ``os.listdir``).
    """
    train_sentiment_df = _folder_to_dataframe(train_folder_path)
    test_sentiment_df = _folder_to_dataframe(test_folder_path)

    return train_sentiment_df, test_sentiment_df


### Turn Audio Signal into MFCCs

1. Setting up Variables and Initialization

In [None]:
# MFCC extraction settings (see the librosa MFCC documentation for details)
SAMPLE_RATE = 16000  # Desired sample rate; use None to keep the native sample rate
num_mfcc = 40        # Number of MFCC coefficients
n_fft = 2048         # FFT length
hop_length = 512     # Hop length

# Placeholders: assign the DataFrames returned by the previous cell here
train_df = ""  # Train DataFrame
test_df = ""   # Test DataFrame

# Accumulators for the extracted features, one dictionary per split.
# "labels" collects the targets and "mfcc" collects the MFCC matrices.
train_data = dict(labels=[], mfcc=[])
test_data = dict(labels=[], mfcc=[])

2. Encode Categories


In [None]:
# Integer code for each target name produced when the data was loaded into
# the DataFrames; a name's position in the list is its code.
labels = {
    emotion: code
    for code, emotion in enumerate(
        ['disgust', 'happy', 'sad', 'neutral', 'fear', 'angry'])
}

# Replace the names in the 'Target' column with their codes; replace() returns
# new DataFrames, so train_df and test_df themselves are left untouched.
train_df_encoded = train_df.replace({'Target': labels})
test_df_encoded = test_df.replace({'Target': labels})

3. Loop all data to Turn it into MFCCs

In [None]:
# Extract one MFCC matrix per file of the encoded train DataFrame.
# To create the test set, replace every name containing "train" with "test".
# Paths and labels are read straight from the columns, so the loop does not
# rely on the DataFrame index labels matching row positions.
for file_path, label in zip(train_df_encoded['File_Path'],
                            train_df_encoded['Target']):
    train_data['labels'].append(label)
    signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
    # Pass the rate the signal was actually loaded at: SAMPLE_RATE may be None
    # (native sample rate), which is not a usable rate for the MFCC computation.
    mfcc = librosa.feature.mfcc(
        y=signal, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)

    # Transpose the MFCCs so that the frame axis comes first (more readable
    # data, and the axis that is padded later).
    train_data["mfcc"].append(mfcc.T)

4. Zero-pad and save the preprocessed data as NumPy arrays for easier loading into the model

In [None]:
# Turn the collected labels into a numpy array
# NOTE(review): train_data_target is not written to disk in this cell —
# confirm whether the labels are saved elsewhere.
train_data_target = np.asarray(train_data["labels"])

# Zero-pad using TensorFlow.
# The list of MFCC matrices is passed directly: the matrices have different
# numbers of frames, and converting such a ragged list with np.asarray raises
# ValueError on recent NumPy versions.
# maxlen should be set to the longest value of the data (on the frame axis);
# it can be found with: max(len(x) for x in train_data['mfcc'])
train_data_value = tf.keras.preprocessing.sequence.pad_sequences(
    train_data['mfcc'], maxlen=156, dtype="float32")

# Saving the dataset for easier load
# this would create a .npy file in your directory
np.save("Crema-D_Train", train_data_value)