# Extract Audio Features


In [None]:
import sys
import pickle

import scipy.stats as stats
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences


import audiofile
import opensmile
from util import *


In [2]:
# Show every column when a DataFrame is displayed (no column limit)
pd.options.display.max_columns = None


In [3]:
def configure_openSmile(**kwargs):
    '''
    Configure OpenSmile.

    Builds an ``opensmile.Smile`` extractor for the ComParE_2016 feature set
    at low-level-descriptor level, logging to ``smile.log``. Called without
    arguments it uses exactly these project defaults.

        Parameters:
            **kwargs: Optional keyword arguments forwarded to
                ``opensmile.Smile``. They take precedence over the defaults
                set here (for example ``feature_level`` or ``logfile``).

        Returns:
            smile (obj): Returns configured smile object
    '''

    # Project defaults; anything the caller passes replaces the matching entry
    settings = {
        'feature_set': opensmile.FeatureSet.ComParE_2016,
        'feature_level': opensmile.FeatureLevel.LowLevelDescriptors,
        'loglevel': 4,
        'logfile': 'smile.log',
        'verbose': True,
    }
    settings.update(kwargs)

    return opensmile.Smile(**settings)


In [4]:
def extract_feautre(folder_list):
    '''
    Run the configured OpenSmile extractor over a list of audio files.

        Parameters:
            folder_list (list): Paths of the audio files to process

        Returns:
            features (Dataframe): Result of ``process_files`` for the given
                paths, or None when a FileNotFoundError was reported
    '''

    extractor = configure_openSmile()

    try:
        extracted = extractor.process_files(folder_list)
    except FileNotFoundError as e:
        # Report the problem and hand back None instead of raising
        print(f'Error --> {e}')
        return None
    else:
        return extracted


In [5]:
def z_score(dataframe, inplace=False):
    '''
    Apply z-score (standardization) to the columns of the data.

    Every column is shifted to zero mean and scaled by its sample standard
    deviation (ddof=1, the pandas default). A NumPy array is accepted as
    well: its last axis is treated as the columns and the statistics are
    taken over all remaining axes, so a 2-D array yields the same numbers as
    the equivalent DataFrame. A 1-D array is treated as one single column.

        Parameters:
            dataframe (DataFrame or ndarray): Data to apply z-score
            inplace (bool): Modify the data inplace. Default (False)

        Returns:
            dataframe: Z-scores applied on each column of the input

        Raises:
            TypeError: If inplace=True is requested for a non-float ndarray,
                which cannot store the standardized values

        Note:
            A column whose standard deviation is zero is divided by zero and
            comes out as NaN.
    '''

    if isinstance(dataframe, np.ndarray):
        is_float = np.issubdtype(dataframe.dtype, np.floating)
        if inplace:
            if not is_float:
                raise TypeError(
                    'inplace z-score needs a floating point array, '
                    f'got dtype {dataframe.dtype}')
            values = dataframe
        elif is_float:
            values = dataframe.copy()
        else:
            # Integer/bool input: work on a float copy so results are not truncated
            values = dataframe.astype(np.float64)

        # Reduce over every axis except the last; an empty tuple (1-D input)
        # becomes None so the whole vector is one column
        sample_axes = tuple(range(values.ndim - 1)) or None

        # Both statistics are taken before the array is modified in place
        center = values.mean(axis=sample_axes)
        spread = values.std(axis=sample_axes, ddof=1)
        values -= center
        values /= spread
        dataframe = values
    else:
        if not inplace:
            dataframe = dataframe.copy()

        # apply the z-score method column by column
        for column in dataframe.columns:
            dataframe[column] = (dataframe[column] - dataframe[column].mean()
                                 ) / dataframe[column].std()

    print('Finished standardizing (z-score) data')
    return dataframe


In [6]:
def pad_signal_data(signal_sequence, dtype='int32', padding='pre'):
    '''
    Bring all signals to the same length by padding the shorter ones.

        Parameters:
            signal_sequence (List): Signals (sequences) to be padded
            dtype (string): Element type handed to ``pad_sequences``
            padding (string): 'pre' pads before each sequence, 'post' after it

        Returns:
            padded: Whatever ``pad_sequences`` returns for the given signals
    '''

    result = pad_sequences(signal_sequence, padding=padding, dtype=dtype)
    print('Finished padding data')

    return result


In [6]:
# Read the processed ids, one per line, with surrounding whitespace removed
with open('./data/processed/IEMOCAP/processed_ids.txt') as id_file:
    full_ids = [line.strip() for line in id_file]


In [7]:
# Gather the wav paths of Session1..Session5 into one sorted list
list_files = []

for session_no in range(1, 6):
    sess_title = f'Session{session_no}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/sentences/wav'

    # file_search comes from util; presumably it adds the files found under
    # `path` to the list it is given, so the printed count is cumulative
    file_search(path, list_files)
    list_files.sort()

    print(f"{sess_title}, # Num of files: {len(list_files)}")


Session1, # Num of files: 1819
Session2, # Num of files: 3630
Session3, # Num of files: 5766
Session4, # Num of files: 7869
Session5, # Num of files: 10039


## Get the list of filenames in the four categories (Angry, Happy, Sad, Neutral)

In [8]:
# Index every wav file by its file name once, so each id is resolved with a
# dictionary lookup instead of a scan over the whole file list.
wav_by_name = {}
for wav_path in list_files:
    # Accept both '\\' and '/' as separators so matching does not depend on
    # the platform the paths were collected on
    wav_name = wav_path.replace('\\', '/').rsplit('/', 1)[-1]
    # Keep only the first path seen for a name (list_files is sorted)
    wav_by_name.setdefault(wav_name, wav_path)

# Paths follow the order of full_ids; ids without a wav file are skipped
full_path = [
    wav_by_name[file_ + '.wav']
    for file_ in full_ids
    if file_ + '.wav' in wav_by_name
]


In [9]:
# Number of ids for which a wav path was found
print(len(full_path))

5531


## Extract features for the four categories

In [None]:
# features = extract_feautre(full_path)
# features.to_pickle('D:/feat.pk')

## Post-processing: Load pickled features

In [10]:
# Load the previously pickled features; only unpickle files you created
# yourself, since pickle can execute arbitrary code on load
with open('D:/feat.pk', 'rb') as feat_handle:
    data = pickle.load(feat_handle)


In [12]:
# One nested list of values per file, in the order of full_path
feature = [data.loc[wav_path].values.tolist() for wav_path in full_path]

print(len(feature))


5531


In [13]:
# Pad after the end of each sequence; float64 replaces the function's int32 default
padded_data = pad_signal_data(feature, dtype='float64', padding='post')

Finished padding data


In [14]:
# Dimensions of the padded array
print(padded_data.shape)

(5531, 3409, 65)


In [25]:
# Persist the padded array to padded_features.npy
np.save('padded_features.npy', padded_data)

In [7]:
# Reload the padded array from padded_features.npy
pad_dat = np.load('padded_features.npy')

In [None]:
# Standardize the reloaded padded features (a new object is returned, inplace defaults to False)
data_norm = z_score(pad_dat)


In [13]:
# Load all processed label lines
with open('./data/processed/IEMOCAP/processed_label.txt') as label_file:
    label_data = list(label_file)

# NOTE(review): full_cat_label is not defined in the visible cells; presumably
# it comes from `util` or an earlier cell. Confirm before running.
# Write only the label lines whose entry in full_cat_label is not '-1'.
with open('./data/processed/IEMOCAP/FC_label.txt', 'w') as out_file:
    out_file.writelines(
        label_data[i]
        for i, label in enumerate(full_cat_label)
        if label != '-1'
    )


## Save padded features as a pickle file

In [None]:
# Store the padded feature array as a pickle file
with open('padded_features.pkl', 'wb') as pkl_handle:
    pickle.dump(padded_data, pkl_handle)
