In [1]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import os
# import ffmpeg
import matplotlib.pyplot as plt
import librosa
import librosa.display
from pydub import AudioSegment as AS
from pydub import effects
import noisereduce as nr
import tensorflow as tf
import tensorflow_io as tfio
import keras
import sklearn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import sys
import h5py
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Input,Dense, Conv2D, Flatten, MaxPooling2D, Dropout, BatchNormalization, Conv1D, MaxPooling1D
from keras.layers import Bidirectional, LSTM, Reshape
from keras.regularizers import l2
from keras.callbacks import ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
import pickle
import datetime
import joblib



In [2]:
# Number of samples every signal is padded or truncated to before feature
# extraction (read by pad_signal_librosa).
# NOTE(review): presumably the longest trimmed clip in the corpora
# (about 7.1 s at 22050 Hz) - confirm before changing.
MAX_LENGTH = 157409

In [3]:
def get_signal(df):
    """Load, peak-normalise and silence-trim the audio file behind every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``path`` column holding audio file paths that
        ``librosa.load`` can read.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (it is modified in place) with two added columns:
        ``trimmed_librosa_signals`` (one array per row) and ``librosa_lens``
        (the number of samples in each trimmed array).
    """
    trimmed_signals = []
    for path in df['path']:
        # librosa.load is called with its defaults; the returned sample rate is not needed here.
        signal, _ = librosa.load(path)
        normalized = librosa.util.normalize(signal)
        # trim() also returns the kept interval; it is discarded so that it can
        # no longer clobber a loop variable as it did in the previous version.
        trimmed, _ = librosa.effects.trim(normalized, top_db=30)
        trimmed_signals.append(trimmed)

    df['trimmed_librosa_signals'] = trimmed_signals
    df['librosa_lens'] = [len(trimmed) for trimmed in trimmed_signals]
    return df

In [4]:
def split_train_validation_test(df, train_size=0.8, random_state=None):
    """Split ``df`` into train, validation and test partitions.

    ``train_size`` is the fraction of rows kept for training; the remaining
    rows are divided equally between validation and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    train_size : float, default 0.8
        Fraction of rows for the training partition, strictly between 0 and 1.
    random_state : int or None, default None
        Forwarded to both ``train_test_split`` calls. ``None`` keeps the
        previous unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(train_df, validation_df, test_df)``.

    Raises
    ------
    ValueError
        If ``train_size`` is not strictly between 0 and 1.
    """
    if not 0 < train_size < 1:
        raise ValueError(f'train_size must be strictly between 0 and 1, got {train_size!r}')

    train_df, remainder_df = train_test_split(
        df, test_size=1 - train_size, random_state=random_state
    )
    # The held-out remainder is halved into validation and test.
    validation_df, test_df = train_test_split(
        remainder_df, test_size=0.5, random_state=random_state
    )
    return train_df, validation_df, test_df

In [5]:
def get_features(data, sr=22050):
    """Return the decibel-scaled mel spectrogram of an audio signal.

    Parameters
    ----------
    data : numpy.ndarray
        Audio samples, passed to ``librosa.feature.melspectrogram`` as ``y``.
    sr : int, default 22050
        Sample rate of ``data``. The default matches the rate that was
        previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Output of ``librosa.power_to_db`` applied to the mel spectrogram.
    """
    mel_spectrogram = librosa.feature.melspectrogram(y=data, sr=sr)
    return librosa.power_to_db(mel_spectrogram)

def add_noise(data):
    """Return ``data`` with random Gaussian noise added.

    The noise amplitude is ``0.005 * u * max(data)`` with ``u`` drawn from
    ``np.random.uniform()``; one normal sample is drawn per element along the
    first axis of ``data``.
    """
    amplitude = 0.005*np.random.uniform()*np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    noisy = data.astype('float32') + amplitude * gaussian
    return noisy


def add_time_stretching_slower(data, rate=0.5):
    """Time-stretch ``data`` with ``librosa.effects.time_stretch`` at ``rate`` (default 0.5)."""
    return librosa.effects.time_stretch(data, rate=rate)


def add_time_stretching_faster(data, rate=2.0):
    """Time-stretch ``data`` with ``librosa.effects.time_stretch`` at ``rate`` (default 2.0)."""
    return librosa.effects.time_stretch(data, rate=rate)


def add_time_shifting(data):
    """Circularly shift ``data`` forward by 2205 positions (a tenth of 22050) via ``np.roll``."""
    offset = 22050 // 10
    return np.roll(data, offset)


def add_pitch_shifting(data):
    """Pitch-shift ``data`` by ``n_steps=-5`` with ``librosa.effects.pitch_shift`` at ``sr=22050``."""
    return librosa.effects.pitch_shift(y=data, sr=22050, n_steps=-5)


def add_freq_mask(df):
    """Add a ``freq_mask`` column: ``tfio.audio.freq_mask(..., param=4)`` of each ``features`` entry.

    Each masked result is stored as a float32 NumPy array. ``df`` is modified
    in place and also returned.
    """
    def _masked(spectrogram):
        return np.array(tfio.audio.freq_mask(spectrogram, param=4), dtype=np.float32)

    df['freq_mask'] = df['features'].apply(_masked)
    return df


def add_time_mask(df):
    """Add a ``time_mask`` column: ``tfio.audio.time_mask(..., param=4)`` of each ``features`` entry.

    Each masked result is stored as a float32 NumPy array. ``df`` is modified
    in place and also returned.
    """
    def _masked(spectrogram):
        return np.array(tfio.audio.time_mask(spectrogram, param=4), dtype=np.float32)

    df['time_mask'] = df['features'].apply(_masked)
    return df

def augument_mfcc(df):
    """Return a copy of ``df`` extended by ``add_freq_mask`` and then ``add_time_mask``.

    Each step receives a fresh copy, so the caller's frame is left untouched.
    """
    with_freq_mask = add_freq_mask(df.copy())
    with_both_masks = add_time_mask(with_freq_mask.copy())
    return with_both_masks

def melt_mfcc_df(df):
    """Unpivot every non-``label`` column of ``df`` into one ``masked_features`` column.

    Returns a frame with exactly the columns ``label`` and ``masked_features``;
    each input row appears once per melted column.
    """
    long_form = df.melt(id_vars=["label"], value_name="masked_features")
    wanted_columns = ['label', 'masked_features']
    return long_form[wanted_columns]

def pad_signal_librosa(data, max_length = 157409):
    """Pad or truncate ``data`` to exactly ``max_length`` samples.

    Parameters
    ----------
    data : numpy.ndarray
        Signal handed to ``librosa.util.fix_length``.
    max_length : int, default 157409
        Target length. The default equals the notebook-level ``MAX_LENGTH``
        constant; previously this argument was ignored and the global was
        always used, so calls relying on the default behave as before.

    Returns
    -------
    numpy.ndarray
        The length-adjusted signal.
    """
    fixed = librosa.util.fix_length(data=data, size=max_length)
    return np.array(fixed)


# def pad_signal_pydub(data, max_length = 157409):
#     data =librosa.util.fix_length(data=data, size=MAX_LENGTH)
#     final_data = nr.reduce_noise(y=data, 
#                           y_noise=data, 
#                           sr=22050)
#     return np.array(final_data, dtype = np.float32)

In [6]:
def augment_signal_librosa(
    df,
    noise=False,
    time_shifting=False,
    pitch_shifting=False,
    time_stretching_faster=False,
    time_stretching_slower=False):
    """Extract features for every row's ``signal``, plus one column per enabled augmentation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``signal`` column. The frame is modified in place.
    noise, time_shifting, pitch_shifting, time_stretching_faster, time_stretching_slower : bool
        Enable the matching augmentation. Each enabled flag adds one column:
        ``noised_features``, ``time_shifted_features``, ``pitched_features``,
        ``faster_features`` and ``slower_features`` respectively.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``unaug_features`` (always) and the enabled augmentation
        columns appended. Every value is the result of ``get_features`` on a
        signal brought to fixed length by ``pad_signal_librosa``.
    """
    def _pad_then_extract(signal):
        # Fix the length first so every feature array is built from the same number of samples.
        return get_features(pad_signal_librosa(signal))

    # (enabled, output column, log message, transform), listed in the order the
    # columns are appended to the frame.
    augmentations = [
        (noise, 'noised_features', 'Done adding noise',
         lambda s: _pad_then_extract(add_noise(s))),
        (pitch_shifting, 'pitched_features', 'Done pitch_shifting',
         lambda s: _pad_then_extract(add_pitch_shifting(s))),
        # Time shifting is the one augmentation applied AFTER padding: the
        # padded signal is rolled, so the length is already fixed.
        (time_shifting, 'time_shifted_features', 'Done time_shifting',
         lambda s: get_features(add_time_shifting(pad_signal_librosa(s)))),
        (time_stretching_faster, 'faster_features', 'Done time_stretching_faster',
         lambda s: _pad_then_extract(add_time_stretching_faster(s))),
        (time_stretching_slower, 'slower_features', 'Done time_stretching_slower',
         lambda s: _pad_then_extract(add_time_stretching_slower(s))),
    ]

    signals = [np.array(signal) for signal in df['signal']]

    df['unaug_features'] = [_pad_then_extract(signal) for signal in signals]
    print('Done with the unaugumented samples')

    for enabled, column, message, transform in augmentations:
        if enabled:
            df[column] = [transform(signal) for signal in signals]
            print(message)

    return df


# def augment_signal_pydub(
#     df,
#     noise=False,
#     time_shifting=False,
#     pitch_shifting=False,
#     time_stretching_faster=False,
#     time_stretching_slower=False):
    

#     noised_signal = []
#     pitched_signal = []
#     time_shifted_signal = []
#     faster_signal = []
#     slower_signal = []
#     features = []
    
#     for index, row in df.iterrows():
#         signal = np.array(row['signal'])
         
#         if noise:
#             noised = add_noise(signal)
#             noised = pad_signal_pydub(noised)
#             noised = get_features(noised)
#             noised_signal.append(noised)

            
#         if pitch_shifting:
#             pitched = add_pitch_shifting(signal)
#             pitched = pad_signal_pydub(pitched)
#             pitched = get_features(pitched)
#             pitched_signal.append(pitched)
            
#         if time_shifting:
#             time = pad_signal_pydub(signal)
#             time = add_time_shifting(time)
#             time = get_features(time)
#             time_shifted_signal.append(time)
                
#         if time_stretching_faster:
#             faster = add_time_stretching_faster(signal)
#             faster = pad_signal_pydub(faster)
#             faster = get_features(faster)
#             faster_signal.append(faster)
            
#         if time_stretching_slower:
#             slower = add_time_stretching_slower(signal)
#             slower = pad_signal_pydub(slower)
#             slower = get_features(slower)
#             slower_signal.append(slower)
            
#         padded = pad_signal_pydub(signal)
        
        
#         feature = get_features(padded)
#         features.append(feature)

#     df['unaug_features'] = features
#     print('Done with the unaugumented samples')
    
#     if noise:
#         df['noised_features'] = noised_signal
#         print('Done adding noise')
        
#     if pitch_shifting:
#         df['pitched_features'] = pitched_signal
#         print('Done pitch_shifting')
        
#     if time_shifting:
#         df['time_shifted_features'] = time_shifted_signal
#         print('Done time_shifting')
        
#     if time_stretching_faster:
#         df['faster_features'] = faster_signal
#         print('Done time_stretching_faster')
        
#     if time_stretching_slower:
#         df['slower_features'] = slower_signal
#         print('Done time_stretching_slower')
        
    

#     return df

In [7]:
def melt_signal_df(df):
    """Stack all ``*_features`` columns of ``df`` into one long ``features`` column.

    Only ``label`` and the columns whose name ends in ``_features`` are used,
    so the frame does not need an ``emotion`` column (frames labelled through
    ``sex_emotion`` work as well).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label`` and, typically, one or more ``*_features`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``features``; each input row appears once per
        feature column, feature columns in their original order.
    """
    feature_columns = list(df.filter(regex='_features$'))
    long_df = df[['label'] + feature_columns].melt(
        id_vars=['label'],
        value_name='features'
    )
    return long_df[['label', 'features']]

In [8]:
# Load the pickled per-corpus DataFrames from the working directory.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created or trust.
CREMA_df = pd.read_pickle('CREMA_M_df.pkl')
RAVDESS_df = pd.read_pickle('RAVDESS_M_df.pkl')
SAVEE_df = pd.read_pickle('SAVEE_df.pkl')
# TESS_df = pd.read_pickle('TESS_df.pkl')

In [9]:
# Stack the three corpora row-wise and rebuild a clean 0..n-1 index.
all_df = pd.concat([RAVDESS_df, SAVEE_df, CREMA_df]).reset_index(drop=True)

In [10]:
# Load, normalise and trim every audio file; get_signal adds the
# `trimmed_librosa_signals` and `librosa_lens` columns.
all_df = get_signal(all_df.copy())

In [11]:
# Preview the first rows to sanity-check the loaded signals and their lengths.
all_df.head()

Unnamed: 0,emotion,path,trimmed_librosa_signals,librosa_lens
0,neutral,E:\Dissertation/RAVDESS/audio_speech_actors_01...,"[-0.00012082751, -0.0005314536, -0.00022877126...",29184
1,neutral,E:\Dissertation/RAVDESS/audio_speech_actors_01...,"[-0.0005309384, 0.00028353286, 0.00044418397, ...",29696
2,neutral,E:\Dissertation/RAVDESS/audio_speech_actors_01...,"[2.0554473e-05, 0.00054405327, 0.00017764534, ...",29184
3,neutral,E:\Dissertation/RAVDESS/audio_speech_actors_01...,"[0.0012361169, 0.0013529633, 0.00049721624, 0....",27648
4,happiness,E:\Dissertation/RAVDESS/audio_speech_actors_01...,"[-4.820062e-05, 3.8167676e-05, -4.1627845e-05,...",34304


In [12]:
# Keep only what the augmentation step needs: the emotion label and the trimmed signal.
librosa_df = pd.DataFrame({
    'emotion': all_df['emotion'],
    'signal': all_df['trimmed_librosa_signals'],
})

In [13]:
# pydub_df = pd.DataFrame()
# pydub_df['emotion'] = all_df['emotion']
# pydub_df['signal'] = all_df['trimmed_pydub_signals']

In [14]:
# all_df is no longer needed: the columns used downstream now live in librosa_df.
del all_df

In [15]:
def augument_samples(df, encoder_name):
    """Encode labels, split the data and build feature frames for each split.

    Steps:
      1. One-hot encode the label column (``emotion`` if present, otherwise
         ``sex_emotion``) into a new ``label`` column.
      2. Persist the fitted encoder to ``{encoder_name}_ohe.joblib`` and the
         sorted class names to ``{encoder_name}_classes.npy``.
      3. Split into train / validation / test with ``split_train_validation_test``.
      4. Run ``augment_signal_librosa`` and ``melt_signal_df`` on each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``signal`` column and an ``emotion`` or ``sex_emotion``
        column. The caller's frame is not modified.
    encoder_name : str
        Path prefix for the two encoder artefacts written to disk.

    Returns
    -------
    tuple
        ``(train_df, validation_df, test_df)`` as returned by ``melt_signal_df``.
    """
    print(f'{datetime.datetime.now()}: Start augmenting...')
    set_df = df.copy().reset_index(drop=True)

    ######## encoder #########
    label_col = 'emotion' if 'emotion' in set_df.columns else 'sex_emotion'

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; try the new spelling first and fall back for older releases.
    try:
        ohc = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohc = OneHotEncoder(sparse=False)
    ohc_labels = ohc.fit_transform(set_df[label_col].values.reshape(-1, 1))
    joblib.dump(ohc, f'{encoder_name}_ohe.joblib')

    # Only the fitted class list is needed from the label encoder.
    le = LabelEncoder()
    le.fit(set_df[label_col])
    np.save(f'{encoder_name}_classes', le.classes_)

    # One one-hot row per sample; the index was reset above so rows align positionally.
    set_df['label'] = pd.Series(list(ohc_labels))

    unprocessed_train_df, unprocessed_validation_df, unprocessed_test_df = split_train_validation_test(set_df)
    del set_df

    # Augmentation switches forwarded to augment_signal_librosa. Supported keys:
    # 'noise', 'time_shifting', 'pitch_shifting', 'time_stretching_faster',
    # 'time_stretching_slower'. Both are empty, so only un-augmented features are built.
    train_config = {}
    valid_config = {}

    def _build_features(split_df, config):
        # Work on a copy because augment_signal_librosa adds columns in place.
        return melt_signal_df(augment_signal_librosa(split_df.copy(), **config))

    # train
    train_df = _build_features(unprocessed_train_df, train_config)
    del unprocessed_train_df
    print(f'{datetime.datetime.now()}: Done train set')

    # validation
    validation_df = _build_features(unprocessed_validation_df, valid_config)
    del unprocessed_validation_df
    print(f'{datetime.datetime.now()}: Done validation set')

    # test (uses the same configuration as validation)
    test_df = _build_features(unprocessed_test_df, valid_config)
    del unprocessed_test_df

    return train_df, validation_df, test_df

In [16]:
# Build the train/validation/test feature frames; augument_samples also writes the
# fitted encoder artefacts using the 'librosa_all_samples' prefix.
all_samples_train, all_samples_valid, all_samples_test = augument_samples(librosa_df, encoder_name = 'librosa_all_samples')
# Persist each split so later steps can load them without recomputing features.
all_samples_train.to_pickle('librosa_all_samples_train.pkl')
all_samples_valid.to_pickle('librosa_all_samples_valid.pkl')
all_samples_test.to_pickle('librosa_all_samples_test.pkl')
# Drop the in-memory frames now that they have been written to disk.
del librosa_df
del all_samples_train
del all_samples_valid
del all_samples_test

2022-06-15 00:13:04.944482: Start augmenting...
Done with the unaugumented samples
2022-06-15 00:14:01.313478: Done train set
Done with the unaugumented samples
2022-06-15 00:14:09.216752: Done validation set
Done with the unaugumented samples


In [17]:
# all_samples_train, all_samples_valid, all_samples_test = augument_samples(pydub_df, encoder_name = 'pydub_all_samples', data = 'pydub')
# all_samples_train.to_pickle('pydub_all_samples_train.pkl')
# all_samples_valid.to_pickle('pydub_all_samples_valid.pkl')
# all_samples_test.to_pickle('pydub_all_samples_test.pkl')
# del pydub_df
# del all_samples_train
# del all_samples_valid
# del all_samples_test

In [18]:
# ex1 = pydub_df[:10]
# ex2 = librosa_df[:10]

In [19]:
# ex1_train, ex1_valid, ex1_test = augument_samples(ex1, encoder_name = 'test', data = 'pydub')

In [20]:
# ex1_train

In [21]:
# test = np.stack([np.array(val) for val in ex1_train['masked_features'].values], axis=0)

In [22]:
# test.shape

In [23]:
# ex1_valid

In [24]:
# ex1_test

In [25]:
# ex2_train, ex2_valid, ex2_test = augument_samples(ex2, encoder_name = 'test2', data = 'librosa')

In [26]:
# ex2_train

In [27]:
# ex2_valid

In [28]:
# ex2_test