<a href="https://colab.research.google.com/github/brancatellimat/speech-emotion-recognition/blob/main/DataAugmentation_FeatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf


import os
import sys
import math

# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import librosa.feature as libf
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, minmax_scale, scale
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint

from scipy.stats import kurtosis
from scipy.stats import skew

# Machine epsilon for Python floats (difference between 1.0 and the next representable float).
eps = sys.float_info.epsilon

import warnings

# Hide all warnings unless the interpreter was started with explicit -W options;
# DeprecationWarning is suppressed in either case.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import tensorflow as tf

# Ask TensorFlow for a GPU device name. When the returned string is empty,
# fall back to the CPU device string so `device_name` is always usable below.
device_name = tf.test.gpu_device_name()
if not device_name:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))
else:
    print("Found GPU at: {}".format(device_name))

No GPU, using /device:CPU:0.


### Data Augmentation
In this work, we generate additional audio samples by adding noise to the signal and by shifting its pitch.

In [None]:
# Index of the combined dataset; the cells below read its `Emotions` and `Path` columns.
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/Audio Pattern Recognition/Project_Brancatelli/final_combined_dataset.csv'
data_path = pd.read_csv(DATASET_CSV)
data_path.head()

Unnamed: 0,Emotions,Path
0,angry,/content/drive/MyDrive/Colab Notebooks/Audio P...
1,disgust,/content/drive/MyDrive/Colab Notebooks/Audio P...
2,disgust,/content/drive/MyDrive/Colab Notebooks/Audio P...
3,angry,/content/drive/MyDrive/Colab Notebooks/Audio P...
4,sad,/content/drive/MyDrive/Colab Notebooks/Audio P...


In [None]:
# NOISE
def noise(data, noise_factor=0.035):
    """Return a noisy copy of ``data`` (additive Gaussian noise).

    The noise amplitude is ``noise_factor * u * max(data)``, where ``u`` is a
    single draw from ``np.random.uniform()``; the amount of noise therefore
    varies from call to call and scales with the largest sample value.

    Parameters
    ----------
    data : array-like
        Audio samples. Any shape is accepted; the noise has the same shape.
    noise_factor : float, default 0.035
        Upper bound of the noise amplitude relative to ``max(data)``.

    Returns
    -------
    np.ndarray
        A new array; the input is never modified in place.

    Notes
    -----
    Uses NumPy's global random state, so call ``np.random.seed`` beforehand
    for reproducible augmentation.
    """
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to perturb; np.amax would raise on an empty array.
        return data.copy()
    noise_amp = noise_factor * np.random.uniform() * np.amax(data)
    # Draw with the full shape so multi-dimensional input is handled too
    # (for 1-D input this consumes the same random draws as size=data.shape[0]).
    return data + noise_amp * np.random.normal(size=data.shape)

# PITCH
def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift ``data`` with librosa.

    ``pitch_factor`` is handed to ``librosa.effects.pitch_shift`` as
    ``n_steps`` and ``sampling_rate`` as ``sr``; the shifted signal is returned.
    """
    shifted = librosa.effects.pitch_shift(
        y=data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

### Extract both Time and Frequency domain features

In [None]:
def zcr(data,frame_length,hop_length):
  """Zero-crossing rate of ``data``, computed by librosa with the given framing.

  The librosa result is passed through ``np.squeeze`` so that singleton axes
  are dropped before it is returned.
  """
  with tf.device(device_name):
    rates = librosa.feature.zero_crossing_rate(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
  return np.squeeze(rates)
def rmse(data,frame_length=2048,hop_length=512):
  """Root-mean-square energy of ``data``, computed by ``librosa.feature.rms``.

  Framing defaults to 2048-sample frames with a hop of 512 samples. Singleton
  axes of the librosa result are removed with ``np.squeeze``.
  """
  with tf.device(device_name):
    energy = librosa.feature.rms(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
  return np.squeeze(energy)
def mfcc(data,sr,frame_length=2048,hop_length=512,flatten:bool=True):
  """MFCCs of ``data`` computed with librosa.

  Parameters
  ----------
  data : np.ndarray
      Audio samples.
  sr : int
      Sampling rate of ``data``.
  frame_length : int, default 2048
      Analysis window size, forwarded to librosa as ``n_fft``.
  hop_length : int, default 512
      Hop between successive frames, forwarded to librosa.
  flatten : bool, default True
      If True, return the transposed coefficient matrix flattened to 1-D
      (``np.ravel``); otherwise return the transposed matrix with singleton
      axes removed (``np.squeeze``).

  Returns
  -------
  np.ndarray
      The (transposed) MFCC matrix, flattened or squeezed as described above.
  """
  with tf.device(device_name):
    # Forward the framing parameters: they used to be accepted but ignored, so
    # a non-default frame/hop would have left the MFCC frames out of step with
    # the ZCR and RMS features computed with the same arguments.
    coeffs = librosa.feature.mfcc(y=data, sr=sr, n_fft=frame_length, hop_length=hop_length)
  transposed = coeffs.T
  return np.ravel(transposed) if flatten else np.squeeze(transposed)

In [None]:
def extract_features(data,sr=22050,frame_length=2048,hop_length=512):
  """Build one flat feature vector for ``data``.

  The vector is the horizontal concatenation, in this order, of the outputs of
  ``zcr``, ``rmse`` and ``mfcc`` called with the same framing parameters.
  """
  with tf.device(device_name):
    pieces = [
        np.array([]),  # empty seed array, kept so stacking behaves exactly as before
        zcr(data, frame_length, hop_length),
        rmse(data, frame_length, hop_length),
        mfcc(data, sr, frame_length, hop_length),
    ]
    return np.hstack(pieces)

In [None]:
def get_features(path,duration=2.5, offset=0.6):
  """Load one audio file and extract features for it and its augmentations.

  Parameters
  ----------
  path : str
      Audio file to load with ``librosa.load``.
  duration : float, default 2.5
      Passed to ``librosa.load`` as ``duration``.
  offset : float, default 0.6
      Passed to ``librosa.load`` as ``offset``.

  Returns
  -------
  np.ndarray
      Four feature vectors stacked row-wise, in this order:
      original, noised, pitch-shifted, pitch-shifted then noised.
  """
  with tf.device(device_name):
    data, sr = librosa.load(path, duration=duration, offset=offset)

    # The pitch-shifted signal is needed twice. It was previously recomputed
    # from the same inputs; noise() returns a new array rather than modifying
    # its argument, so computing it once and reusing it is safe.
    pitched_audio = pitch(data, sr)

    # Tuple order fixes both the row order and the order in which noise()
    # consumes random draws (noised original first, noised pitch-shift last).
    variants = (
        data,                  # Original Audio
        noise(data),           # Noised Audio
        pitched_audio,         # Pitched Audio
        noise(pitched_audio),  # Noised and Pitched Audio
    )

    # Pass the actual sampling rate instead of relying on extract_features'
    # default happening to match the rate librosa.load resampled to.
    return np.vstack([extract_features(signal, sr) for signal in variants])

In [None]:
from joblib import Parallel, delayed
import timeit

with tf.device(device_name):
  start = timeit.default_timer()

  def process_feature(path, emotion):
      """Extract all feature rows for one audio file and label each of them.

      get_features returns one row per variant of the clip (the original plus
      its augmented versions: four rows in total), so the emotion label is
      repeated once per returned row instead of a hard-coded number of times.

      Returns a pair ``(X, Y)``: the list of feature rows and the equally long
      list of labels.
      """
      features = get_features(path)
      X = list(features)
      Y = [emotion] * len(X)
      return X, Y

  paths = data_path.Path
  emotions = data_path.Emotions

  # Run the per-file extraction in parallel (n_jobs=-1 asks joblib for all cores).
  results = Parallel(n_jobs=-1)(
      delayed(process_feature)(path, emotion) for (path, emotion) in zip(paths, emotions)
  )

  # Flatten the per-file results into one feature list and one label list.
  X = []
  Y = []
  for x, y in results:
      X.extend(x)
      Y.extend(y)

  stop = timeit.default_timer()

  print('Time: ', stop - start)

In [None]:
# Sanity check: number of feature rows, number of labels, and number of input files.
(len(X), len(Y), data_path["Path"].shape)

(1680, 1680, (420,))

In [None]:
# One row per feature vector, with the matching label stored in an 'Emotions' column.
Emotions = pd.DataFrame(X).assign(Emotions=Y)
Emotions.head()

In [None]:
# Persist the feature table to the working directory, without the DataFrame index.
Emotions.to_csv(path_or_buf='features.csv', index=False)

In [None]:
import shutil

# Move the exported feature table into Google Drive. The source is the same
# relative name the CSV was written under ('features.csv'), so the move no
# longer depends on the kernel's working directory being /content.
shutil.move('features.csv', '/content/drive/MyDrive/Colab Notebooks/Audio Pattern Recognition/Project_Brancatelli/features.csv')