# In [None]:
import pandas as pd
import numpy as np
import os
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
import keras
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import to_categorical


# Root directories of the four speech-emotion corpora; each one is handed to
# the matching load_* function further down in this file.
# NOTE(review): these are absolute paths starting at "/" - presumably
# placeholders for wherever the datasets are mounted; confirm before running.
RAVDESS_PATH = "/ravdess/audio_speech_actors_01-24/"
CREMA_PATH = "/cremad/AudioWAV/"
TESS_PATH = "/TESS Toronto emotional speech set data/"
SAVEE_PATH = "/savee/ALL/"

# RAVDESS dataset
def load_ravdess(path):
    """Index the RAVDESS corpus as a table of emotion labels and file paths.

    ``path`` is expected to contain one sub-directory per actor. File names
    are dash-separated numeric fields; the third field is the emotion code.
    That code is translated to the same lowercase emotion names the other
    loaders in this file emit, so labels from all corpora share one
    vocabulary once the tables are concatenated.

    Entries of ``path`` that are not directories are skipped, as are files
    whose name has fewer than three dash-separated fields or a non-numeric
    third field. A numeric code outside 1-8 is labelled ``'Unknown'``
    (the same fallback ``load_crema`` uses).

    Args:
        path: Directory holding the per-actor sub-directories.

    Returns:
        pandas.DataFrame with columns ``Emotions`` (emotion name) and
        ``Path`` (full path of the file), one row per accepted file.
    """
    # RAVDESS emotion codes, spelled like the labels of the other corpora.
    emotion_names = {
        1: 'neutral',
        2: 'calm',
        3: 'happy',
        4: 'sad',
        5: 'angry',
        6: 'fear',
        7: 'disgust',
        8: 'surprise',
    }
    file_emotion, file_path = [], []

    for actor_dir in os.listdir(path):
        actor_path = os.path.join(path, actor_dir)
        # A stray file next to the actor folders would make listdir() raise.
        if not os.path.isdir(actor_path):
            continue

        for file in os.listdir(actor_path):
            filename_parts = file.split('.')[0].split('-')
            if len(filename_parts) < 3:
                continue
            try:
                emotion_code = int(filename_parts[2])
            except ValueError:
                # Not a RAVDESS-style name; treat like any other malformed name.
                continue

            file_emotion.append(emotion_names.get(emotion_code, 'Unknown'))
            file_path.append(os.path.join(actor_path, file))

    emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
    path_df = pd.DataFrame(file_path, columns=['Path'])
    return pd.concat([emotion_df, path_df], axis=1)


# CREMA dataset
def load_crema(path):
    """Index the CREMA corpus as a table of emotion labels and file paths.

    Every entry of ``path`` is taken as an audio file whose name is split on
    ``_``; the third field is the emotion code. Codes without an entry in
    the lookup table are labelled ``'Unknown'``.

    Args:
        path: Directory that directly contains the audio files.

    Returns:
        pandas.DataFrame with columns ``Emotions`` and ``Path``, one row per
        directory entry.

    Raises:
        IndexError: If a file name has fewer than three ``_``-separated
            fields.
    """
    code_to_emotion = {
        'SAD': 'sad',
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fear',
        'HAP': 'happy',
        'NEU': 'neutral',
    }

    names = os.listdir(path)
    file_path = [os.path.join(path, name) for name in names]
    file_emotion = [
        code_to_emotion.get(name.split('_')[2], 'Unknown') for name in names
    ]

    frames = [
        pd.DataFrame(file_emotion, columns=['Emotions']),
        pd.DataFrame(file_path, columns=['Path']),
    ]
    return pd.concat(frames, axis=1)

# TESS dataset
def load_tess(path):
    """Index the TESS corpus as a table of emotion labels and file paths.

    ``path`` is expected to contain sub-directories of audio files. Each
    file name (extension removed) is split on ``_`` and the third field is
    used as the emotion label, with ``'ps'`` rewritten to ``'surprise'``.

    Entries of ``path`` that are not directories, and files whose name has
    fewer than three ``_``-separated fields (hidden or metadata files, for
    instance), are skipped instead of raising.

    Args:
        path: Directory holding the TESS sub-directories.

    Returns:
        pandas.DataFrame with columns ``Emotions`` and ``Path``, one row per
        accepted file.
    """
    file_emotion, file_path = [], []

    for sub_dir in os.listdir(path):
        sub_dir_path = os.path.join(path, sub_dir)
        # listdir() on a plain file raises NotADirectoryError.
        if not os.path.isdir(sub_dir_path):
            continue

        for file in os.listdir(sub_dir_path):
            name_fields = file.split('.')[0].split('_')
            if len(name_fields) < 3:
                continue

            part = name_fields[2]
            file_emotion.append('surprise' if part == 'ps' else part)
            file_path.append(os.path.join(sub_dir_path, file))

    emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
    path_df = pd.DataFrame(file_path, columns=['Path'])
    return pd.concat([emotion_df, path_df], axis=1)

# SAVEE dataset
def load_savee(path):
    """Index the SAVEE corpus as a table of emotion labels and file paths.

    Every entry of ``path`` is taken as an audio file. The text after the
    first ``_`` in the name, with its last six characters dropped, is the
    emotion prefix. Prefixes without an entry in the lookup table are
    labelled ``'surprise'``.

    Args:
        path: Directory that directly contains the audio files.

    Returns:
        pandas.DataFrame with columns ``Emotions`` and ``Path``, one row per
        directory entry.

    Raises:
        IndexError: If a file name contains no ``_``.
    """
    prefix_to_emotion = {
        'a': 'angry',
        'd': 'disgust',
        'f': 'fear',
        'h': 'happy',
        'n': 'neutral',
        'sa': 'sad',
    }

    names = os.listdir(path)
    file_path = [os.path.join(path, name) for name in names]
    # Anything that is not one of the known prefixes falls through to 'surprise'.
    file_emotion = [
        prefix_to_emotion.get(name.split('_')[1][:-6], 'surprise')
        for name in names
    ]

    frames = [
        pd.DataFrame(file_emotion, columns=['Emotions']),
        pd.DataFrame(file_path, columns=['Path']),
    ]
    return pd.concat(frames, axis=1)

# Feature extraction function
def extract_features(data, sample_rate):
    """Summarise an audio signal as one flat feature vector.

    Five librosa features are computed (zero-crossing rate, chroma from the
    STFT magnitude, MFCC, RMS and mel spectrogram). Each feature matrix is
    transposed and averaged over axis 0, and the resulting vectors are
    stacked in that order.

    Args:
        data: Audio samples, as returned by ``librosa.load``.
        sample_rate: Sampling rate passed to the librosa feature functions.

    Returns:
        numpy.ndarray produced by ``np.hstack`` of the five averaged
        features (zcr, chroma, mfcc, rms, mel).
    """
    def averaged(feature_matrix):
        # Collapse the transposed matrix along its first axis.
        return np.mean(feature_matrix.T, axis=0)

    zcr = averaged(librosa.feature.zero_crossing_rate(y=data))

    stft_magnitude = np.abs(librosa.stft(data))
    chroma_stft = averaged(
        librosa.feature.chroma_stft(S=stft_magnitude, sr=sample_rate)
    )

    mfcc = averaged(librosa.feature.mfcc(y=data, sr=sample_rate))
    rms = averaged(librosa.feature.rms(y=data))
    mel = averaged(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    feature_parts = (zcr, chroma_stft, mfcc, rms, mel)
    return np.hstack(feature_parts)

# Index each corpus; every loader returns a table with Emotions and Path columns
ravdess_df = load_ravdess(RAVDESS_PATH)
crema_df = load_crema(CREMA_PATH)
tess_df = load_tess(TESS_PATH)
savee_df = load_savee(SAVEE_PATH)

# Stack the four tables row-wise into one index of labelled clips
corpus_frames = [ravdess_df, crema_df, tess_df, savee_df]
data_path = pd.concat(corpus_frames, axis=0)

# Feature matrix: load a 2.5 s window of each clip, starting 0.6 s in, and
# reduce it to a single feature vector (one row per clip)
X = np.array([
    extract_features(*librosa.load(clip_path, duration=2.5, offset=0.6))
    for clip_path in data_path.Path
])

# Label vector, in the same row order as X
Y = np.array(list(data_path.Emotions))

# One-hot encode the labels; the encoder is fed the labels as a single column
encoder = OneHotEncoder()
label_column = Y.reshape(-1, 1)
Y_encoded = encoder.fit_transform(label_column).toarray()

# Persist the fitted encoder
joblib.dump(encoder, 'encoder.joblib', protocol=4)

# Shuffled train/test partition with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y_encoded,
    shuffle=True,
    random_state=0,
)

# Standardise the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Persist the fitted scaler
joblib.dump(scaler, 'scaler.joblib', protocol=4)

# Append a trailing axis of length 1 (new axis 2), matching the
# (features, 1) input shape the Conv1D model is built with
x_train_scaled = x_train_scaled[:, :, np.newaxis]
x_test_scaled = x_test_scaled[:, :, np.newaxis]

# Model definition: four Conv1D + MaxPooling1D stages (dropout after the
# third), then a small dense head ending in a softmax over the label classes
n_features = x_train_scaled.shape[1]
n_classes = len(encoder.categories_[0])

# Settings shared by every convolution / pooling layer
conv_kwargs = dict(kernel_size=5, strides=1, padding='same', activation='relu')
pool_kwargs = dict(pool_size=5, strides=2, padding='same')

model = Sequential([
    Conv1D(256, input_shape=(n_features, 1), **conv_kwargs),
    MaxPooling1D(**pool_kwargs),

    Conv1D(256, **conv_kwargs),
    MaxPooling1D(**pool_kwargs),

    Conv1D(128, **conv_kwargs),
    MaxPooling1D(**pool_kwargs),
    Dropout(0.2),

    Conv1D(64, **conv_kwargs),
    MaxPooling1D(**pool_kwargs),

    Flatten(),
    Dense(units=32, activation='relu'),
    Dropout(0.3),

    # One output unit per category learned by the label encoder
    Dense(units=n_classes, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary
model.summary()


# Training callbacks:
#  - multiply the learning rate by 0.4 when the training loss has not
#    improved for 2 epochs, never going below 1e-7
#  - write the model to disk whenever validation accuracy reaches a new best
rlrp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.4,
    patience=2,
    min_lr=1e-7,
    verbose=0,
)
checkpoint = ModelCheckpoint(
    "speech_model.h5",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Fit for 50 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint is selected on the same data that is reported below.
history = model.fit(
    x_train_scaled,
    y_train,
    validation_data=(x_test_scaled, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[rlrp, checkpoint],
)

# Score the in-memory model (its state after the last epoch) on the test
# split; entry 1 of the result is the accuracy metric set in compile()
scores = model.evaluate(x_test_scaled, y_test)
accuracy = scores[1] * 100
print("Accuracy of the model on test data:", accuracy, "%")

