In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Root directories of the three datasets. All of them are relative paths, so
# they are resolved against the current working directory when listed.
# RAVDESS: the loader expects one sub-directory per actor under this root.
RAVDESS_PATH = 'ravdess/audio_speech_actors_01-24/'
# CREMA: the loader expects a single flat directory of recordings.
CREMA_PATH = 'cremad/AudioWAV/'
# "Audio emotions": the loader expects one sub-directory per emotion name
# (note that this path contains a space).
AUDIO_EMOTIONS_PATH = 'audio emotions/Emotions/'

# Loading RAVDESS dataset into DataFrame
def load_ravdess(path):
    """Index the RAVDESS recordings stored under *path*.

    *path* is expected to contain one sub-directory per actor. Each file name
    is split on '-' (after dropping everything from the first '.'), and the
    third field is read as the integer emotion code.

    Entries that cannot be interpreted are skipped instead of aborting the
    whole scan: non-directory entries directly under *path*, file names with
    fewer than three dash-separated fields, and file names whose third field
    is not an integer.

    Args:
        path: Root directory holding the per-actor sub-directories.

    Returns:
        A DataFrame with an integer 'Emotions' column (the raw emotion code)
        and a 'Path' column (the path of the recording), one row per file.
    """
    file_emotion, file_path = [], []

    # sorted() keeps the row order independent of the filesystem's listing
    # order, so a seeded train/test split downstream is reproducible.
    for actor_dir in sorted(os.listdir(path)):
        actor_path = os.path.join(path, actor_dir)
        if not os.path.isdir(actor_path):
            # Stray file next to the actor folders (e.g. a README or .DS_Store).
            continue

        for file in sorted(os.listdir(actor_path)):
            filename_parts = file.split('.')[0].split('-')
            if len(filename_parts) < 3:
                continue
            try:
                emotion = int(filename_parts[2])
            except ValueError:
                # Third field is not a numeric emotion code; not a usable sample.
                continue

            file_emotion.append(emotion)
            file_path.append(os.path.join(actor_path, file))

    return pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})

# Loading CREMA dataset into DataFrame
def load_crema(path):
    """Index the CREMA recordings stored directly under *path*.

    Each file name is split on '_' and the third field is looked up as the
    emotion code. Files with fewer than three underscore-separated fields are
    skipped, because they carry no emotion code. Files whose code is not one
    of the known ones are kept and labelled 'Unknown'.

    Args:
        path: Directory containing the recordings.

    Returns:
        A DataFrame with an 'Emotions' column (emotion name or 'Unknown') and
        a 'Path' column (the path of the recording), one row per kept file.
    """
    code_to_emotion = {
        'SAD': 'sad',
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fear',
        'HAP': 'happy',
        'NEU': 'neutral',
    }
    file_emotion, file_path = [], []

    # sorted() keeps the row order independent of the filesystem's listing order.
    for file in sorted(os.listdir(path)):
        parts = file.split('_')
        if len(parts) < 3:
            # Not a recording name (e.g. .DS_Store); indexing parts[2] would fail.
            continue
        file_path.append(os.path.join(path, file))
        file_emotion.append(code_to_emotion.get(parts[2], 'Unknown'))

    return pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})

# Loading audio emotions dataset into DataFrame
def load_audio_dataset(path):
    """Index a dataset laid out as one sub-directory per emotion under *path*.

    Only the seven emotion folder names listed below are visited; a folder
    that does not exist is silently ignored. Every entry inside an existing
    folder becomes one row labelled with the folder name.

    Returns:
        A DataFrame with an 'Emotions' column (the folder name) and a 'Path'
        column (the path of the entry).
    """
    labels, paths = [], []

    for label in ('angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised'):
        folder = os.path.join(path, label)
        if os.path.exists(folder):
            entries = os.listdir(folder)
            paths.extend(os.path.join(folder, entry) for entry in entries)
            labels.extend([label] * len(entries))

    return pd.concat(
        [
            pd.DataFrame(labels, columns=['Emotions']),
            pd.DataFrame(paths, columns=['Path']),
        ],
        axis=1,
    )

# Loading RAVDESS, CREMA, and AUDIO emotions datasets into DataFrames
ravdess_df = load_ravdess(RAVDESS_PATH)
crema_df = load_crema(CREMA_PATH)
audio_dataset_df = load_audio_dataset(AUDIO_EMOTIONS_PATH)

# Concatenate all datasets
all_datasets_df = pd.concat([ravdess_df, crema_df, audio_dataset_df], ignore_index=True)

# Bring every source onto one label vocabulary before encoding. Without this
# the same emotion is split across several classes: RAVDESS yields numeric
# codes, CREMA yields 'disgust'/'fear', and the folder dataset yields
# 'disgusted'/'fearful'/'surprised'.
# Numeric keys follow the RAVDESS filename convention (01=neutral, 02=calm,
# 03=happy, 04=sad, 05=angry, 06=fearful, 07=disgust, 08=surprised);
# 'calm' only exists in RAVDESS and is kept as its own class.
EMOTION_ALIASES = {
    '1': 'neutral',
    '2': 'calm',
    '3': 'happy',
    '4': 'sad',
    '5': 'angry',
    '6': 'fear',
    '7': 'disgust',
    '8': 'surprise',
    'disgusted': 'disgust',
    'fearful': 'fear',
    'surprised': 'surprise',
}
all_datasets_df['Emotions'] = all_datasets_df['Emotions'].astype(str).replace(EMOTION_ALIASES)

# Rows labelled 'Unknown' have no ground truth; keeping them would make the
# classifier learn a meaningless extra class.
all_datasets_df = all_datasets_df[all_datasets_df['Emotions'] != 'Unknown'].reset_index(drop=True)

# Encode labels
label_encoder = LabelEncoder()
all_datasets_df['label_encoded'] = label_encoder.fit_transform(all_datasets_df['Emotions'])

# Train-test split
train_df, val_df = train_test_split(all_datasets_df, test_size=0.2, random_state=42)

# Function to extract MFCC features
def extract_mfcc_features(audio_path, n_mfcc=20, max_frames=100, duration=2.5, offset=0.6):
    """Return a fixed-size MFCC matrix for one audio file.

    The file is loaded with ``librosa.load`` starting at *offset* seconds and
    reading at most *duration* seconds. The MFCC matrix is then cut or
    zero-padded along the frame axis so that every file yields the same shape.

    Args:
        audio_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute (rows of the result).
        max_frames: Number of frames to keep (columns of the result).
        duration: Maximum number of seconds to load.
        offset: Number of seconds to skip at the start of the file.

    Returns:
        An array of shape ``(n_mfcc, max_frames)``.
    """
    data, sample_rate = librosa.load(audio_path, duration=duration, offset=offset)
    mfcc_features = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=n_mfcc)

    # Truncate first (a no-op for short clips), then pad whatever is missing.
    mfcc_features = mfcc_features[:, :max_frames]
    missing_frames = max_frames - mfcc_features.shape[1]
    if missing_frames > 0:
        mfcc_features = np.pad(
            mfcc_features, pad_width=((0, 0), (0, missing_frames)), mode='constant'
        )
    return mfcc_features

# Apply feature extraction
# assign() builds new frames instead of writing a column into the slices
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(features=train_df['Path'].apply(extract_mfcc_features))
val_df = val_df.assign(features=val_df['Path'].apply(extract_mfcc_features))

# Converting features to numpy arrays
features_train = np.array(train_df['features'].tolist())
features_val = np.array(val_df['features'].tolist())
labels_train = np.array(train_df['label_encoded'])
labels_val = np.array(val_df['label_encoded'])

# Scale features
# StandardScaler works on 2-D input, so each sample is flattened to a single
# row; the statistics are fitted on the training set only and reused for
# validation.
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train.reshape(features_train.shape[0], -1))
features_val_scaled = scaler.transform(features_val.reshape(features_val.shape[0], -1))

# Reshape features back to their per-sample layout; the shape is taken from
# the unscaled arrays rather than hard-coded, so it follows whatever
# extract_mfcc_features produces.
features_train_scaled = features_train_scaled.reshape(features_train.shape)
features_val_scaled = features_val_scaled.reshape(features_val.shape)

# model build
# The input shape is taken from the prepared features (20 x 100 with the
# default MFCC settings) so the network always matches the data.
# NOTE(review): Conv1D slides its kernels along axis 1 and treats axis 2 as
# channels, so with this layout the convolution runs over the MFCC
# coefficients, not over time. Confirm this is intended before transposing,
# since any code that loads the saved model expects this input shape.
audio_input = Input(shape=features_train_scaled.shape[1:])
conv1d_1 = Conv1D(64, 3, activation='relu')(audio_input)
conv1d_2 = Conv1D(64, 3, activation='relu')(conv1d_1)
maxpool1d_1 = MaxPooling1D(pool_size=2)(conv1d_2)
conv1d_3 = Conv1D(128, 3, activation='relu')(maxpool1d_1)
conv1d_4 = Conv1D(128, 3, activation='relu')(conv1d_3)
maxpool1d_2 = MaxPooling1D(pool_size=2)(conv1d_4)
flatten1d_1 = Flatten()(maxpool1d_2)
dense1 = Dense(256, activation='relu')(flatten1d_1)
batch_norm_1 = BatchNormalization()(dense1)
dropout1 = Dropout(0.5)(batch_norm_1)
# One output unit per class known to the label encoder.
output = Dense(len(label_encoder.classes_), activation='softmax')(dropout1)

# Compile the model
# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# releases no longer accept.
# sparse_categorical_crossentropy matches the integer labels produced by
# LabelEncoder (no one-hot encoding needed).
model = Model(inputs=audio_input, outputs=output)
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model
history = model.fit(
    features_train_scaled,
    labels_train,
    validation_data=(features_val_scaled, labels_val),
    epochs=50,
    batch_size=128,
)

# Plot training & validation accuracy values
# Training curve first, then validation, matching the legend order below.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Save the model
model.save("speech_emotion_model.h5")
print("Model saved successfully.")

