### Importing Required Libraries

In [None]:

from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

# Load VGG16 pretrained on ImageNet without its fully connected classifier ("top")
base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
# NOTE(review): this network expects (224, 224, 3) image tensors, whereas the feature
# extraction cells in this notebook produce flat per-file feature vectors — confirm
# which input representation is intended before training.

# Freeze every layer except the last five so only those are fine-tuned
for layer in base_model.layers[:-5]:
    layer.trainable = False

# Custom classification head
x = Flatten()(base_model.output)
x = Dense(256, activation="relu", kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(8, activation="softmax")(x)  # 8 emotion classes

model = Model(inputs=base_model.input, outputs=x)
# The labels used in this notebook are integer class ids (LabelEncoder output), not
# one-hot vectors, so the sparse variant of categorical cross-entropy is required.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()


In [None]:
# Standard libraries for file handling and numerical operations
import os      
import random    
import numpy as np 
import pandas as pd  

# Displays progress bars for loops
from tqdm import tqdm 

# Audio processing libraries
import librosa
import librosa.display  

# Visualization libraries
import seaborn as sns  
import matplotlib.pyplot as plt  

# To evaluate model performance using a confusion matrix
from sklearn.metrics import confusion_matrix  

# Preprocessing utilities
from sklearn.model_selection import train_test_split 
# Converts categorical labels to numerical labels 
from sklearn.preprocessing import LabelEncoder  
# Normalizes features to improve model performance
from sklearn.preprocessing import StandardScaler  

# Deep learning model training using TensorFlow Keras
from tensorflow.keras.models import Sequential  
from tensorflow.keras.layers import Dense, Dropout  
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten  

# Model evaluation utilities
from tensorflow.keras import models as tf_models  
from sklearn.metrics import classification_report
import os
import librosa
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report

### Define Dataset Path

In [None]:
# Root folder that contains the RAVDESS sub-datasets listed below
dataset_root = r"I:\My Drive"
# Sub-folders of dataset_root to process; each name is also passed on as the "category" label
dataset_subfolders = ["Audio_Speech_Actors_01-24", "Audio_Song_Actors_01-24"]

### Define emotions based on RAVDESS Dataset

In [None]:
# Maps the two-digit emotion code (third "-"-separated field of a file name) to its label
emotion_map = {'01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
               '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'}


### Ensuring dataset_features directory exists

In [None]:
# Folder that receives the extracted feature CSV and the preprocessed .npy arrays
output_dir = r"I:\My Drive\SpeechEmotionDetection\dataset_features"
# exist_ok=True makes this cell safe to re-run when the folder is already present
os.makedirs(output_dir, exist_ok=True)

### Feature Extraction
### Audio Feature Extraction
This function extracts three key features from an audio file:
1. **MFCCs (Mel-Frequency Cepstral Coefficients)**
    - Captures timbral characteristics of speech.
    - Timbre refers to the quality or color of a sound that makes it distinct from other sounds, even when they have the same pitch and loudness.
2. **Chroma Features** 
    - Represents pitch class distribution in the audio.
3. **Mel Spectrogram** 
    - Shows energy distribution over different frequencies.

These features are essential for training a machine learning model to classify speech emotions.

In [None]:
def extract_features(file_path):
    """
    Build one flat feature vector for a single audio file.

    The file is loaded with ``sr=None`` (no resampling is requested), then three
    librosa representations are computed: 40 MFCCs, STFT-based chroma, and a
    Mel spectrogram. Each is averaged so that one value per coefficient/band
    remains, and the three averages are concatenated in that order
    (MFCC, chroma, Mel) into a 1-D numpy array.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)

    def average_over_frames(feature_matrix):
        # Transpose, then average over the leading axis: one mean per row of the
        # matrix returned by librosa.
        return np.mean(feature_matrix.T, axis=0)

    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    chroma_matrix = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
    mel_matrix = librosa.feature.melspectrogram(y=signal, sr=sample_rate)

    averaged_parts = [
        average_over_frames(matrix)
        for matrix in (mfcc_matrix, chroma_matrix, mel_matrix)
    ]
    return np.hstack(averaged_parts)

### Dataset Processing

In [None]:
# Function to process a dataset folder
def process_dataset(dataset_path, category, data):
    """
    Extract features for every .wav file under ``dataset_path`` and append them to ``data``.

    Args:
        dataset_path: Directory whose immediate sub-directories (one per actor)
            contain the audio files.
        category: Label stored with every row and shown in the progress output.
        data: List that is mutated in place; one row
            ``[file_path, category, emotion, *features]`` is appended per
            successfully processed file.

    Returns:
        None. A file that raises during processing is reported and skipped.
    """
    wav_files = []

    # os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order (and therefore the seeded train/test split later in the
    # notebook) reproducible across runs and machines.
    for actor_folder in sorted(os.listdir(dataset_path)):
        actor_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue

        for file_name in sorted(os.listdir(actor_path)):
            # Only .wav files are treated as audio (case-insensitive)
            if not file_name.lower().endswith(".wav"):
                continue
            wav_files.append((os.path.join(actor_path, file_name), file_name))

    # Display how many files were found in the category
    print(f"Processing {category}: {len(wav_files)} files")

    for file_path, file_name in tqdm(wav_files, desc=f"{category}"):
        try:
            features = extract_features(file_path)
            # The emotion code is the third "-"-separated field of the file name;
            # codes missing from emotion_map are labelled "unknown".
            emotion_code = file_name.split("-")[2]
            emotion = emotion_map.get(emotion_code, "unknown")
            data.append([file_path, category, emotion] + features.tolist())
        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Error processing {file_name}: {e}")


In [None]:
# Collect one feature row per audio file across all dataset sub-folders
data = []
for subfolder in dataset_subfolders:
    subfolder_path = os.path.join(dataset_root, subfolder)
    process_dataset(subfolder_path, subfolder, data)

if not data:
    print("No valid audio files found for feature extraction.")
else:
    # The first three entries of every row are metadata; the rest are numeric features
    n_features = len(data[0]) - 3
    columns = ["file_path", "category", "emotion"] + [f"feature_{i}" for i in range(n_features)]
    df = pd.DataFrame(data, columns=columns)

    # Persist the table as CSV (without the index column)
    csv_path = os.path.join(output_dir, "audio_features.csv")
    df.to_csv(csv_path, index=False)
    print("Feature extraction completed! Data saved to dataset_features/audio_features.csv")

In [None]:
# Re-read the exported feature table from disk to verify the export
features_csv_path = r"I:\My Drive\SpeechEmotionDetection\dataset_features\audio_features.csv"
df = pd.read_csv(features_csv_path)

# Preview the first five rows
df.head()

### Plotting Sample Waveform and MFCC

In [None]:
# Randomly select audio file from dataset
audio_file = random.choice(df['file_path'])

# Load audio file (sr=None: no resampling requested)
y, sr = librosa.load(audio_file, sr=None)

# Compute MFCCs
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

# Create a figure with two stacked subplots: waveform on top, MFCCs below
fig, ax = plt.subplots(nrows=2, figsize=(10, 6))

# Plot waveform.
# os.path.basename strips the directory part using the platform's path rules;
# the previous split('/') left the full path in the title for the
# backslash-separated paths produced by os.path.join on Windows.
ax[0].set_title(f"Waveform of {os.path.basename(audio_file)}")
librosa.display.waveshow(y, sr=sr, ax=ax[0])
ax[0].set_xlabel("Time")
ax[0].set_ylabel("Amplitude")

# Plot MFCC features
img = librosa.display.specshow(mfccs, x_axis="time", sr=sr, ax=ax[1], cmap="viridis")
ax[1].set_title("MFCC Features")
ax[1].set_xlabel("Time")
ax[1].set_ylabel("MFCC Coefficients")
fig.colorbar(img, ax=ax[1], format="%+2.f")

# Show plot
plt.tight_layout()
plt.show()

### Preprocessing Dataset

In [None]:
# Show the string labels before encoding
print(df['emotion'])

# Replace each emotion name with an integer class id.
# NOTE(review): the column is overwritten in place, so re-running this cell re-fits the
# encoder on already-encoded integers and label_encoder.classes_ loses the emotion names.
label_encoder = LabelEncoder()
emotion_ids = label_encoder.fit_transform(df["emotion"])
df["emotion"] = emotion_ids

# Show the encoded labels
df['emotion']

In [None]:
# Label and metadata columns that must not be used as model input features
exclude_columns = ["emotion", "file_path", "category"]

In [None]:
# Features: every column except the label and metadata columns
X = df.drop(columns=exclude_columns)
# Labels: the (encoded) emotion column
y = df["emotion"]

In [None]:
# Standardize features
# NOTE(review): the scaler is fitted on all samples before the train/test split that
# follows, so test-set statistics influence the scaling — consider fitting on the
# training split only and reusing the fitted scaler for the test split.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 30% of the samples for testing; random_state fixes the shuffle for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=28)
# Display the resulting shapes as the cell output
X_train.shape, X_test.shape, y_train.shape, y_test.shape


#### What is **.npy**?
* NumPy Binary Format is a file format used by NumPy to store arrays efficiently. 
* It saves arrays in a binary format that retains their structure and datatype.

In [None]:
# Persist the train/test splits as .npy files so later cells can reload them quickly
arrays_to_save = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for array_name, array in arrays_to_save.items():
    # One file per split, named after the variable it holds
    np.save(os.path.join(output_dir, f"{array_name}.npy"), array)

print("Data preprocessing completed! Datasets saved in dataset_features/")

### Model Training

In [None]:
# Reload the preprocessed splits that were written as .npy files
dataset_dir = r"I:\My Drive\SpeechEmotionDetection\dataset_features"
X_train, X_test, y_train, y_test = (
    np.load(os.path.join(dataset_dir, f"{split_name}.npy"))
    for split_name in ("X_train", "X_test", "y_train", "y_test")
)


### Defining CNN Model
- This model uses **1D convolutional layers** to extract meaningful features from **audio input sequences**.  
- It uses the **Sequential** model, a linear stack of layers in Keras for building neural networks in which each layer has exactly one input tensor and one output tensor. It is simple to use but only supports layer-by-layer stacking.
- **Pooling layers** reduce dimensions and prevent overfitting.  
- The **Dense layer and Dropout** enhance learning, followed by **Softmax** for final classification.  
- The final layer has neurons equal to the **number of unique emotions** in the dataset.


In [None]:

from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

# Load VGG16 pretrained on ImageNet without its fully connected classifier ("top")
base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
# NOTE(review): this network expects (224, 224, 3) image tensors, whereas the arrays
# loaded from the .npy files come from flat per-file feature vectors — confirm which
# input representation is intended before training.

# Freeze every layer except the last five so only those are fine-tuned
for layer in base_model.layers[:-5]:
    layer.trainable = False

# Custom classification head
x = Flatten()(base_model.output)
x = Dense(256, activation="relu", kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(8, activation="softmax")(x)  # 8 emotion classes

model = Model(inputs=base_model.input, outputs=x)
# The labels used in this notebook are integer class ids (LabelEncoder output), not
# one-hot vectors, so the sparse variant of categorical cross-entropy is required.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()


In [None]:
# Compile the model with the sparse variant of categorical cross-entropy, which accepts
# integer class ids (as produced by LabelEncoder) instead of one-hot vectors
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Print the layer-by-layer architecture overview
model.summary()

In [None]:

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

# Callbacks to limit overfitting: stop once validation loss has not improved for
# 10 epochs (restoring the best weights), and halve the learning rate after 5
# epochs without improvement.
early_stop = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5)

# Validate on a slice held out from the training data rather than on the test set.
# Early stopping, learning-rate scheduling and best-weight restoration all select
# the model using validation loss; feeding them (X_test, y_test) leaks the test set
# into model selection and inflates the metrics reported in the evaluation section.
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stop, reduce_lr])

# Plot Training vs Validation Loss
plt.figure(figsize=(6,4))
plt.plot(history.history['loss'], label="Training Loss", color='blue')
plt.plot(history.history['val_loss'], label="Validation Loss", color='orange')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.title("Training vs Validation Loss")
plt.show()


In [None]:
# Write the trained model to disk in HDF5 (.h5) format
model_path = r"I:\My Drive\SpeechEmotionDetection\models\speech_emotion_model.h5"
model.save(model_path)
print("Model trained and saved!")

### Model Evaluation 📊
- Model evaluation is the process of assessing how well a trained model performs on unseen data. It helps determine accuracy, robustness, and generalization before deployment.
- Once the model is trained, we evaluate its performance using various metrics:
    1. **Accuracy** - Measures overall correctness.
    2. **Precision & Recall** - Important for imbalanced datasets.
    3. **F1-Score** - Balances precision and recall.
    4. **Confusion Matrix** - Provides a detailed view of model errors.
    5. **Loss Function** - Determines model optimization.

We will use **Scikit-learn** to compute these metrics and visualize performance.

In [None]:
# Reshape input: append a trailing axis, (samples, features) -> (samples, features, 1).
# Guarded on ndim so that re-running this cell does not keep appending axes.
if X_test.ndim == 2:
    X_test = np.expand_dims(X_test, axis=2)

In [None]:
# Load saved model.
# Raw string: the Windows path contains backslash sequences (\M, \S, \m, \s) that are
# invalid escapes in a normal string literal and trigger warnings on recent Python versions.
# NOTE(review): the training section saves "speech_emotion_model.h5", but this loads
# "speech_emotion_model_vgg16.h5" — confirm which file is meant to be evaluated.
model = tf_models.load_model(r"I:\My Drive\SpeechEmotionDetection\models\speech_emotion_model_vgg16.h5")

In [None]:
# Run the model on the test inputs, then take the highest-scoring class per sample
class_scores = model.predict(X_test)
y_pred = np.argmax(class_scores, axis=1)

In [None]:
# Print the scikit-learn classification report comparing test labels with predictions
print(classification_report(y_test, y_pred))

### Confusion Matrix Visualization 🎯
A **confusion matrix** is a useful tool to evaluate the performance of a classification model. It shows:
- **Correct classifications** along the diagonal.
- **Misclassifications** off the diagonal.

We use **Seaborn’s heatmap** to visually analyze model errors and performance.


In [None]:
# Rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

# Draw the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()