In [1]:
!pip install librosa soundfile numpy pandas scikit-learn tensorflow matplotlib



In [5]:
!unzip archive.zip -d content

Archive:  archive.zip
  inflating: content/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: content/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: content/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: content/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: content/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: content/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: content/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: content/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: content/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: content/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: content/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: content/Actor_01/03-01-02-02-02-02-01.wav  
  inflating: content/Actor_01/03-01-03-01-01-01-01.wav  
  inflating: content/Actor_01/03-01-03-01-01-02-01.wav  
  inflating: content/Actor_01/03-01-03-01-02-01-01.wav  
  inflating: content/Actor_01/03-01-03-01-02-02-01.wav  
  inflating: content/Actor_01/03-01-03-02-01-01-01.wav  
  inflati

In [6]:
dataset_path= "/content/content/audio_speech_actors_01-24"

In [7]:
import os
for root, dirs, files in os.walk(dataset_path):
  for file in files:
    print(file)

03-01-04-01-01-01-09.wav
03-01-08-02-01-02-09.wav
03-01-04-01-01-02-09.wav
03-01-03-01-01-01-09.wav
03-01-06-02-02-01-09.wav
03-01-07-02-01-01-09.wav
03-01-05-02-02-01-09.wav
03-01-05-02-02-02-09.wav
03-01-08-01-01-02-09.wav
03-01-06-01-01-01-09.wav
03-01-05-01-01-01-09.wav
03-01-03-01-01-02-09.wav
03-01-07-02-01-02-09.wav
03-01-01-01-01-01-09.wav
03-01-08-01-01-01-09.wav
03-01-04-02-02-02-09.wav
03-01-01-01-02-01-09.wav
03-01-02-02-02-01-09.wav
03-01-06-01-02-01-09.wav
03-01-05-01-01-02-09.wav
03-01-04-02-02-01-09.wav
03-01-07-01-02-02-09.wav
03-01-02-01-01-01-09.wav
03-01-07-01-01-02-09.wav
03-01-06-02-01-02-09.wav
03-01-06-01-02-02-09.wav
03-01-02-02-01-01-09.wav
03-01-06-02-01-01-09.wav
03-01-07-02-02-02-09.wav
03-01-08-02-02-02-09.wav
03-01-04-02-01-02-09.wav
03-01-05-01-02-02-09.wav
03-01-08-01-02-01-09.wav
03-01-02-02-01-02-09.wav
03-01-01-01-02-02-09.wav
03-01-07-02-02-01-09.wav
03-01-02-01-02-01-09.wav
03-01-07-01-01-01-09.wav
03-01-06-01-01-02-09.wav
03-01-03-02-01-02-09.wav


# **Extracting Features from .wav files: **

In [37]:
import librosa
import numpy as np
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
def extract_features(dataset_path):
  y, sr = librosa.load(dataset_path)
  mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
  return mfccs.T
features =[]
labels=[]

Here we are trying to access the actor folders , thus access the files (voice recordings) the audio data is labeled ( file names contain numbers that refer to the diffferent type of emotions that exist in the audio)

In [38]:
for actor_folder in os.listdir(dataset_path):
  actor_path = os.path.join(dataset_path, actor_folder)
  for file in os.listdir(actor_path):
    if file.endswith(".wav"):
      file_path = os.path.join(actor_path, file)
      emotion_code= file.split("-")[2]
      label = emotion_map[emotion_code]
      if label:
        mfcc =extract_features(file_path)
        features.append(mfcc)
        labels.append(label)

# **Encode Labels + Split Data ( for training and testing)**

In [39]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [42]:
#Encoding Labels
encoder = LabelEncoder()
y = encoder.fit_transform(labels)
y = to_categorical(y)

# Find the maximum length of the features
max_length = max([mfcc.shape[0] for mfcc in features])

# Pad the features to the maximum length
padded_features = []
for mfcc in features:
    # Calculate padding required
    padding_length = max_length - mfcc.shape[0]
    # Pad the array with zeros
    padded_mfcc = np.pad(mfcc, ((0, padding_length), (0, 0)), mode='constant')
    padded_features.append(padded_mfcc)

# Convert the padded features to a NumPy array
x = np.array(padded_features)

In [43]:
#splitting data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Building and training the Model:**

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten

As we are using the ANN, we will be setting the number of hidden layers which are 3 for these layer  we will use ReLU as an activation function (of their neurons)  and for the last layer (output)  we will use the softmax activation function.

In [47]:
num_classes = 8
max_length= max(mfcc.shape[0] for mfcc in features)
time_steps= max_length
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(time_steps, 40 , 1)),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#Now the training!
model.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 419ms/step - accuracy: 0.1361 - loss: 48.2096 - val_accuracy: 0.1771 - val_loss: 2.0548
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 665ms/step - accuracy: 0.1946 - loss: 2.0443 - val_accuracy: 0.1736 - val_loss: 2.0238
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 430ms/step - accuracy: 0.2241 - loss: 1.9825 - val_accuracy: 0.1979 - val_loss: 1.9996
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 426ms/step - accuracy: 0.1907 - loss: 1.9848 - val_accuracy: 0.2083 - val_loss: 1.9968
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 442ms/step - accuracy: 0.2508 - loss: 1.9135 - val_accuracy: 0.2118 - val_loss: 2.0006
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 439ms/step - accuracy: 0.2441 - loss: 1.9487 - val_accuracy: 0.2292 - val_loss: 1.9835
Epoch 7/50
[1m36/36[0m [32m━━

<keras.src.callbacks.history.History at 0x78bef16c2e90>

# Model Evaluation:

In [55]:
loss , accuracy = model.evaluate(x_test, y_test)
print(f"Accuracy: {accuracy*100:.2f} %")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 84ms/step - accuracy: 0.4748 - loss: 2.8719
Accuracy: 45.83 %


# **Predict Label(emotions) for a new Audio**

In [56]:
audio_Path= "/content/content/audio_speech_actors_01-24/Actor_01/03-01-01-01-01-01-01.wav"

In [58]:
def predict_emotion(audio_path):
    mfcc = extract_features(audio_path)
    mfcc = mfcc.reshape(1, -1)
    label=encoder.inverse_transform([np.argmax(model)])
    return label[0]
print(predict_emotion(audio_Path))

angry
