In [1]:
import soundfile  #read to audio file
import numpy as np
import pickle #to save model after training
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier #multi-lyer perception model
from sklearn.metrics import accuracy_score  #to measure how good we are

In [2]:
ls

 Volume in drive C is Windows
 Volume Serial Number is C65D-9667

 Directory of c:\Users\hp\OneDrive\Desktop\voice emotion detetction

07-02-2024  13:40    <DIR>          .
08-02-2024  11:20    <DIR>          ..
07-02-2024  21:20            22,364 Speech_Emotion_Recognition_with_librosa.ipynb
07-02-2024  13:36    <DIR>          speech-emotion-recognition-ravdess-data
07-02-2024  21:20             8,196 voice_tone_emotion.ipynb
               2 File(s)         30,560 bytes
               3 Dir(s)  128,806,678,528 bytes free


In [3]:
import os
import glob
import librosa

# Define the root directory where your data is stored
root_dir = "C:\\Users\\hp\\OneDrive\\Desktop\\voice emotion detetction"


# Change the current working directory to the root directory
os.chdir(root_dir)

# Define a list to store audio data and labels
audio_data = []
labels = []

# Loop through each actor folder
for actor_folder in glob.glob("speech-emotion-recognition-ravdess-data/Actor_*"):
    # Loop through each audio file in the actor folder
    for audio_file in glob.glob(os.path.join(actor_folder, "*.wav")):
        # Load the audio file using librosa
        y, sr = librosa.load(audio_file, sr=None)
        
        # Append the audio data and label to the lists
        audio_data.append(y)
        labels.append(actor_folder.split("_")[-1])

# Print the number of audio files loaded
print("Number of audio files loaded:", len(audio_data))


Number of audio files loaded: 1440


In [39]:
ls

 Volume in drive C is Windows
 Volume Serial Number is C65D-9667

 Directory of C:\Users\hp\OneDrive\Desktop\voice emotion detetction

07-02-2024  13:40    <DIR>          .
07-02-2024  13:36    <DIR>          ..
07-02-2024  13:59            22,364 Speech_Emotion_Recognition_with_librosa.ipynb
07-02-2024  13:36    <DIR>          speech-emotion-recognition-ravdess-data
07-02-2024  19:36             5,588 voice_tone_emotion.ipynb
               2 File(s)         27,952 bytes
               3 Dir(s)  131,290,144,768 bytes free


In [4]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(y=X, sr=sample_rate).T, axis=0)
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    return np.hstack((chroma, mfccs, mel))


In [6]:
# emotion in dataset

emotions={
    '01':'neutral',
    '02':'calm',
    '03':'happy',
    '04':'sad',
    '05':'angry',
    '06':'fearful',
    '07':'disgust',
    '08':'surprised'
}

# emotions for observe
observed_emotions=['calm', 'happy', 'disgust','fearful']

load the data and extract the features from each file

In [8]:
#Load the data and extract features for each sound file
def load_data(test_size=0.2, random_state=None):
     x, y = [], []
     for file in glob.glob("speech-emotion-recognition-ravdess-data/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
     return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [9]:
# Load the data
X_train, X_test, y_train, y_test = load_data(test_size=0.25, random_state=42)


In [10]:
X_train

array([[4.03765470e-01, 4.20298666e-01, 4.16564405e-01, ...,
        1.65243138e-04, 1.04321596e-04, 6.55571494e-05],
       [4.51234460e-01, 3.72855783e-01, 3.72669905e-01, ...,
        3.89261913e-05, 3.05255344e-05, 2.94166621e-05],
       [4.54155535e-01, 4.76314396e-01, 4.50824767e-01, ...,
        4.75216839e-05, 3.46632551e-05, 1.62844444e-05],
       ...,
       [2.86590755e-01, 3.05683523e-01, 3.67723614e-01, ...,
        1.51764631e-04, 1.16828531e-04, 8.47479314e-05],
       [4.29127097e-01, 5.19392073e-01, 5.99547565e-01, ...,
        1.61086471e-04, 1.04962477e-04, 6.52811796e-05],
       [3.90557587e-01, 4.08611894e-01, 4.51169908e-01, ...,
        6.08151488e-04, 5.55269769e-04, 4.47782222e-04]], dtype=float32)

In [11]:
#Get the shape of the training and testing datasets
print((X_train.shape[0], X_test.shape[0]))



(576, 192)


In [12]:
# get the number feature extracted
print(f'feature_extracted:{X_train.shape[1]}')

feature_extracted:180


In [13]:
#Initialize the Multi Layer Perceptron Classifier
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,),learning_rate='adaptive', max_iter=500 )

In [14]:
# train the model
model.fit(X_train, y_train)

In [15]:
# predict for the test set
y_pred=model.predict(X_test)

In [16]:
y_pred

array(['happy', 'calm', 'calm', 'happy', 'fearful', 'calm', 'calm',
       'happy', 'calm', 'fearful', 'happy', 'fearful', 'fearful', 'happy',
       'disgust', 'happy', 'calm', 'fearful', 'disgust', 'calm', 'calm',
       'disgust', 'disgust', 'calm', 'fearful', 'happy', 'calm', 'happy',
       'calm', 'fearful', 'fearful', 'disgust', 'happy', 'fearful',
       'happy', 'calm', 'calm', 'fearful', 'calm', 'disgust', 'happy',
       'calm', 'fearful', 'calm', 'fearful', 'calm', 'calm', 'calm',
       'calm', 'happy', 'fearful', 'fearful', 'fearful', 'happy', 'happy',
       'fearful', 'calm', 'happy', 'calm', 'calm', 'disgust', 'calm',
       'fearful', 'calm', 'disgust', 'calm', 'calm', 'calm', 'fearful',
       'fearful', 'fearful', 'fearful', 'fearful', 'fearful', 'fearful',
       'disgust', 'fearful', 'happy', 'calm', 'fearful', 'calm', 'calm',
       'fearful', 'calm', 'disgust', 'calm', 'fearful', 'fearful',
       'fearful', 'fearful', 'disgust', 'fearful', 'calm', 'happy',
    

In [17]:
# check the accuracy of our model
accuracy= accuracy_score(y_true=y_test, y_pred=y_pred)

# print the accuracy
print("accuracy:{:.2f}%".format(accuracy*100))

accuracy:61.98%


In [18]:
from sklearn.metrics import accuracy_score, f1_score

In [19]:
f1_score(y_test, y_pred, average=None)

array([0.76335878, 0.48484848, 0.57407407, 0.55696203])

In [21]:
# now import the actual and prediction value
import pandas as pd
df= pd.DataFrame({'Actual':y_test, 'prediction': y_pred})
df.head(10)

Unnamed: 0,Actual,prediction
0,happy,happy
1,calm,calm
2,happy,calm
3,happy,happy
4,disgust,fearful
5,calm,calm
6,happy,calm
7,happy,happy
8,disgust,calm
9,happy,fearful


In [22]:
import pickle
# Writing different model files to file
with open( 'modelForPrediction1.sav', 'wb') as f:
    pickle.dump(model,f)

In [23]:
filename = 'modelForPrediction1.sav'
loaded_model = pickle.load(open(filename, 'rb')) # loading the model file from the storage

feature=extract_feature("speech-emotion-recognition-ravdess-data/Actor_01/03-01-01-01-01-01-01.wav", mfcc=True, chroma=True, mel=True)

feature=feature.reshape(1,-1)

prediction=loaded_model.predict(feature)
prediction

array(['fearful'], dtype='<U7')

In [24]:
feature

array([[ 5.19868493e-01,  5.20555913e-01,  4.31395352e-01,
         4.22415048e-01,  4.80894387e-01,  5.05568624e-01,
         5.55076957e-01,  5.76952040e-01,  6.11260414e-01,
         5.78311801e-01,  5.81391752e-01,  5.21786213e-01,
        -6.93497009e+02,  5.00643921e+01,  5.71450770e-01,
         1.43299646e+01,  3.33637023e+00, -2.54071975e+00,
        -4.05790901e+00, -1.07119989e+01, -7.29413891e+00,
         1.74018908e+00, -4.19064283e+00,  1.95466173e+00,
        -5.24789381e+00,  2.78142977e+00, -3.16756773e+00,
        -3.40008307e+00, -2.37803221e+00, -5.68717480e-01,
        -6.47753334e+00, -1.24320579e+00, -2.80542541e+00,
        -5.43635845e+00, -4.46875364e-01, -3.63516593e+00,
        -2.98372650e+00, -5.63902617e-01, -1.65101945e+00,
        -5.55944800e-01, -3.41018462e+00, -2.24465489e+00,
        -3.13058877e+00, -2.70090008e+00, -1.88821304e+00,
        -5.54154634e-01, -3.96459866e+00, -2.13485193e+00,
        -3.94577074e+00, -1.62457836e+00, -2.03990746e+0