# Import(s)

In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from playsound import playsound 
import pyaudio
import wave 
import pickle ##for storing the model once created in pkl extension

# Data Cleansing

In [2]:
##for data cleansing, we lower the sampling rate and, of course, try to cut down the noise in our audio file
def clean(y,rate,threshold):
    """Build a keep/drop mask that marks the non-silent samples of a signal.

    The absolute value of the signal is smoothed with a centred rolling mean
    whose window is one tenth of ``rate`` samples; a sample is kept when that
    smoothed magnitude is strictly greater than ``threshold``.

    Parameters
    ----------
    y : sequence of numbers
        The audio samples.
    rate : int or float
        Sampling rate; the rolling window is ``int(rate / 10)`` samples.
    threshold : float
        Smoothed magnitude a sample must exceed to be kept.

    Returns
    -------
    list of bool
        One entry per sample: True to keep it, False to treat it as noise.
    """
    # int(rate / 10) is 0 for rate < 10, and pandas rejects a window smaller
    # than min_periods, so never let the window drop below one sample.
    window = max(1, int(rate / 10))
    envelope = (
        pd.Series(y)
        .abs()
        .rolling(window=window, min_periods=1, center=True)
        .mean()
    )
    # Vectorised comparison; NaN compares False, exactly like the `>` test in a loop.
    return (envelope > threshold).tolist()

# Store the Cleansed Audio in Clean Directory

In [3]:
##storing clean data in new directory using wavfile from scipy
from scipy.io import wavfile
# Write a noise-trimmed copy of every recording under E:\ravdess_data into
# E:\clean_data, prefixing each file name with "clean_data".
clean_dir = "E:\\clean_data"
# wavfile.write does not create missing folders, so make sure the target exists.
os.makedirs(clean_dir, exist_ok=True)
for file in glob.glob("E:\\ravdess_data\\Actor_*\\*.wav"):
    file_name = os.path.basename(file)
    # Resample to 16 kHz while loading.
    signal , rate = librosa.load(file, sr=16000)
    # Explicit boolean array so the indexing below is always a boolean mask.
    mask = np.asarray(clean(signal, rate, 0.0005), dtype=bool)
    wavfile.write(filename=os.path.join(clean_dir, "clean_data" + file_name), rate=rate, data=signal[mask])

# Feature Extraction

In [4]:
#works.
#extracting all our features of mfcc and mel.
#extract feature transpose if feature present from librosa, and extract the mean value of it.

def extract_feature(file_name, mfcc, chroma, mel):
    """Compute a flat feature vector for one audio file.

    Each enabled feature is computed with librosa, averaged over its time
    frames, and the per-feature means are concatenated in the order
    MFCC, chroma, mel.

    Parameters
    ----------
    file_name : str
        Path of the audio file, opened with ``soundfile.SoundFile``.
    mfcc : bool
        Include the mean of 40 MFCC coefficients.
    chroma : bool
        Include the mean chromagram computed from the STFT magnitude.
    mel : bool
        Include the mean mel spectrogram.

    Returns
    -------
    numpy.ndarray
        1-D array with the selected features; empty when every flag is false.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")  # samples as float32
        sample_rate = sound_file.samplerate   # keep the file's own sample rate

    # soundfile returns a (frames, channels) array for multi-channel files,
    # while the calls below are used on a 1-D signal, so average the channels.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # Start from an empty float64 array so the result dtype matches np.hstack
    # onto np.array([]) regardless of which features are enabled.
    parts = [np.array([])]
    if mfcc:
        # n_mfcc is the number of coefficients returned per frame.
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfccs.T, axis=0))
    if chroma:
        # chroma_stft takes the magnitude of the short-time Fourier transform.
        stft = np.abs(librosa.stft(y=X))
        chroma_frames = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        # `y` must be passed by keyword: librosa >= 0.10 rejects it positionally.
        mel_frames = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        parts.append(np.mean(mel_frames.T, axis=0))
    return np.hstack(parts)

# Label Classification & data-loading

In [5]:
# Two-digit emotion code -> emotion name. load_data looks the code up from the
# third dash-separated field of each file name.
emotions = dict([
    ('01', 'neutral'),
    ('02', 'calm'),
    ('03', 'happy'),
    ('04', 'sad'),
    ('05', 'angry'),
    ('06', 'fearful'),
    ('07', 'disgust'),
    ('08', 'surprised'),
])

# Only files labelled with one of these emotions are loaded by load_data.
observed_emotions = [
    'calm',
    'happy',
    'neutral',
    'sad',
]

In [6]:
def load_data(test_size=0.33):
    """Load features and labels for the observed emotions and split them.

    Every ``E:\\clean_data\\clean_data*.wav`` file is mapped to an emotion via
    the third dash-separated field of its name; files whose emotion is not in
    ``observed_emotions`` are skipped.

    Parameters
    ----------
    test_size : float, default 0.33
        Passed to ``train_test_split`` as the test share.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``; each ``y`` entry is an ``[emotion, file_name]``
        pair so predictions can be traced back to their file.

    Raises
    ------
    ValueError
        If no matching file with an observed emotion is found.
    KeyError
        If a file name carries an emotion code missing from ``emotions``.
    """
    x,y=[],[]
    for file in glob.glob("E:\\clean_data\\clean_data*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append([emotion,file_name])
    if not x:
        # Fail with an actionable message instead of train_test_split's generic
        # complaint about an empty sample set.
        raise ValueError("No audio files with an observed emotion were found in E:\\clean_data")
    # Fixed random_state keeps the split identical between runs.
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

# Mapping Test Data to Their Corresponding File-Name Labels

In [7]:
# Split into train/test partitions, holding out 25% for testing.
# Every label row is an [emotion, file_name] pair, so transposing the label
# arrays gives one row of emotions and one row of file names.
x_train, x_test, y_trai, y_tes = load_data(test_size=0.25)
print(*(np.shape(part) for part in (x_train, x_test, y_trai, y_tes)))

y_train_map = np.array(y_trai).T
y_train, train_filename = y_train_map[0], y_train_map[1]

y_test_map = np.array(y_tes).T
y_test, test_filename = y_test_map[0], y_test_map[1]

print(np.shape(y_train), np.shape(y_test))
# One test file name per line.
print("\n".join(test_filename))


(504, 180) (168, 180) (504, 2) (168, 2)
(504,) (168,)
clean_data03-01-02-01-01-02-22.wav
clean_data03-01-04-01-02-02-23.wav
clean_data03-01-04-02-01-02-05.wav
clean_data03-01-03-01-01-02-11.wav
clean_data03-01-03-02-02-02-16.wav
clean_data03-01-02-02-02-02-21.wav
clean_data03-01-01-01-02-02-20.wav
clean_data03-01-03-02-01-02-03.wav
clean_data03-01-02-02-02-01-11.wav
clean_data03-01-01-01-02-01-15.wav
clean_data03-01-01-01-01-02-21.wav
clean_data03-01-02-01-01-01-21.wav
clean_data03-01-03-02-02-01-16.wav
clean_data03-01-02-01-02-01-03.wav
clean_data03-01-03-01-02-01-14.wav
clean_data03-01-03-02-01-01-06.wav
clean_data03-01-02-01-02-02-06.wav
clean_data03-01-03-02-02-02-23.wav
clean_data03-01-02-02-01-01-01.wav
clean_data03-01-04-01-01-01-03.wav
clean_data03-01-04-01-02-02-13.wav
clean_data03-01-01-01-01-02-17.wav
clean_data03-01-02-02-02-02-08.wav
clean_data03-01-02-01-01-01-11.wav
clean_data03-01-01-01-01-02-16.wav
clean_data03-01-02-02-01-01-04.wav
clean_data03-01-04-02-02-02-24.wav
c

In [8]:
#Show the first feature vector of the training set and of the test set (values, not shapes)
print((x_train[0], x_test[0]))


(array([-5.35849243e+02,  7.55318680e+01, -1.01592093e+01,  5.69860876e-01,
       -2.02535820e+01, -1.49986048e+01, -2.56977406e+01, -2.15807724e+01,
       -1.25628242e+01, -1.23922682e+01, -8.72327709e+00, -1.44741106e+01,
       -5.93426323e+00, -8.21510887e+00, -1.33028297e+01, -5.46756887e+00,
       -1.25385141e+01, -6.15823936e+00, -7.92096329e+00, -2.08902076e-01,
        6.21816456e-01,  5.57069349e+00,  4.84790993e+00,  1.57187319e+00,
       -2.48557329e+00,  3.68227363e+00,  6.15425158e+00,  1.20167704e+01,
        1.34859543e+01,  1.55437689e+01,  1.09278345e+01,  8.44227314e+00,
        3.42682195e+00, -3.40451550e+00, -7.54415512e+00, -3.84342581e-01,
       -1.79084861e+00, -3.03773284e+00, -6.86734962e+00, -4.51440334e+00,
        3.53163868e-01,  3.89058441e-01,  4.37834412e-01,  5.12297690e-01,
        3.55163455e-01,  3.40410918e-01,  2.87093759e-01,  3.36011887e-01,
        5.82483828e-01,  6.12853944e-01,  5.82298100e-01,  4.13455278e-01,
        3.94078825e-06, 

In [9]:
# Number of columns in the training matrix, i.e. features per sample.
print(f'Features extracted: {x_train.shape[1]}')


Features extracted: 180


# Applying Algorithm MLP Classifier

In [10]:
# Multi-layer perceptron classifier with one hidden layer of 300 units.
# random_state is not passed, so the fitted weights can differ between runs.
model = MLPClassifier(
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    hidden_layer_sizes=(300,),
    learning_rate='adaptive',
    max_iter=500,
)

In [11]:
# Train the classifier on the training features and their emotion labels.
model.fit(x_train,y_train)




MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

# Saving Trained Model

In [12]:
# Name of the file the trained model is pickled to (relative to the working directory).
Pkl_File = "Emotion_Voice_Detection_Model.pkl"

In [13]:
# Serialise the trained model to Pkl_File; 'wb' opens the file for binary writing.
with open(Pkl_File,'wb') as file:
    pickle.dump(model,file)
# The path is relative, so the file is written to the current working directory.


In [14]:
#Loading the same model back from file
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file this notebook wrote itself, never one from an untrusted source.
with open(Pkl_File,'rb') as file:
    model = pickle.load(file)
##display the loaded model
model

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
# Predict an emotion label for every sample in the test set.
y_pred=model.predict(x_test)

In [16]:
print(y_pred)
# x_test is already a 2-D (samples, features) array, which is the shape
# predict() takes, so no extra axis is needed. The previous np.expand_dims
# call discarded its result and therefore had no effect.
single_val = np.array(x_test)
model.predict(single_val)

['calm' 'sad' 'sad' 'happy' 'happy' 'calm' 'sad' 'happy' 'calm' 'neutral'
 'neutral' 'calm' 'happy' 'neutral' 'happy' 'happy' 'neutral' 'happy'
 'calm' 'sad' 'sad' 'neutral' 'calm' 'calm' 'neutral' 'calm' 'sad' 'sad'
 'calm' 'happy' 'calm' 'neutral' 'neutral' 'happy' 'sad' 'happy' 'sad'
 'sad' 'neutral' 'happy' 'neutral' 'neutral' 'happy' 'neutral' 'calm'
 'calm' 'sad' 'happy' 'sad' 'calm' 'sad' 'calm' 'calm' 'calm' 'happy'
 'neutral' 'neutral' 'happy' 'happy' 'neutral' 'happy' 'neutral' 'neutral'
 'sad' 'sad' 'neutral' 'calm' 'calm' 'sad' 'happy' 'sad' 'sad' 'neutral'
 'happy' 'happy' 'neutral' 'calm' 'happy' 'calm' 'happy' 'happy' 'neutral'
 'happy' 'neutral' 'neutral' 'happy' 'happy' 'calm' 'happy' 'sad' 'happy'
 'calm' 'happy' 'sad' 'happy' 'calm' 'happy' 'calm' 'calm' 'sad' 'sad'
 'happy' 'sad' 'happy' 'neutral' 'sad' 'neutral' 'calm' 'neutral' 'sad'
 'calm' 'calm' 'neutral' 'happy' 'calm' 'neutral' 'neutral' 'happy' 'calm'
 'calm' 'sad' 'neutral' 'sad' 'sad' 'neutral' 'happy' 'ne

array(['calm', 'sad', 'sad', 'happy', 'happy', 'calm', 'sad', 'happy',
       'calm', 'neutral', 'neutral', 'calm', 'happy', 'neutral', 'happy',
       'happy', 'neutral', 'happy', 'calm', 'sad', 'sad', 'neutral',
       'calm', 'calm', 'neutral', 'calm', 'sad', 'sad', 'calm', 'happy',
       'calm', 'neutral', 'neutral', 'happy', 'sad', 'happy', 'sad',
       'sad', 'neutral', 'happy', 'neutral', 'neutral', 'happy',
       'neutral', 'calm', 'calm', 'sad', 'happy', 'sad', 'calm', 'sad',
       'calm', 'calm', 'calm', 'happy', 'neutral', 'neutral', 'happy',
       'happy', 'neutral', 'happy', 'neutral', 'neutral', 'sad', 'sad',
       'neutral', 'calm', 'calm', 'sad', 'happy', 'sad', 'sad', 'neutral',
       'happy', 'happy', 'neutral', 'calm', 'happy', 'calm', 'happy',
       'happy', 'neutral', 'happy', 'neutral', 'neutral', 'happy',
       'happy', 'calm', 'happy', 'sad', 'happy', 'calm', 'happy', 'sad',
       'happy', 'calm', 'happy', 'calm', 'calm', 'sad', 'sad', 'happy',
       

In [17]:

# Share of test samples whose predicted emotion equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(100 * accuracy))

Accuracy: 70.24%


# Summary Of Predicted Data

In [18]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

In [19]:
results = confusion_matrix(y_test,y_pred)
print("confusion matrix")
# Quick reference for the per-class metrics reported below:
#   true positive  (TP): sample of the class, predicted as the class
#   true negative  (TN): sample of another class, not predicted as the class
#   false positive (FP): sample of another class, predicted as the class (error)
#   false negative (FN): sample of the class, predicted as another class (error)
#   recall    = TP / (TP + FN)
#   precision = TP / (TP + FP)
#   f1-score  = 2 * recall * precision / (recall + precision)
print(results)
print("Report")
report = classification_report(y_test,y_pred)
print(report)

confusion matrix
[[34  1  7  2]
 [ 1 34  1  4]
 [ 5  0 24  4]
 [ 6  8 11 26]]
Report
              precision    recall  f1-score   support

        calm       0.74      0.77      0.76        44
       happy       0.79      0.85      0.82        40
     neutral       0.56      0.73      0.63        33
         sad       0.72      0.51      0.60        51

   micro avg       0.70      0.70      0.70       168
   macro avg       0.70      0.71      0.70       168
weighted avg       0.71      0.70      0.70       168



# Storing Predicted File as CSV

In [27]:
# Pair every prediction with the test file it was made for; both arrays are
# in test-set order, so row i describes the i-th test file.
dataF = pd.DataFrame({'predictions': y_pred, 'file_names': test_filename})
print(dataF)
# Written next to the notebook; the row index is kept as the first CSV column.
dataF.to_csv('output_prediction.csv')

    predictions                          file_names
0          calm  clean_data03-01-02-01-01-02-22.wav
1           sad  clean_data03-01-04-01-02-02-23.wav
2           sad  clean_data03-01-04-02-01-02-05.wav
3         happy  clean_data03-01-03-01-01-02-11.wav
4         happy  clean_data03-01-03-02-02-02-16.wav
5          calm  clean_data03-01-02-02-02-02-21.wav
6           sad  clean_data03-01-01-01-02-02-20.wav
7         happy  clean_data03-01-03-02-01-02-03.wav
8          calm  clean_data03-01-02-02-02-01-11.wav
9       neutral  clean_data03-01-01-01-02-01-15.wav
10      neutral  clean_data03-01-01-01-01-02-21.wav
11         calm  clean_data03-01-02-01-01-01-21.wav
12        happy  clean_data03-01-03-02-02-01-16.wav
13      neutral  clean_data03-01-02-01-02-01-03.wav
14        happy  clean_data03-01-03-01-02-01-14.wav
15        happy  clean_data03-01-03-02-01-01-06.wav
16      neutral  clean_data03-01-02-01-02-02-06.wav
17        happy  clean_data03-01-03-02-02-02-23.wav
18         c

# Specify PyAudio and Sampling bit-rates

In [None]:
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2
fs = 44100  # Record at 44100 samples per second
seconds = 4
filename = "getC12345.wav"

p = pyaudio.PyAudio()  # Create an interface to PortAudio
print('Recording')

stream = None
frames = []  # Raw byte chunks read from the input stream
try:
    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)

    # Read enough chunks to cover `seconds` seconds of audio.
    frames = [stream.read(chunk) for _ in range(int(fs / chunk * seconds))]
    # Look up the sample width while the PortAudio interface is still open.
    sample_width = p.get_sample_size(sample_format)
finally:
    # Release the device even if opening or reading the stream failed.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    # Terminate the PortAudio interface
    p.terminate()

print('Finished recording')
# The context manager closes the file even if writing raises.
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))

# Test Recorded Data

In [None]:
# Play the recorded file back to check it.
playsound(filename)
file_name = filename # second name for the same path string; nothing is copied on disk

# Display WavePlot For Sample Recorded Data


In [None]:
import matplotlib.pyplot as plt 
from librosa import display
# Load the recording (librosa resamples to its default rate) and plot its waveform.
data,sampling_rate = librosa.load(file_name)
plt.figure(figsize=(15,5))
# librosa 0.10 removed `waveplot`; `waveshow` is its replacement. Fall back to
# `waveplot` so older librosa installations keep working.
plot_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
plot_wave(data, sr=sampling_rate)

In [None]:
newA="test"
from pydub import AudioSegment

# Audio is either mono or stereo. The recording above was captured with
# channels = 2, and extract_feature expects a mono signal, so downmix to a
# single channel and export the result before extracting features.
sound = AudioSegment.from_wav(file_name).set_channels(1)
sound.export(newA, format="wav")

new_feature = extract_feature(newA, mfcc=True, chroma=True, mel=True)
# predict() takes a 2-D array, so wrap the single feature vector in a batch of one.
ans = np.array([new_feature])

model.predict(ans)