<a href="https://colab.research.google.com/github/abdoghareeb46/Emotion_Audio_recognition/blob/master/Emotion_Audio_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files
files.upload()

In [0]:
!pip install -q kaggle

In [0]:
!mkdir -p ~/.kaggle

In [0]:
!cp kaggle.json ~/.kaggle/

In [0]:
! chmod 600 ~/.kaggle/kaggle.json

In [0]:
!kaggle datasets download -d suso172/arabic-natural-audio-dataset

Downloading arabic-natural-audio-dataset.zip to /content
 98% 548M/560M [00:13<00:00, 23.8MB/s]
100% 560M/560M [00:13<00:00, 43.6MB/s]


In [0]:
!ls

arabic-natural-audio-dataset.zip  data	drive  kaggle.json  sample_data


In [0]:
! mkdir data

In [0]:
! unzip arabic-natural-audio-dataset.zip -d data

In [0]:
!pip install SoundFile
!pip install librosa

In [0]:
import numpy as np
import pandas as pd
import librosa # to extract speech features
import glob
import os
import pickle # to save model after training
from sklearn.model_selection import train_test_split # for splitting training and testing
from sklearn.neural_network import MLPClassifier # multi-layer perceptron model
from sklearn.metrics import accuracy_score # to measure how good we are

In [0]:
def extract_feature(file_name, **kwargs):
    """
    Extract feature from audio file `file_name`
        Features supported:
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    X,sample_rate=librosa.load(file_name)
    if chroma or contrast:
        stft = np.abs(librosa.stft(X))
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result = np.hstack((result, mel))
    if contrast:
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
        result = np.hstack((result, contrast))
    if tonnetz:
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
        result = np.hstack((result, tonnetz))
    return result

In [0]:
!ls /content/data/

1sec_segmented_part1  ANAD.csv		     V1.wav  V4.wav  V7.wav
1sec_segmented_part2  ANAD_Normalized.csv    V2.wav  V5.wav  V8.wav
1sec_segmented_part3  caller_Reciever_turns  V3.wav  V6.wav


In [0]:
data=pd.read_csv("/content/data/ANAD.csv")
data=data.iloc[:,[0,1]].copy()

In [0]:
data["name"]=data["name"].apply(lambda x:x.replace("'",""))
data.head()

Unnamed: 0,name,Emotion
0,V2_1 (1).wav,surprised
1,V2_1 (2).wav,surprised
2,V2_1 (3).wav,surprised
3,V2_1 (4).wav,surprised
4,V2_1 (5).wav,surprised


In [0]:
data.columns=["name","emo"]

In [0]:
!ls /content/data

1sec_segmented_part1  ANAD.csv		     V1.wav  V4.wav  V7.wav
1sec_segmented_part2  ANAD_Normalized.csv    V2.wav  V5.wav  V8.wav
1sec_segmented_part3  caller_Reciever_turns  V3.wav  V6.wav


In [0]:
files = glob.glob("/content/data/1sec_segmented_part1/1sec_segmented_part1/*.wav")+glob.glob("/content/data/1sec_segmented_part2/1sec_segmented_part2/*.wav")+ glob.glob("/content/data/1sec_segmented_part3/1sec_segmented_part3/*.wav")

In [0]:
len(files)

1420

In [0]:
names=[]
for f in files:
  names.append(f.split("/")[-1])

In [0]:
emo_=[]
for i in names:
  try:
    emo_.append(data[data["name"]==i]["emo"].values[0])
  except:
    emo_.append(np.nan)

In [0]:
full_data=pd.DataFrame([files,emo_]).T
full_data.columns=["filePath","labels"]
full_data.head()

Unnamed: 0,filePath,labels
0,/content/data/1sec_segmented_part1/1sec_segmen...,surprised
1,/content/data/1sec_segmented_part1/1sec_segmen...,happy
2,/content/data/1sec_segmented_part1/1sec_segmen...,happy
3,/content/data/1sec_segmented_part1/1sec_segmen...,happy
4,/content/data/1sec_segmented_part1/1sec_segmen...,happy


In [0]:
full_data.isna().sum()


filePath     0
labels      86
dtype: int64

In [0]:
full_data.dropna(inplace=True)

In [0]:
X=[]
for e in full_data["filePath"].values:
  features = extract_feature(e, mfcc=True, chroma=True, mel=True)
  X.append(features)
  

In [0]:
X=np.array(X)
X.shape

(1334, 180)

In [0]:
from sklearn.preprocessing import LabelEncoder

In [0]:
lb=LabelEncoder()
labels=lb.fit_transform(full_data["labels"])

In [0]:
X_train, X_test, y_train, y_test=train_test_split(X, labels, test_size=30, random_state=7)

In [0]:
model_params = {
    'alpha': 0.001,
    'batch_size': 256,
    'epsilon': 1e-08, 
    'hidden_layer_sizes': (512, 256, 128), 
    'learning_rate': 'adaptive', 
    'max_iter': 500, 
}

model = MLPClassifier(**model_params)
model.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.001, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(512, 256, 128), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [0]:
y_pred = model.predict(X_test)

In [0]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 96.67%
