In [41]:

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import pickle
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [42]:
#if os.path.exists("mfcc_data.pkl"):
 #   os.remove("mfcc_data.pkl")
  #  print("Removed corrupted mfcc_data.pkl")

In [43]:
audio_folder = r"A:\video_project\data\raw\audio"


preprocessed_file = "mfcc_data.pkl"


load_from_file = True


label_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

if load_from_file and os.path.exists(preprocessed_file):
    with open(preprocessed_file, "rb") as f:
        X, y, le = pickle.load(f)
    print(f"Loaded preprocessed data from '{preprocessed_file}'")
else:
    print("Preprocessing audio files...")

    mfcc_features = []
    labels = []

    for file_name in os.listdir(audio_folder):
        if file_name.endswith(".wav"):
            file_path = os.path.join(audio_folder, file_name)
            y_audio, sr = librosa.load(file_path, sr=16000)
            mfcc = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=40)
            if mfcc.shape[1] < 150:
                pad_width = 150 - mfcc.shape[1]
                mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
            else:
                mfcc = mfcc[:, :150]
            mfcc_features.append(mfcc.T)
            emotion_code = file_name.split("-")[2]
            labels.append(label_map.get(emotion_code, 'unknown'))

    X = np.array(mfcc_features)
    y = np.array(labels)


    le = LabelEncoder()
    y = le.fit_transform(y)

    with open(preprocessed_file, "wb") as f:
        pickle.dump((X, y, le), f)

    print(f"Processed {len(X)} audio files and saved to '{preprocessed_file}'")

print("Feature shape:", X.shape)


Loaded preprocessed data from 'mfcc_data.pkl'
Feature shape: (2452, 150, 40)


In [44]:

le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Classes:", le.classes_)


Classes: [0 1 2 3 4 5 6 7]


In [45]:

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


## 5. PyTorch Dataset & DataLoader

In [47]:

class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [48]:

class AudioEmotionNet(nn.Module):
    def __init__(self, input_dim=40, hidden_dim=128, output_dim=8):
        super(AudioEmotionNet, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioEmotionNet(output_dim=len(le.classes_)).to(device)


In [49]:

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 1.9303
Epoch 2, Loss: 1.7386
Epoch 3, Loss: 1.6431
Epoch 4, Loss: 1.5353
Epoch 5, Loss: 1.4256
Epoch 6, Loss: 1.3295
Epoch 7, Loss: 1.2640
Epoch 8, Loss: 1.1826
Epoch 9, Loss: 1.1211
Epoch 10, Loss: 1.0592


In [50]:

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())
target_names = [str(c) for c in le.classes_]
print(classification_report(all_labels, all_preds, target_names=target_names))


              precision    recall  f1-score   support

           0       0.82      0.67      0.74        75
           1       0.62      0.92      0.74        76
           2       0.28      0.56      0.38        39
           3       0.49      0.57      0.53        67
           4       0.80      0.44      0.57        89
           5       0.78      0.25      0.38        28
           6       0.45      0.54      0.49        71
           7       0.35      0.15      0.21        46

    accuracy                           0.55       491
   macro avg       0.57      0.51      0.50       491
weighted avg       0.60      0.55      0.54       491



In [51]:
def preprocess_audio(file_path, max_len=150):
    y, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)


In [69]:
test_folder = r"A:\video_project\data\raw\test_audio"
results = []
label_map = {
    '1': 'neutral',
    '2': 'calm',
    '3': 'happy',
    '4': 'sad',
    '5': 'angry',
    '6': 'fearful',
    '7': 'disgust',
    '8': 'surprised'
}
model.eval()
for file in os.listdir(test_folder):
    if file.endswith(".wav"):
        file_path = os.path.join(test_folder, file)
        input_tensor = preprocess_audio(file_path).to(device)
        with torch.no_grad():
            output = model(input_tensor)
            pred_class = torch.argmax(output, dim=1).item()
            pred_emotion = str(le.classes_[pred_class])
        results.append((file, pred_emotion))

# Print results
for file_name, emotion in results:
    print(f"{file_name}: {label_map[str(emotion)]}")


03-01-02-01-01-02-01.wav: neutral
03-01-04-01-01-01-17.wav: fearful


In [52]:
import sounddevice as sd
from scipy.io.wavfile import write
import tempfile

def record_audio(duration=3, fs=16000):
    print("Recording...")
    
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    
    sd.wait()
    temp_path = tempfile.mktemp(suffix=".wav")
    write(temp_path, fs, audio)
    
    print("Recording saved. Predicting...")
    return temp_path


test_file = record_audio()
input_tensor = preprocess_audio(test_file).to(device)
label_map = {
    '1': 'neutral',
    '2': 'calm',
    '3': 'happy',
    '4': 'sad',
    '5': 'angry',
    '6': 'fearful',
    '7': 'disgust',
    '8': 'surprised'
}
model.eval()
with torch.no_grad():
    output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1).item()
    predicted_emotion = le.classes_[predicted_class]
print("Predicted Emotion:", label_map[str(predicted_emotion)])

Recording...
Recording saved. Predicting...
Predicted Emotion: fearful


In [53]:
import gradio as gr

def predict_emotion_from_upload(file_path):
    if not file_path:
        return "No file received. Please upload a .wav file."
    
    try:
        input_tensor = preprocess_audio(file_path).to(device)
        model.eval()
        with torch.no_grad():
            output = model(input_tensor)
            pred_class = torch.argmax(output, dim=1).item()
            return f"Predicted Emotion: {label_map[str(le.classes_[pred_class])]}"
    except Exception as e:
        return f"Error: {str(e)}"

gr.Interface(
    fn=predict_emotion_from_upload,
    inputs=gr.Audio(type="filepath", label="Upload or record your voice"),
    outputs="text",
    title="Real-Time Audio Emotion Detector",
    description="Upload or record a short audio to detect the emotion"
).launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


