In [None]:
!pip install librosa
!pip install torchsummary




In [None]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torchsummary import summary


In [None]:
DATA_PATH = "/content/drive/MyDrive/wav"

def extract_features(file_path, max_len=300):
    y, sr = librosa.load(file_path, sr=16000)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    if mel_db.shape[1] < max_len:
        pad_width = max_len - mel_db.shape[1]
        mel_db = np.pad(mel_db, pad_width=((0, 0), (0, pad_width)))
    else:
        mel_db = mel_db[:, :max_len]
    return mel_db

def get_emotion_from_filename(filename):
    # EmoDB emotion in 6th character
    emotions = {
        'W': 'anger', 'L': 'boredom', 'E': 'disgust', 'A': 'fear',
        'F': 'happiness', 'T': 'sadness', 'N': 'neutral'
    }
    return emotions.get(filename[5], 'unknown')

X, y = [], []

for root, _, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith('.wav'):
            file_path = os.path.join(root, file)
            label = get_emotion_from_filename(file)
            feature = extract_features(file_path)
            X.append(feature)
            y.append(label)

X = np.array(X)
le = LabelEncoder()
y = le.fit_transform(y)
y = np.array(y)


In [None]:
class EmotionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)  # (N, 1, 128, 300)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = EmotionDataset(X_train, y_train)
test_data = EmotionDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


In [None]:
class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.bi_gru = nn.GRU(64 * 32, 128, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(128 * 2, num_classes)

    def forward(self, x):
        x = self.cnn(x)  # [B, 64, 32, 75]
        x = x.permute(0, 3, 1, 2).contiguous()  # [B, 75, 64, 32]
        x = x.view(x.size(0), x.size(1), -1)    # [B, 75, 64*32]
        gru_out, _ = self.bi_gru(x)
        x = self.fc(gru_out[:, -1, :])  # Last timestep
        return x


In [None]:
class CenterLoss(nn.Module):
    def __init__(self, num_classes, feat_dim):
        super(CenterLoss, self).__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, features, labels):
        batch_size = features.size(0)
        expanded_centers = self.centers.index_select(0, labels)
        loss = ((features - expanded_centers) ** 2).sum() / 2.0 / batch_size
        return loss


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EmotionModel(num_classes=len(le.classes_)).to(device)
center_loss = CenterLoss(num_classes=len(le.classes_), feat_dim=256).to(device)
ce_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(model.parameters()) + list(center_loss.parameters()), lr=1e-3)

for epoch in range(20):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)

        clf_loss = ce_loss(outputs, y_batch)
        center_feat = outputs
        c_loss = center_loss(center_feat, y_batch)

        loss = clf_loss + 0.1 * c_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


RuntimeError: The size of tensor a (7) must match the size of tensor b (256) at non-singleton dimension 1

In [None]:
class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.bi_gru = nn.GRU(64 * 32, 128, batch_first=True, bidirectional=True)
        self.embedding = nn.Linear(128 * 2, 256)  # Feature for center loss
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)                          # -> [B, 64, 32, 75]
        x = x.permute(0, 3, 1, 2).contiguous()   # -> [B, 75, 64, 32]
        x = x.view(x.size(0), x.size(1), -1)     # -> [B, 75, 64*32]
        gru_out, _ = self.bi_gru(x)
        last_step = gru_out[:, -1, :]            # -> [B, 256]
        embedding = self.embedding(last_step)    # -> [B, 256]
        output = self.classifier(embedding)      # -> [B, num_classes]
        return output, embedding


In [None]:
for epoch in range(20):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs, features = model(X_batch)  # unpack logits and embedding

        clf_loss = ce_loss(outputs, y_batch)
        c_loss = center_loss(features, y_batch)
        loss = clf_loss + 0.1 * c_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


ValueError: too many values to unpack (expected 2)

In [None]:
def forward(self, x):
    x = self.cnn(x)                          # -> [B, 64, 32, 75]
    x = x.permute(0, 3, 1, 2).contiguous()   # -> [B, 75, 64, 32]
    x = x.view(x.size(0), x.size(1), -1)     # -> [B, 75, 64*32]
    gru_out, _ = self.bi_gru(x)
    last_step = gru_out[:, -1, :]            # -> [B, 256]
    embedding = self.embedding(last_step)    # -> [B, 256]
    output = self.classifier(embedding)      # -> [B, num_classes]
    return output, embedding                 # ✅ return both


In [None]:
class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.bi_gru = nn.GRU(64 * 32, 128, batch_first=True, bidirectional=True)
        self.embedding = nn.Linear(128 * 2, 256)  # Feature vector
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)                          # -> [B, 64, 32, 75]
        x = x.permute(0, 3, 1, 2).contiguous()   # -> [B, 75, 64, 32]
        x = x.view(x.size(0), x.size(1), -1)     # -> [B, 75, 64*32]
        gru_out, _ = self.bi_gru(x)
        last_step = gru_out[:, -1, :]            # -> [B, 256]
        embedding = self.embedding(last_step)    # -> [B, 256]
        output = self.classifier(embedding)      # -> [B, num_classes]
        return output, embedding                 # ✅ return both


In [None]:
import torch.nn as nn

class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.bi_gru = nn.GRU(64 * 32, 128, batch_first=True, bidirectional=True)
        self.embedding = nn.Linear(256, 256)  # 128*2 = 256
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)                          # shape: [B, 64, 32, T]
        x = x.permute(0, 3, 1, 2).contiguous()   # shape: [B, T, 64, 32]
        x = x.view(x.size(0), x.size(1), -1)     # shape: [B, T, 64*32]
        gru_out, _ = self.bi_gru(x)
        last_hidden = gru_out[:, -1, :]          # shape: [B, 256]
        features = self.embedding(last_hidden)   # shape: [B, 256]
        output = self.classifier(features)       # shape: [B, num_classes]
        return output, features                  # ✅ return BOTH


In [None]:
X_batch, y_batch = next(iter(train_loader))
X_batch = X_batch.to(device)

out = model(X_batch)
print("Model output type:", type(out))
print("Output length if tuple:", len(out) if isinstance(out, tuple) else "Not a tuple")


Model output type: <class 'torch.Tensor'>
Output length if tuple: Not a tuple


In [None]:
import torch.nn as nn

class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2))
        )
        self.bi_gru = nn.GRU(64 * 32, 128, batch_first=True, bidirectional=True)
        self.embedding = nn.Linear(256, 256)
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)                          # [B, 64, 32, T]
        x = x.permute(0, 3, 1, 2).contiguous()   # [B, T, 64, 32]
        x = x.view(x.size(0), x.size(1), -1)     # [B, T, 2048]
        gru_out, _ = self.bi_gru(x)
        last_hidden = gru_out[:, -1, :]          # [B, 256]
        features = self.embedding(last_hidden)   # [B, 256]
        output = self.classifier(features)       # [B, num_classes]
        return output, features                  # ✅ THIS LINE IS CRUCIAL


In [None]:
import torch
import torch.nn as nn

# Initialize model, loss functions, and optimizer
num_classes = len(le.classes_)  # from your label encoder
model = EmotionModel(num_classes=num_classes).to(device)
center_loss = CenterLoss(num_classes=num_classes, feat_dim=256).to(device)
ce_loss = nn.CrossEntropyLoss()

# Optimizer (combined for model + center loss)
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(center_loss.parameters()),
    lr=1e-3
)

# Training Loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        outputs, features = model(X_batch)                 # ⚠️ make sure model returns two values!
        loss_ce = ce_loss(outputs, y_batch)                # Cross-Entropy Loss
        loss_center = center_loss(features, y_batch)       # Center Loss

        loss = loss_ce + 0.1 * loss_center                 # Combine losses
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {total_loss:.4f} - Accuracy: {accuracy:.2f}%")


Epoch [1/20] - Loss: 192.6909 - Accuracy: 20.33%
Epoch [2/20] - Loss: 182.1084 - Accuracy: 23.60%
Epoch [3/20] - Loss: 178.3149 - Accuracy: 23.60%
Epoch [4/20] - Loss: 173.9073 - Accuracy: 23.60%
Epoch [5/20] - Loss: 169.9091 - Accuracy: 23.83%
Epoch [6/20] - Loss: 164.0440 - Accuracy: 27.57%
Epoch [7/20] - Loss: 152.8246 - Accuracy: 36.92%
Epoch [8/20] - Loss: 142.9655 - Accuracy: 39.25%
Epoch [9/20] - Loss: 131.3024 - Accuracy: 42.52%
Epoch [10/20] - Loss: 125.3307 - Accuracy: 45.09%
Epoch [11/20] - Loss: 116.3570 - Accuracy: 49.07%
Epoch [12/20] - Loss: 115.0875 - Accuracy: 48.13%
Epoch [13/20] - Loss: 109.4796 - Accuracy: 51.87%
Epoch [14/20] - Loss: 105.0566 - Accuracy: 48.83%
Epoch [15/20] - Loss: 96.8737 - Accuracy: 53.27%
Epoch [16/20] - Loss: 95.1821 - Accuracy: 51.64%
Epoch [17/20] - Loss: 96.2320 - Accuracy: 52.57%
Epoch [18/20] - Loss: 92.7781 - Accuracy: 49.77%
Epoch [19/20] - Loss: 83.6897 - Accuracy: 55.61%
Epoch [20/20] - Loss: 73.7989 - Accuracy: 60.98%


In [None]:
import os
import glob

data_path = "/content/drive/MyDrive/wav"
audio_files = glob.glob(os.path.join(data_path, "*.wav"))

# Extract emotion from the 6th character of each filename
def extract_emotion(filename):
    emotion_map = {
        'W': 'anger',
        'L': 'boredom',
        'E': 'disgust',
        'A': 'fear',
        'F': 'happiness',
        'T': 'sadness',
        'N': 'neutral'
    }
    emotion_code = os.path.basename(filename)[5]  # 6th character
    return emotion_map.get(emotion_code, 'unknown')

labels = [extract_emotion(f) for f in audio_files]


In [None]:
import torch
import numpy as np
import librosa
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

class EmotionDataset(Dataset):
    def __init__(self, file_paths, labels, sr=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(labels)

    def extract_features(self, file_path):
        y, sr = librosa.load(file_path, sr=self.sr)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        if mfcc.shape[1] < 64:
            pad_width = 64 - mfcc.shape[1]
            mfcc = np.pad(mfcc, pad_width=((0,0),(0,pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :64]
        return mfcc

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        x = self.extract_features(self.file_paths[idx])
        x = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # shape: (1, 40, 64)
        y = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        return x, y


In [None]:
from torch.utils.data import random_split, DataLoader

full_dataset = EmotionDataset(audio_files, labels)

# Split into train and val
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Save label encoder for decoding predictions later
le = full_dataset.label_encoder


In [None]:
class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
        )

        # we'll calculate the CNN output shape using a dummy tensor
        dummy_input = torch.zeros(1, 1, 40, 64)
        cnn_out = self.cnn(dummy_input)  # (1, C, H, W)
        _, C, H, W = cnn_out.shape
        self.rnn_input_size = C * H

        self.gru = nn.GRU(input_size=self.rnn_input_size, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)

        self.embedding = nn.Linear(128 * 2, 256)  # Bi-GRU -> 2*128
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)                             # (B, C, H, W)
        x = x.permute(0, 3, 1, 2)                   # (B, W, C, H)
        x = x.contiguous().view(x.size(0), x.size(1), -1)  # (B, W, C*H)

        gru_out, _ = self.gru(x)                    # (B, W, 256)
        x = gru_out[:, -1, :]                       # Last time step

        feat = self.embedding(x)
        logits = self.classifier(feat)
        return logits, feat


In [None]:
import torch.nn as nn
import torch

class EmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
        )

        # dynamically calculate input size for GRU
        dummy_input = torch.zeros(1, 1, 40, 64)
        cnn_out = self.cnn(dummy_input)
        _, C, H, W = cnn_out.shape
        self.rnn_input_size = C * H

        self.gru = nn.GRU(input_size=self.rnn_input_size, hidden_size=128, num_layers=1,
                          batch_first=True, bidirectional=True)

        self.embedding = nn.Linear(128 * 2, 256)  # Bi-GRU output
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.cnn(x)  # (B, C, H, W)
        x = x.permute(0, 3, 1, 2).contiguous()  # (B, W, C, H)
        x = x.view(x.size(0), x.size(1), -1)  # (B, W, C*H)

        gru_out, _ = self.gru(x)  # (B, W, 256)
        x = gru_out[:, -1, :]  # Last time step

        feat = self.embedding(x)
        logits = self.classifier(feat)
        return logits, feat



In [None]:
import torch
import torch.nn as nn

# Initialize model, loss functions, and optimizer
num_classes = len(le.classes_)  # from your label encoder
model = EmotionModel(num_classes=num_classes).to(device)
center_loss = CenterLoss(num_classes=num_classes, feat_dim=256).to(device)
ce_loss = nn.CrossEntropyLoss()

# Optimizer (combined for model + center loss)
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(center_loss.parameters()),
    lr=1e-3
)

# Training Loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        outputs, features = model(X_batch)                 # ⚠️ make sure model returns two values!
        loss_ce = ce_loss(outputs, y_batch)                # Cross-Entropy Loss
        loss_center = center_loss(features, y_batch)       # Center Loss

        loss = loss_ce + 0.1 * loss_center                 # Combine losses
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {total_loss:.4f} - Accuracy: {accuracy:.2f}%")




Epoch [1/30] - Loss: 186.2077 - Accuracy: 30.37%
Epoch [2/30] - Loss: 160.9896 - Accuracy: 37.15%
Epoch [3/30] - Loss: 143.9675 - Accuracy: 43.22%
Epoch [4/30] - Loss: 136.1429 - Accuracy: 44.86%
Epoch [5/30] - Loss: 124.5040 - Accuracy: 54.44%
Epoch [6/30] - Loss: 115.4726 - Accuracy: 57.01%
Epoch [7/30] - Loss: 108.1930 - Accuracy: 63.55%
Epoch [8/30] - Loss: 98.9416 - Accuracy: 69.39%
Epoch [9/30] - Loss: 89.7084 - Accuracy: 73.83%
Epoch [10/30] - Loss: 81.4851 - Accuracy: 80.37%
Epoch [11/30] - Loss: 75.9040 - Accuracy: 80.84%
Epoch [12/30] - Loss: 66.5914 - Accuracy: 85.98%
Epoch [13/30] - Loss: 55.2700 - Accuracy: 92.99%
Epoch [14/30] - Loss: 47.8448 - Accuracy: 95.79%
Epoch [15/30] - Loss: 42.7706 - Accuracy: 95.56%
Epoch [16/30] - Loss: 35.9341 - Accuracy: 97.66%
Epoch [17/30] - Loss: 30.7972 - Accuracy: 98.36%
Epoch [18/30] - Loss: 25.4830 - Accuracy: 99.07%
Epoch [19/30] - Loss: 21.2480 - Accuracy: 99.53%
Epoch [20/30] - Loss: 18.0824 - Accuracy: 99.30%
Epoch [21/30] - Loss: 

In [None]:
from sklearn.metrics import classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs, _ = model(X_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

decoded_preds = le.inverse_transform(all_preds)
decoded_labels = le.inverse_transform(all_labels)

print(classification_report(decoded_labels, decoded_preds))


              precision    recall  f1-score   support

       anger       0.86      0.86      0.86        21
     boredom       0.53      0.83      0.65        12
     disgust       0.43      0.38      0.40         8
        fear       0.78      0.70      0.74        20
   happiness       0.69      0.56      0.62        16
     neutral       0.63      0.71      0.67        17
     sadness       0.90      0.69      0.78        13

    accuracy                           0.70       107
   macro avg       0.69      0.68      0.67       107
weighted avg       0.72      0.70      0.70       107



In [None]:
torch.save(model.state_dict(), "emotion_model.pth")

import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)


In [None]:
def predict_emotion(file_path, model, le, device):
    model.eval()
    y, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    if mfcc.shape[1] < 64:
        pad_width = 64 - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0,0),(0,pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :64]

    x = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

    with torch.no_grad():
        output, _ = model(x)
        pred_idx = torch.argmax(output, dim=1).item()
        emotion = le.inverse_transform([pred_idx])[0]

    return emotion


In [None]:
file_path = "/content/drive/MyDrive/wav/03a01Fa.wav"
emotion = predict_emotion(file_path, model, le, device)
print("Predicted Emotion:", emotion)


Predicted Emotion: happiness


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_BiGRU(nn.Module):
    def __init__(self, num_classes):
        super(CNN_BiGRU, self).__init__()

        # 2D CNN layers
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Reduces size by 2

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        # After conv layers: (assuming input 1x40x173 MFCC, pool 3 times → 64x5x21)
        self.gru_input_size = 64 * 5  # channels * freq
        self.bi_gru = nn.GRU(input_size=self.gru_input_size, hidden_size=128, num_layers=1,
                             batch_first=True, bidirectional=True)

        self.embedding = nn.Linear(256, 128)  # 128*2 (bidirectional) → 128

        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.cnn(x)  # x: [B, C, F, T]

        # Prepare for GRU: [B, C, F, T] → [B, T, C*F]
        x = x.permute(0, 3, 1, 2)  # [B, T, C, F]
        x = x.reshape(x.size(0), x.size(1), -1)  # [B, T, C*F]

        # GRU
        gru_out, _ = self.bi_gru(x)  # [B, T, 256]
        last_step = gru_out[:, -1, :]  # Take last time step [B, 256]

        feature = self.embedding(last_step)  # [B, 128]
        output = self.classifier(feature)    # [B, num_classes]

        return output, feature  # Return both for center loss


In [None]:
class CenterLoss(nn.Module):
    def __init__(self, num_classes, feat_dim):
        super(CenterLoss, self).__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, features, labels):
        batch_size = features.size(0)
        centers_batch = self.centers.index_select(0, labels)
        loss = ((features - centers_batch) ** 2).sum() / 2.0 / batch_size
        return loss


In [None]:
# Assuming 'full_dataset' is already defined
from torch.utils.data import random_split, DataLoader

train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

train_set, val_set, test_set = random_split(full_dataset, [train_size, val_size, test_size])
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)


In [None]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs, _ = model(X_batch)  # Get logits
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Compute test accuracy
test_acc = accuracy_score(all_labels, all_preds)
print(f"✅ Test Accuracy: {test_acc * 100:.2f}%")


✅ Test Accuracy: 96.30%


In [None]:
file_path = "/content/drive/MyDrive/wav/03a01Fa.wav"
emotion = predict_emotion(file_path, model, le, device)
print("Predicted Emotion:", emotion)

Predicted Emotion: happiness


In [None]:
from google.colab import files

uploaded = files.upload()


Saving 03-01-01-01-01-01-06.wav to 03-01-01-01-01-01-06.wav


In [None]:
import librosa

y, sr = librosa.load("03-01-01-01-01-01-06.wav", sr=None)
print(f"Sample Rate: {sr}, Duration: {len(y)/sr:.2f}s, Channels: {'Mono' if len(y.shape)==1 else 'Stereo'}")


Sample Rate: 48000, Duration: 3.34s, Channels: Mono


In [None]:
y, sr = librosa.load("03-01-01-01-01-01-06.wav", sr=16000)  # Resample to 16 kHz


In [None]:
def extract_features(file_path, max_pad_len=173):
    y, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    if mfcc.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]

    return mfcc


In [None]:
features = extract_features("03-01-01-01-01-01-06.wav")
input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)


In [None]:
print("Input Shape:", input_tensor.shape)  # Should be [1, 1, 40, 173]


Input Shape: torch.Size([1, 1, 40, 173])


In [None]:
model.eval()
with torch.no_grad():
    output, _ = model(input_tensor)
    predicted_label = torch.argmax(output, dim=1).item()

label_map = {
    0: 'Neutral',
    1: 'Calm',
    2: 'Happy',
    3: 'Sad',
    4: 'Angry',
    5: 'Fearful',
    6: 'Disgust',
    7: 'Surprised'
}

print("Predicted Emotion:", label_map[predicted_label])


Predicted Emotion: Neutral


In [None]:
import torch
import numpy as np
import librosa
from google.colab import files

# Step 1: Upload a local audio file
uploaded = files.upload()
filename = list(uploaded.keys())[0]  # Get uploaded file name

# Step 2: Define MFCC extraction function
def extract_features(file_path, max_pad_len=173):
    y, sr = librosa.load(file_path, sr=16000)  # Resample to 16kHz
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    # Pad or truncate to fixed shape
    if mfcc.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]

    return mfcc

# Step 3: Process the uploaded audio file
features = extract_features(filename)
input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)  # Shape: [1, 1, 40, 173]

# Step 4: Predict using the trained model
model.eval()
with torch.no_grad():
    outputs = model(input_tensor)
    if isinstance(outputs, tuple):
        outputs = outputs[0]  # Handle (logits, features) output
    predicted_label = torch.argmax(outputs, dim=1).item()

# Step 5: Map label to emotion name
label_map = {
    0: 'Neutral',
    1: 'Calm',
    2: 'Happy',
    3: 'Sad',
    4: 'Angry',
    5: 'Fearful',
    6: 'Disgust',
    7: 'Surprised'
}

# Step 6: Print result
print(f"\n🎧 Predicted Emotion: {label_map.get(predicted_label, 'Unknown')} (Label {predicted_label})")


Saving 03-01-01-01-01-01-01.wav to 03-01-01-01-01-01-01.wav

🎧 Predicted Emotion: Sad (Label 3)


In [None]:
import torch
import numpy as np
import librosa
from google.colab import files

# Step 1: Upload a local audio file
uploaded = files.upload()
filename = list(uploaded.keys())[0]  # Get uploaded file name

# Step 2: Define MFCC extraction function
def extract_features(file_path, max_pad_len=173):
    y, sr = librosa.load(file_path, sr=16000)  # Resample to 16kHz
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    # Pad or truncate to fixed shape
    if mfcc.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]

    return mfcc

# Step 3: Process the uploaded audio file
features = extract_features(filename)
input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)  # Shape: [1, 1, 40, 173]

# Step 4: Predict using the trained model
model.eval()
with torch.no_grad():
    outputs = model(input_tensor)
    if isinstance(outputs, tuple):
        outputs = outputs[0]  # Handle (logits, features) output
    predicted_label = torch.argmax(outputs, dim=1).item()

# Step 5: Map label to emotion name
label_map = {
    0: 'Neutral',
    1: 'Calm',
    2: 'Happy',
    3: 'Sad',
    4: 'Angry',
    5: 'Fearful',
    6: 'Disgust',
    7: 'Surprised'
}

# Step 6: Print result
print(f"\n🎧 Predicted Emotion: {label_map.get(predicted_label, 'Unknown')} (Label {predicted_label})")


Saving 03-01-01-01-01-01-06.wav to 03-01-01-01-01-01-06 (1).wav

🎧 Predicted Emotion: Neutral (Label 0)
