**Import Libraries**

In [2]:
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import speech_recognition as sr

**Define and Load Model**

In [3]:
class AudioStressCNN(nn.Module):
    """1D CNN followed by a bidirectional LSTM and a two-layer classifier head.

    Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2) -> Dropout stages grow the
    channel count 1 -> 64 -> 128 -> 256; the convolutions are padded so only the
    pooling steps shorten the sequence (each one halves it). The pooled sequence
    is passed through a 2-layer bidirectional LSTM, and the LSTM output at the
    last time step is mapped to ``num_classes`` logits.

    Args:
        num_classes: Size of the final linear layer's output (default 7).
    """

    def __init__(self, num_classes=7):
        super().__init__()

        # Convolutional feature extractors (single input channel).
        self.conv1 = nn.Conv1d(1, 64, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)

        # Activation shared by the conv stages and the first FC layer.
        self.relu = nn.ReLU()

        # One batch-norm per conv stage, sized to that stage's output channels.
        self.batch_norm1 = nn.BatchNorm1d(64)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.batch_norm3 = nn.BatchNorm1d(256)

        # Shared down-sampling and regularisation for every conv stage.
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout_cnn = nn.Dropout(0.2)

        # Temporal model over the 256-channel conv features.
        self.lstm = nn.LSTM(
            256,
            128,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )

        # Classifier head; the LSTM emits 2 * 128 features because it is bidirectional.
        self.fc1 = nn.Linear(2 * 128, 256)
        self.dropout_fc = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits for ``x`` of shape (batch, 1, length)."""
        conv_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
            (self.conv3, self.batch_norm3),
        )
        for conv, norm in conv_stages:
            activated = self.relu(norm(conv(x)))
            x = self.dropout_cnn(self.pool(activated))

        # (batch, channels, steps) -> (batch, steps, channels): the LSTM is batch_first.
        sequence = x.transpose(1, 2)
        lstm_out, _ = self.lstm(sequence)

        # Keep only the output at the final time step.
        final_step = lstm_out[:, -1]

        hidden = self.dropout_fc(self.relu(self.fc1(final_step)))
        return self.fc2(hidden)

In [5]:
# Path to the saved weights for AudioStressCNN
model_path = "models/trained_model_modify_cnn_lstm_audio.pth"

model = AudioStressCNN()

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# NOTE(review): assumes the file holds a plain state_dict — confirm against the training code.
state_dict = torch.load(
    model_path,
    map_location=torch.device("cpu"),
    weights_only=True,
)
model.load_state_dict(state_dict)

# Switch to evaluation mode (fixes dropout / batch-norm behaviour for inference)
model.eval()

  model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))


AudioStressCNN(
  (conv1): Conv1d(1, 64, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv2): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (batch_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout_cnn): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=256, bias=True)
  (dropout_fc): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=7, bias=True)
)

**Transcribe Module**

In [6]:
def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` to text.

    The whole file is read with ``Recognizer.record`` and passed to
    ``Recognizer.recognize_google``.

    Args:
        audio_path: Path of the audio file to open with ``sr.AudioFile``.

    Returns:
        The recognised text, ``"Unable to transcribe"`` when the recogniser
        raises ``sr.UnknownValueError``, or ``"Error: <details>"`` when it
        raises ``sr.RequestError``.
    """
    recognizer = sr.Recognizer()

    # Read the complete file into memory before recognition.
    with sr.AudioFile(audio_path) as source:
        recorded = recognizer.record(source)

    try:
        return recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        return "Unable to transcribe"
    except sr.RequestError as err:
        return f"Error: {err}"

**Level Detection Module**

In [9]:
def predict_stress_from_file(audio_path):
    """Predict a stress-level label for the audio file at ``audio_path``.

    The file is loaded at 16 kHz, reduced to a 40-value vector (the mean of each
    MFCC coefficient over time), and classified by the module-level ``model``.
    The model is used as-is; this function does not change its train/eval mode.

    Args:
        audio_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        One of the label strings in ``label_mapping`` below, or ``"Error"`` if
        any step raises (the exception message is printed).
    """
    try:
        # Load and resample to 16 kHz. The rate is named `sample_rate` rather than
        # `sr` so it does not shadow the speech_recognition alias inside this function.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        print(f"Audio file loaded: {audio_path}, Duration: {len(audio_data)/sample_rate:.2f} seconds")

        # 40 MFCCs per frame, averaged over frames -> vector of length 40
        mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        mfcc = np.mean(mfcc.T, axis=0)
        mfcc = np.expand_dims(mfcc, axis=(0, 1))  # Add batch and channel dimensions -> (1, 1, 40)

        # Convert to tensor
        mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32)

        # Inference only: skip autograd bookkeeping
        with torch.no_grad():
            output = model(mfcc_tensor)
        predicted = output.argmax(dim=1)

        # Map predicted class index to stress level
        label_mapping = {0: '1 (Low)', 1: '2 (Low-Mild)', 2: '4 (Mild)', 3: '5 (Moderate)', 4: '6 (Moderate-High)', 5: '8 (high)', 6: '9 (critical)'}

        return label_mapping[predicted.item()]
    except Exception as e:
        # Best-effort: report the problem and return a sentinel instead of raising
        print(f"Error processing file: {e}")
        return "Error"

**Inference**

In [12]:
# Audio clip to analyse
audio_file_path = "dataset/test_data/EB0_anger_10.wav"

# Step 1: speech-to-text
text = transcribe_audio(audio_file_path)
print(f"Audio Text: {text}")

# Step 2: stress-level classification on the same file
predicted_stress_level = predict_stress_from_file(audio_file_path)
print(f"Predicted Stress Level: {predicted_stress_level}")

Audio Text: Unable to transcribe
Audio file loaded: dataset/test_data/EB0_anger_10.wav, Duration: 2.04 seconds
Predicted Stress Level: 6 (Moderate-High)
