In [1]:
import torch
import torchaudio
import numpy as np
import pandas as pd
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import nn
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Config

2025-02-24 04:43:05.884096: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-24 04:43:05.891668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740352385.900535    8897 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740352385.903215    8897 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 04:43:05.912753: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
class Wav2VecClassifier(nn.Module):
    """Audio classifier built from a pre-trained wav2vec 2.0 encoder and an MLP head.

    The encoder's last hidden states are averaged over dim=1 (the time steps)
    and the pooled vector is mapped to ``num_classes`` logits.

    Args:
        num_classes: Number of logits produced by the classification head.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        # Pre-trained backbone. The attribute names (`wav2vec`, `classifier`)
        # and the layer order inside the head determine the state_dict keys,
        # so they are kept exactly as they were.
        self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

        # Turn off gradients for every parameter of the encoder's
        # `feature_extractor` submodule.
        self.wav2vec.feature_extractor.requires_grad_(False)

        # Width of the encoder output, as reported by the loaded config.
        encoder_dim = self.wav2vec.config.hidden_size

        # Head: Linear -> ReLU -> Dropout -> Linear.
        head_layers = [
            nn.Linear(encoder_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for a batch of inputs.

        Args:
            x: Input handed unchanged to the wav2vec encoder. The smoke test in
                this file feeds a float tensor of shape
                [batch_size, sequence_length].

        Returns:
            The output of the classification head applied to the
            time-averaged encoder hidden states.
        """
        encoder_out = self.wav2vec(x)

        # Mean-pool over dim=1 so each batch item becomes a single vector.
        time_averaged = encoder_out.last_hidden_state.mean(dim=1)

        return self.classifier(time_averaged)

# Let's test the model with random data
def test_model():
    """Smoke-test the classifier on random input and print shapes and logits.

    Builds a 3-class ``Wav2VecClassifier`` in eval mode, feeds it a random
    [2, 16000] tensor (the equivalent of two 1-second clips at 16 kHz) with
    gradients disabled, and prints the input shape, output shape and logits.
    """
    num_clips, num_samples = 2, 16000

    # Build the model before drawing the random input so the order in which
    # random numbers are consumed stays the same.
    classifier = Wav2VecClassifier(num_classes=3)
    classifier.eval()

    # The encoder is fed a [batch_size, sequence_length] tensor.
    fake_audio = torch.randn(num_clips, num_samples)

    with torch.no_grad():
        logits = classifier(fake_audio)

    print("Input shape:", fake_audio.shape)
    print("Output shape:", logits.shape)
    print("Output (logits):", logits)

In [4]:
# Resolve the target device first so the checkpoint can be mapped onto it
# while it is being loaded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2VecClassifier(num_classes=3)

# map_location lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine; without it torch.load tries to restore the tensors onto
# the device they were saved from and fails when CUDA is unavailable.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (or pass weights_only=True on PyTorch versions that support it).
state_dict = torch.load('wav2vec_model.pth', map_location=device)
model.load_state_dict(state_dict)

# Move the model to the inference device and switch to eval mode
# (disables dropout in the classification head).
model = model.to(device)
model.eval()



In [5]:
def preprocess_audio(audio_path, target_sample_rate=16000):
    """Load an audio file as a mono, peak-normalized 1-D NumPy array.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        target_sample_rate: Sample rate the waveform is resampled to when the
            file's own rate differs. Defaults to 16000.

    Returns:
        A 1-D NumPy array of samples scaled so the largest absolute value is
        1.0. An all-zero (silent) clip is returned unchanged rather than
        being divided by zero.

    Raises:
        ValueError: If the loaded audio contains no samples.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # Down-mix multi-channel audio by averaging the channels; keepdim keeps
    # the leading channel axis so the tensor stays 2-D.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample only when the file is not already at the target rate.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
        waveform = resampler(waveform)

    # Drop only the channel axis. A bare squeeze() would also collapse a
    # one-sample clip to a 0-d array.
    samples = waveform.squeeze(0).numpy()

    if samples.size == 0:
        raise ValueError(f"Audio file contains no samples: {audio_path}")

    # Peak-normalize, skipping silent clips whose peak is 0 (dividing would
    # turn every sample into NaN).
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak

    return samples

def test_model_on_audio(model, audio_path, device):
    class_mapping = {
            'Cry': 0,
            'NotScreaming': 1,
            'Screaming': 2
        }
    
    # Preprocess audio
    waveform = preprocess_audio(audio_path)
    input_values = torch.FloatTensor(waveform).unsqueeze(0).to(device)
    
    # Forward pass
    model.eval()
    with torch.no_grad():
        outputs = model(input_values)
    
    # Get predicted class
    _, predicted = torch.max(outputs.data, 1)
    
    # Get class name
    class_mapping = {v: k for k, v in class_mapping.items()}
    predicted_class = class_mapping[predicted.item()]
    
    return predicted_class


In [6]:
# Single-file sanity check: classify one clip from the "Cry" dataset folder
# with the model and device set up in the earlier cells.
audio_path = "Datasets/Cry/0c8f14a9-6999-485b-97a2-913c1cbf099c-1430760379259-1.7-m-26-hu.wav"
predicted_class = test_model_on_audio(
    model=model,
    audio_path=audio_path,
    device=device,
)
print(f"Predicted Class: {predicted_class}")

Predicted Class: Cry
