In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio
from sklearn.preprocessing import OneHotEncoder

import torch
import torchaudio
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from torchvision.models import resnet18
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

from google.colab import drive
import os
# Make the user's Google Drive visible to this Colab runtime under /content/drive
drive.mount('/content/drive')

# Select the compute device: the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Mounted at /content/drive


In [17]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import torch
import os
import torchaudio
import torch.nn.functional as F
from IPython.display import display, Audio, HTML

# Location on Drive of the fine-tuned classification checkpoint
model_save_path = os.path.join("/content/drive/My Drive", "Final_Model_Aug")

# The feature extractor comes from the public base checkpoint, while the
# classifier weights come from the Drive directory above.
# NOTE(review): presumably the extractor settings match those used when the
# checkpoint was fine-tuned -- confirm.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# nn.Module.eval() returns the module itself, so load and switch to
# evaluation mode in a single expression.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_save_path).eval()

def preprocess_audio(audio_path):
    """Load an audio file and convert it into input values for the model.

    The clip is read with ``torchaudio.load``, averaged down to one channel
    when it has several, resampled when its rate differs from
    ``feature_extractor.sampling_rate``, and finally passed through the
    module-level ``feature_extractor`` with ``padding=True``,
    ``truncation=True`` and ``max_length=40000``.

    Args:
        audio_path: Path of the audio file handed to ``torchaudio.load``.

    Returns:
        ``input_values`` produced by the feature extractor with its leading
        dimension squeezed away (presumably a 1-D ``[sequence_length]``
        tensor for a single clip -- confirm against the extractor output).
    """
    signal, source_rate = torchaudio.load(audio_path)
    target_rate = feature_extractor.sampling_rate

    # Down-mix multi-channel recordings by averaging across the channel axis
    has_multiple_channels = signal.size(0) > 1
    if has_multiple_channels:
        signal = signal.mean(dim=0, keepdim=True)

    # Only resample when the file's rate differs from the extractor's rate
    if source_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=source_rate,
            new_freq=target_rate,
        )
        signal = to_target_rate(signal)

    # The extractor receives a NumPy array with all singleton axes removed
    samples = signal.squeeze().numpy()
    encoded = feature_extractor(
        samples,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=40000,  # fixed cap on the number of samples kept
    )

    # Drop the leading axis; the caller adds a batch axis back before inference
    return encoded.input_values.squeeze(0)

# Recordings to classify; all of them sit in the same Drive folder
wav_file_paths = [
    os.path.join("/content/drive/My Drive/Final_Model_Aug", recording_name)
    for recording_name in (
        "Hadar-angry.wav",
        "Hadar-happy (1).wav",
        "Hadar-happy (2).wav",
        "Hadar-NEU.wav",
    )
]

# Choose where inference runs (GPU when PyTorch can see one, else CPU) and
# move the classifier's parameters there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Emotion names, indexed by the class id the classifier predicts.
# NOTE(review): presumably this order matches the label ids used when the
# checkpoint was fine-tuned -- verify against the training code.
label_names = [
    "neutral",
    "happy",
    "fear",
    "angry",
    "disgust",
    "sad",
]

# Classify each recording, then show its player, the winning label and the
# full per-class probability breakdown.
for wav_file_path in wav_file_paths:
    # Add a leading batch axis to the preprocessed clip and move it to the
    # same device as the model
    input_values = preprocess_audio(wav_file_path).unsqueeze(0).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(input_values)
        logits = outputs.logits
        # Squeeze away singleton axes so softmax runs over the class scores
        probabilities = F.softmax(logits.squeeze(), dim=0)

    # The highest-scoring class index selects the human-readable label
    predicted_id = logits.argmax(dim=-1).item()
    predicted_label = label_names[predicted_id]

    # Heading with the prediction, followed by an inline audio player
    display(
        HTML(f"<h3>Predicted emotion for '{os.path.basename(wav_file_path)}': {predicted_label}</h3>"),
        Audio(wav_file_path),
    )

    # Per-class probabilities as percentages, one line per emotion
    prob_percentages = (probabilities * 100).tolist()
    for label, prob in zip(label_names, prob_percentages):
        print("{}: {:.2f}%".format(label, prob))

    # Visual separator between recordings
    print("-" * 50)


Output hidden; open in https://colab.research.google.com to view.