In [None]:
import moviepy.editor as mp

def extract_audio_from_video(video_path, audio_path):
    """Write the audio track of a video file to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path, handed to ``write_audiofile``.

    Raises:
        ValueError: If the video exposes no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_path!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release whatever the clip keeps open, even when writing fails.
        video.close()

# Example usage: writes the audio of 'meld_video.mp4' to 'meld_audio.wav',
# the file name the feature-extraction examples further down read from.
extract_audio_from_video('meld_video.mp4', 'meld_audio.wav')


In [None]:
import numpy as np
import scipy.io.wavfile as wav
from scipy.signal import spectrogram

def mel_filter_bank(num_filters, fft_size, sample_rate, low_freq=0, high_freq=None):
    """Build a bank of triangular filters spaced evenly on the mel scale.

    Args:
        num_filters: Number of triangular filters (rows of the result).
        fft_size: FFT length; the result has ``fft_size // 2 + 1`` columns.
        sample_rate: Sampling rate in Hz.
        low_freq: Lower edge of the first filter in Hz.
        high_freq: Upper edge of the last filter in Hz; defaults to
            ``sample_rate / 2``.

    Returns:
        Array of shape ``(num_filters, fft_size // 2 + 1)``, one filter per row.
    """
    upper_hz = sample_rate / 2 if high_freq is None else high_freq

    # Hz -> mel for both band edges, then equally spaced mel points. Two more
    # points than filters: every filter needs a left, centre and right edge.
    mel_lo = 2595 * np.log10(1 + low_freq / 700)
    mel_hi = 2595 * np.log10(1 + upper_hz / 700)
    mel_grid = np.linspace(mel_lo, mel_hi, num_filters + 2)

    # mel -> Hz -> FFT bin index.
    hz_grid = 700 * (10**(mel_grid / 2595) - 1)
    edges = np.floor((fft_size + 1) * hz_grid / sample_rate).astype(int)

    bank = np.zeros((num_filters, fft_size // 2 + 1))
    for row, (left, centre, right) in enumerate(zip(edges[:-2], edges[1:-1], edges[2:])):
        # Rising slope: 0 at `left`, climbing towards 1 just below `centre`.
        bank[row, left:centre] = (np.arange(left, centre) - left) / (centre - left)
        # Falling slope: 1 at `centre`, dropping towards 0 just below `right`.
        bank[row, centre:right] = 1 - (np.arange(centre, right) - centre) / (right - centre)

    return bank

def extract_mfsc(audio_path, num_filters=40, fft_size=1024):
    """Compute log mel-frequency spectral coefficients (MFSC) for a WAV file.

    Args:
        audio_path: Path of the WAV file to read.
        num_filters: Number of mel filters (rows of the result).
        fft_size: Spectrogram segment length; segments overlap by half.

    Returns:
        Array of shape ``(num_filters, num_frames)`` holding the natural log
        of the mel-filtered spectrogram.
    """
    sample_rate, signal = wav.read(audio_path)
    if signal.ndim > 1:
        # Multi-channel files come back as (samples, channels). Average the
        # channels so the spectrogram runs along time rather than across the
        # channel axis (which is far shorter than one segment).
        signal = signal.mean(axis=1)
    _, Sxx, _ = spectrogram(signal, fs=sample_rate, nperseg=fft_size, noverlap=fft_size // 2)
    mel_filters = mel_filter_bank(num_filters=num_filters, fft_size=fft_size, sample_rate=sample_rate)
    mfsc = np.dot(mel_filters, Sxx)
    # The small offset keeps log() finite for empty bands.
    return np.log(mfsc + 1e-6)

# Example usage: MFSC features of the extracted audio; `mfsc_features` is the
# input later handed to prepare_input for the CNN.
mfsc_features = extract_mfsc('meld_audio.wav')

from pyAudioAnalysis import audioFeatureExtraction as aF

def extract_plp(audio_path):
    """Return the feature matrix pyAudioAnalysis extracts from ``audio_path``.

    Calls ``aF.feature_extraction_file`` with the constants 16000, 0.050 and
    0.025 and returns the first element of the resulting pair; the second
    element is discarded.

    NOTE(review): the constants are presumably sampling rate (Hz), window and
    step (seconds) -- confirm against the installed pyAudioAnalysis version,
    and confirm that this function exists there and that the features it
    yields are actually PLP coefficients.
    """
    extracted, _ = aF.feature_extraction_file(audio_path, 16000, 0.050, 0.025)
    return extracted

# Example usage.
# NOTE(review): `plp_features` is not referenced again in the visible cells.
plp_features = extract_plp('meld_audio.wav')



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioCNN(nn.Module):
    """Small two-stage convolutional network over single-channel 2-D feature maps.

    Accepts input of shape ``(batch, 1, height, width)`` and returns
    ``(batch, num_classes)``. After the two conv/pool stages the feature map
    is adaptively average-pooled to 16x16, so the fully connected layers work
    for any input size that survives the two 2x2 poolings. A 64x64 input
    already yields 16x16 there, making the adaptive pooling a no-op.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(32 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map ``(batch, 1, H, W)`` input to ``(batch, num_classes)`` scores."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Bring every input to the 16x16 grid fc1 was sized for.
        x = F.adaptive_avg_pool2d(x, (16, 16))
        # Flatten per sample; view(-1, ...) could silently merge or split
        # samples when the spatial size differs from what fc1 expects.
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Example usage with MFSC features
def prepare_input(features):
    """Wrap a feature array as a float32 tensor with batch and channel axes.

    Args:
        features: Array-like of any shape ``S``.

    Returns:
        ``torch.float32`` tensor of shape ``(1, 1, *S)``.
    """
    # Two leading singleton axes: batch first, then channel.
    with_axes = np.asanyarray(features)[np.newaxis, np.newaxis, ...]
    return torch.tensor(with_axes, dtype=torch.float32)

# Build the CNN and put it in evaluation mode.
# NOTE(review): no weights are loaded in the visible cells, so the model runs
# with its initial parameters -- confirm this is intended.
cnn_model = AudioCNN()
cnn_model.eval()
mfsc_tensor = prepare_input(mfsc_features)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    cnn_features = cnn_model(mfsc_tensor)


In [None]:
import torchaudio
from transformers import Wav2Vec2Model, Wav2Vec2Tokenizer

# Pretrained Wav2Vec2 tokenizer and model; extract_high_level_features reads
# these two module-level names when it is called.
# NOTE(review): a later cell rebinds `tokenizer` and `model` to BERT objects,
# so calling extract_high_level_features after that cell would use those instead.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

def extract_high_level_features(audio_path, target_sample_rate=16000):
    """Run the module-level Wav2Vec2 ``model`` on an audio file.

    The audio is averaged down to one channel and resampled to
    ``target_sample_rate`` before being handed to the module-level
    ``tokenizer``.

    Args:
        audio_path: Path of the audio file to load.
        target_sample_rate: Sampling rate in Hz the waveform is converted to
            (presumably what the pretrained checkpoint expects -- confirm
            against its model card).

    Returns:
        The model's ``last_hidden_state`` as a numpy array.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.size(0) > 1:
        # Average the channels; otherwise a stereo file is passed on as a
        # 2-row array instead of a single waveform.
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != target_sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate)
    inputs = tokenizer(waveform.squeeze(0).numpy(), return_tensors="pt", padding="longest")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.numpy()

# Example usage
high_level_features = extract_high_level_features('meld_audio.wav')


In [None]:
from transformers import BertTokenizer, BertModel

# Pretrained BERT tokenizer and encoder; analyze_with_llm reads `model`.
# (The former `preprocess_text(...)` call was dropped: no such function is
# defined in this notebook, and its result was overwritten before any use.)
# NOTE(review): these assignments rebind the `tokenizer` / `model` names set
# by the Wav2Vec2 cell -- distinct names would avoid the clash.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def analyze_with_llm(text_features):
    """Feed tokenised text to the module-level ``model`` without gradients.

    Args:
        text_features: Mapping of keyword arguments unpacked into ``model``.

    Returns:
        The output's ``last_hidden_state`` converted to a numpy array.
    """
    with torch.no_grad():
        hidden = model(**text_features).last_hidden_state
    return hidden.numpy()

# Example usage: tokenise a sample sentence into PyTorch tensors and run it
# through the model via analyze_with_llm.
text_features = tokenizer("example text", return_tensors="pt")
llm_features = analyze_with_llm(text_features)


In [None]:
def combine_features(low_level_features, high_level_features):
    """Join two feature arrays side by side along axis 1.

    Args:
        low_level_features: First array; its columns come first.
        high_level_features: Second array; its columns are appended.

    Returns:
        The concatenation of both inputs along axis 1.
    """
    return np.concatenate((low_level_features, high_level_features), axis=1)

# Example usage
# `llm_features` is a last hidden state with a token axis in the middle
# (batch, tokens, hidden), while the CNN output is 2-D. Average over the
# token axis so both arrays are 2-D before concatenating along axis 1.
combined_features = combine_features(cnn_features.numpy(), llm_features.mean(axis=1))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim

# Assume labels are provided
# NOTE(review): `[...]` is a placeholder (a list holding only Ellipsis); real
# emotion labels must be supplied here -- presumably one per row of
# `combined_features`; confirm before running.
labels = np.array([...])  # Emotion labels
label_encoder = LabelEncoder()
# Map the labels to integer class ids.
y_encoded = label_encoder.fit_transform(labels)

# Train/test split: 20% held out, fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(combined_features, y_encoded, test_size=0.2, random_state=42)
# Features become float32 tensors; targets become long tensors, which is what
# the CrossEntropyLoss used below is given as class indices.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define a simple neural network model
class SimpleNN(nn.Module):
    """Feed-forward classifier: one 128-unit ReLU hidden layer, then a linear output layer.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return the output-layer scores for ``x`` (no softmax applied)."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Classifier sized from the data: input width is the feature count, output
# width is the number of distinct encoded labels.
# NOTE(review): this rebinds `model`, the name analyze_with_llm reads; calling
# that function after this cell would hit the classifier instead.
model = SimpleNN(X_train.shape[1], len(np.unique(y_encoded)))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 10 epochs, each a single gradient step on the whole training
# set (no mini-batching).
model.train()
for epoch in range(10):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/10, Loss: {loss.item()}')

# Testing loop: one forward pass over the held-out set, without gradients.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Index of the largest score along dim 1 is the predicted class id.
    _, predicted = torch.max(outputs, 1)
    # Fraction of test samples whose prediction equals the target.
    accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f'Accuracy: {accuracy}')
