In [37]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa
import torch
import pandas as pd
import numpy as np
import os
import random
import pickle
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [38]:
#dataset
unseen_emotion = "sad"
folder = r"/content/drive/MyDrive/Audio_Speech_Actors_01-24/Actor_01"
path = r"/content/drive/MyDrive/Audio_Speech_Actors_01-24/Actor_01"
# where emotion word embeddings are stored
fasttext_folder='/content/drive/MyDrive/emotion_vectors'
#where to save the model
model_save = 'advanced_embedding_mapper.pth'
#where the pickle is saved
pickle_path = '/content/drive/MyDrive/USER/fileTensorDict.pckl'

In [39]:
def get_emotion_vector(filename):
    parts = filename.split('-')
    third_number = parts[2]

    emotion_vector_label = None
    if third_number == '05':
        emotion_vector_label = 'angry'
    elif third_number == '02':
        emotion_vector_label = 'calm'
    elif third_number == '07':
        emotion_vector_label = 'disgust'
    elif third_number == '06':
        emotion_vector_label = 'fearful'
    elif third_number == '03':
        emotion_vector_label = 'happy'
    elif third_number == '01':
        emotion_vector_label = 'neutral'
    elif third_number == '04':
        emotion_vector_label = 'sad'
    elif third_number == '08':
        emotion_vector_label = 'surprised'

    return emotion_vector_label

In [40]:
def addToDict(folder):
  emo_dict = {}
  for file in os.listdir(folder):
      emo_dict[file] = get_emotion_vector(file)
  str_emo_dict = str(emo_dict)
  with open("vectors.txt", "a") as vec:
    vec.write(str_emo_dict)

for x in os.listdir(folder):
  addToDict(folder)

In [41]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
def get_vector_from_audio(path):
  audio_input, sampling_rate = librosa.load(path, sr=16000)
  inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
  with torch.no_grad():
    outputs = model(**inputs)

  hidden_states = outputs.last_hidden_state

  vector_rep = torch.mean(hidden_states, dim=1)
  return vector_rep



In [43]:
def get_embeddings():
  filename_vector_dict = {}
  # path = r"/content/drive/MyDrive/Audio_Speech_Actors_01-24/Actor_01"
  for f in os.listdir(path):
    file_path = os.path.join(path,f)
    emov = get_vector_from_audio(file_path)
    filename_vector_dict[f] = emov
  return filename_vector_dict

In [44]:
def load_fasttext_embedding(emotion_label, fasttext_folder):
  filepath = os.path.join(fasttext_folder, f'{emotion_label}.txt')
  if not os.path.exists(filepath):
      raise FileNotFoundError(f"Embedding file for {emotion_label} not found in {folder}")

  embedding = []
  with open(filepath, 'r') as file:
      for line in file:
          embedding.append(float(line.strip()))

  return embedding


In [45]:
def map_fasttext_to_wav2vec(wav2vec_dict, fasttext_folder='emotion_vectors'):
    fasttext_vector_dict = {}
    for filename, wav2vec_embedding in wav2vec_dict.items():
        emotion_label = get_emotion_vector(filename)
        fasttext_embedding = load_fasttext_embedding(emotion_label, fasttext_folder)
        fasttext_embedding = torch.tensor(fasttext_embedding, dtype=torch.float32)
        fasttext_vector_dict[filename] = (wav2vec_embedding, fasttext_embedding)
    return fasttext_vector_dict

In [46]:
# filename_vector_dict = get_embeddings()
# vector_map = map_fasttext_to_wav2vec(filename_vector_dict,fasttext_folder)

In [47]:
def check_or_create_vector_map(pickle_path, fasttext_folder):
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)
            if 'vector_map' in data:
                print("vector_map loaded from pickle file.")
                return data['vector_map']

    # If the file does not exist or vector_map is not in the file, create it
    filename_vector_dict = get_embeddings()
    vector_map = map_fasttext_to_wav2vec(filename_vector_dict, fasttext_folder)

    # Save the vector_map to the pickle file
    with open(pickle_path, 'wb') as f:
        pickle.dump({'vector_map': vector_map}, f)
    print("vector_map created and saved to pickle file.")

    return vector_map

vector_map = check_or_create_vector_map(pickle_path, fasttext_folder)

vector_map loaded from pickle file.


In [48]:
def split_sets(dictionary, unseen_emotion, train_ratio=0.8, seed=42):
    # Set the random seed for reproducibility
    random.seed(seed)

    # Separate keys for the unseen emotion and other emotions
    unseen_keys = [key for key in dictionary.keys() if get_emotion_vector(key) == unseen_emotion]
    filtered_keys = [key for key in dictionary.keys() if key not in unseen_keys]

    # Shuffle the filtered keys
    random.shuffle(filtered_keys)

    # Calculate the number of training samples needed from the filtered data
    total_samples = len(dictionary)
    num_train_samples = int(total_samples * train_ratio)
    num_test_samples = total_samples - num_train_samples

    # Adjust the number of test samples from the filtered data
    num_test_samples_from_filtered = num_test_samples - len(unseen_keys)

    # Ensure there are enough samples in the filtered data
    if num_test_samples_from_filtered < 0:
        raise ValueError("Not enough samples in the filtered data to maintain the overall split ratio.")

    # Split the filtered keys into training and test sets
    train_keys = filtered_keys[:num_train_samples]
    test_keys = filtered_keys[num_train_samples:num_train_samples + num_test_samples_from_filtered]

    # Create training and test dictionaries from the filtered data
    train_dict = {key: dictionary[key] for key in train_keys}
    test_dict = {key: dictionary[key] for key in test_keys}

    # Add the unseen emotion samples to the test dictionary
    test_dict.update({key: dictionary[key] for key in unseen_keys})

    # Check for overlaps
    train_keys_set = set(train_dict.keys())
    test_keys_set = set(test_dict.keys())
    overlapping_keys = train_keys_set & test_keys_set
    if overlapping_keys:
        raise ValueError(f"Overlapping filenames found between training and test sets: {overlapping_keys}")


    return train_dict, test_dict

# Example usage
train_dict, test_dict = split_sets(vector_map, unseen_emotion)

# Check the counts
print("Training samples:", len(train_dict))
print("Test samples:", len(test_dict))

# Ensure no unseen emotion samples in the training set
print("Unseen emotion in training set:", any(get_emotion_vector(key) == unseen_emotion for key in train_dict.keys()))
print("Unseen emotion in test set:", any(get_emotion_vector(key) == unseen_emotion for key in test_dict.keys()))
print("Unseen emotion:", unseen_emotion)


Training samples: 48
Test samples: 12
Unseen emotion in training set: False
Unseen emotion in test set: True
Unseen emotion: sad


In [49]:
class AdvancedEmbeddingMapper(nn.Module):
    def __init__(self):
        super(AdvancedEmbeddingMapper, self).__init__()
        self.fc1 = nn.Linear(768, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 300)
        self.dropout = nn.Dropout(0.3)  # Increase dropout rate for better regularization
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = x.transpose(1, 2)
        x = self.bn1(x)
        x = x.transpose(1, 2)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = x.transpose(1, 2)
        x = self.bn2(x)
        x = x.transpose(1, 2)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc3(x)
        x = x.transpose(1, 2)
        x = self.bn3(x)
        x = x.transpose(1, 2)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc4(x)
        return x

In [50]:
class CNNEmbeddingMapper(nn.Module):

  def __init__(self):
    super(CNNEmbeddingMapper, self).__init__()
    self.conv1 = nn.Conv1d(in_channels=768, out_channels=512, kernel_size=3, padding=1)
    self.bn1 = nn.BatchNorm1d(512)
    self.conv2 = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3, padding=1)
    self.bn2 = nn.BatchNorm1d(256)
    self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
    self.bn3 = nn.BatchNorm1d(128)
    self.conv4 = nn.Conv1d(in_channels=128, out_channels=300, kernel_size=3, padding=1)
    self.dropout = nn.Dropout(0.5)
    self.relu = nn.ReLU()


  def forward(self, x):

    x = x.transpose(1, 2)
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.dropout(x)

    x = self.conv2(x)
    x = self.bn2(x)
    x = self.relu(x)
    x = self.dropout(x)

    x = self.conv3(x)
    x = self.bn3(x)
    x = self.relu(x)
    x = self.dropout(x)

    x = self.conv4(x)
    x = x.transpose(1, 2)
    return x

In [51]:
def create_dataloader(data_dict, batch_size=32, shuffle=True):
    wav2vec_tensors = []
    fasttext_tensors = []

    for key in data_dict:
        wav2vec_tensors.append(data_dict[key][0])
        fasttext_tensors.append(data_dict[key][1])

    X = torch.stack(wav2vec_tensors)
    Y = torch.stack(fasttext_tensors)

    dataset = TensorDataset(X, Y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader

train_dataloader = create_dataloader(train_dict)
test_dataloader = create_dataloader(test_dict, shuffle=False)

In [54]:
def create_dataloader_cnn(data_dict, batch_size=32, shuffle=True):
    wav2vec_tensors = []
    fasttext_tensors = []

    for key in data_dict:
        wav2vec_tensors.append(data_dict[key][0])
        fasttext_tensors.append(data_dict[key][1])

    X = torch.stack(wav2vec_tensors)
    Y = torch.stack(fasttext_tensors)

    dataset = TensorDataset(X, Y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader

train_dataloader_cnn = create_dataloader_cnn(train_dict)
test_dataloader_cnn = create_dataloader_cnn(test_dict, shuffle=False)

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [53]:
model = AdvancedEmbeddingMapper()
CNNmodel = CNNEmbeddingMapper()

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def train_model(model, train_dataloader, num_epochs=500):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch_x, batch_y in train_dataloader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        if (epoch + 1) % 50 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_dataloader):.4f}')
            #evaluate_model_more(model, test_dataloader)

#train_model(model, train_dataloader)
train_model(CNNmodel, train_dataloader_cnn)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [50/500], Loss: 0.1216
Epoch [100/500], Loss: 0.1119
Epoch [150/500], Loss: 0.1136
Epoch [200/500], Loss: 0.1163


KeyboardInterrupt: 

Why do I get **"running_mean should contain 1 elements not 512"** when BatchNorm1d(512) or 256 or 128?

In [None]:
def evaluate_model_more(model, test_dataloader):
    CNNmodel.eval()
    test_loss = 0
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            outputs = model(batch_x).squeeze(1)  # Squeeze to remove singleton dimension
            loss = criterion(outputs, batch_y)
            test_loss += loss.item()

            all_targets.append(batch_y)
            all_predictions.append(outputs)

    # Compute average test loss
    test_loss /= len(test_dataloader)

    # Concatenate all targets and predictions
    all_targets = torch.cat(all_targets).cpu().numpy()
    all_predictions = torch.cat(all_predictions).cpu().numpy()

    # Compute additional metrics
    mse = mean_squared_error(all_targets, all_predictions)
    mae = mean_absolute_error(all_targets, all_predictions)
    r2 = r2_score(all_targets, all_predictions)

    print(f'Test Loss: {test_loss:.4f}')
    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'Mean Absolute Error (MAE): {mae:.4f}')
    print(f'R-squared (R²): {r2:.4f}')

evaluate_model_more(CNNmodel, test_dataloader_cnn)

In [None]:
def cosine_similarity(model, target):
  cos = nn.CosineSimilarity(dim=0, eps=1e-6)
  output = cos(model, target)

  return output

# def evaluate_cosine():

#   model = torch.tensor(model(batch_x).squeeze(1))
#   target = torch.tensor(load_fasttext_embedding(unseen_emotion, fasttext_folder))

# cosine_similarity(model, target)

In [None]:
def evaluate_model_cosine(model, test_dataloader):
    model.eval()
    all_cosine_similarities = []

    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            outputs = model(batch_x)  # Pass batch_x to the model
            cosine_sim = cosine_similarity(outputs, batch_y)
            all_cosine_similarities.extend(cosine_sim.cpu().numpy().tolist())  # Collect cosine similarities as scalar floats

    # Compute average cosine similarity
    avg_cosine_similarity = np.mean(all_cosine_similarities)

    print(f'Average Cosine Similarity: {avg_cosine_similarity:.4f}')

# Example usage
# Assuming you have the model and dataloaders defined
evaluate_model_cosine(model, test_dataloader)

In [None]:
input1 = torch.randn(99999999)
input2 = torch.randn(99999999)
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
output = cos(input1, input2)
print(output)