In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa
import torch
import pandas as pd
import numpy as np
import os
import random
import pickle
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
!pip install ray
!pip install optuna
from ray import tune, air
from ray.tune.search.optuna import OptunaSearch
from ray.air import session
from ray.air.config import RunConfig, ScalingConfig
from ray.tune import Tuner

Collecting ray
  Downloading ray-2.34.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Downloading ray-2.34.0-cp310-cp310-manylinux2014_x86_64.whl (64.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ray
Successfully installed ray-2.34.0
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━

In [3]:
# --- Notebook configuration -------------------------------------------------
# Emotion label that split_sets() keeps out of training and forces into the
# test split; the empty string matches no label, so nothing is held out.
unseen_emotion = ""
# Directory listed by the vectors.txt dump cell (addToDict).
folder = r"/content/drive/MyDrive/Audio_Speech_Actors_01-24/Actor_01"
# Directory whose audio files get_embeddings() encodes (currently the same location as `folder`).
path = r"/content/drive/MyDrive/Audio_Speech_Actors_01-24/Actor_01"
# where emotion word embeddings are stored (one "<emotion>.txt" file per label)
fasttext_folder='/content/drive/MyDrive/emotion_vectors'
# where to save the model
# NOTE(review): not referenced in the visible cells — confirm it is still used.
model_save = 'advanced_embedding_mapper.pth'
# where the pickle is saved (cache read/written by check_or_create_vector_map)
pickle_path = '/content/drive/MyDrive/USER/fileTensorDict.pckl'
# NOTE(review): split_sets() is called without this value and relies on its own
# default of 420, so changing `seed` here has no effect on the split.
seed = 420

In [4]:
# Emotion code (third dash-separated filename field) -> emotion label.
_EMOTION_BY_CODE = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised',
}


def get_emotion_vector(filename):
    """Return the emotion label encoded in a dash-separated filename.

    The third dash-separated field of ``filename`` is looked up in a fixed
    code table (for example ``'05'`` -> ``'angry'``).

    Args:
        filename: File name such as ``'03-01-05-01-01-02-06.wav'``.

    Returns:
        The emotion label string, or ``None`` when the third field is not a
        known code or the name has fewer than three dash-separated fields.
    """
    parts = filename.split('-')
    # Names without a third field (e.g. stray non-dataset files picked up by
    # os.listdir) carry no emotion code; report them like an unknown code
    # instead of failing with IndexError.
    if len(parts) < 3:
        return None
    return _EMOTION_BY_CODE.get(parts[2])

In [5]:
def load_emotion_vectors(folder):
    """Load every ``*.txt`` embedding file in ``folder`` into float32 tensors.

    Each file is read as one floating-point number per line. Blank lines
    (for example a trailing empty line) are ignored rather than crashing
    the float conversion.

    Args:
        folder: Directory containing the ``<emotion>.txt`` files.

    Returns:
        dict mapping the part of each file name before its first ``.`` to a
        1-D ``torch.float32`` tensor of that file's values. Files without a
        ``.txt`` suffix are skipped.
    """
    emotion_vectors = {}
    for filename in os.listdir(folder):
        if not filename.endswith('.txt'):
            continue
        emotion_name = filename.split('.')[0]
        with open(os.path.join(folder, filename), 'r') as file:
            values = [float(line.strip()) for line in file if line.strip()]
        emotion_vectors[emotion_name] = torch.tensor(values, dtype=torch.float32)
    return emotion_vectors

In [6]:
def addToDict(folder):
  """Append a ``{filename: emotion label}`` mapping for ``folder`` to vectors.txt.

  Every entry returned by ``os.listdir(folder)`` is labelled with
  ``get_emotion_vector`` and the resulting dict is written, as its ``str()``
  form, to ``vectors.txt`` in the current working directory. The file is
  opened in append mode, so earlier contents are kept.
  """
  labels_by_file = {name: get_emotion_vector(name) for name in os.listdir(folder)}
  with open("vectors.txt", "a") as out_file:
    out_file.write(str(labels_by_file))

# Dump the filename -> emotion mapping once. addToDict() already walks the
# whole folder, so calling it inside a per-file loop only appended the same
# dictionary to vectors.txt once for every file in the directory.
addToDict(folder)

In [7]:
# Pretrained wav2vec 2.0 processor and encoder; both are read as globals by
# get_vector_from_audio().
# NOTE(review): the name `model` is rebound to an AdvancedEmbeddingMapper in a
# later cell, so get_vector_from_audio() must not be called after that cell has
# run — consider giving the encoder a distinct name.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def get_vector_from_audio(path):
  """Encode one audio file into a single mean-pooled wav2vec vector.

  The file is loaded with librosa at 16 kHz, passed through the global
  ``processor`` and ``model`` without gradient tracking, and the model's
  ``last_hidden_state`` is averaged over dimension 1.

  Args:
    path: Location of the audio file to encode.

  Returns:
    Tensor holding the mean of ``last_hidden_state`` along dim 1
    (presumably shape (1, hidden_size) for a single clip — TODO confirm).
  """
  waveform, _ = librosa.load(path, sr=16000)
  encoded = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
  with torch.no_grad():
    frame_states = model(**encoded).last_hidden_state
  return torch.mean(frame_states, dim=1)



In [9]:
def get_embeddings():
  """Encode every entry of the global ``path`` directory.

  Returns:
    dict mapping each file name in ``path`` to the tensor produced by
    ``get_vector_from_audio`` for that file.
  """
  return {
      name: get_vector_from_audio(os.path.join(path, name))
      for name in os.listdir(path)
  }

In [10]:
def load_fasttext_embedding(emotion_label, fasttext_folder):
  """Read the embedding stored in ``<fasttext_folder>/<emotion_label>.txt``.

  The file is parsed as one floating-point number per line; blank lines are
  ignored.

  Args:
    emotion_label: Label whose embedding file should be read.
    fasttext_folder: Directory that holds the embedding files.

  Returns:
    list of floats in file order.

  Raises:
    FileNotFoundError: If the embedding file does not exist. The message
      names the folder that was actually searched.
  """
  filepath = os.path.join(fasttext_folder, f'{emotion_label}.txt')
  if not os.path.exists(filepath):
      # Report the folder argument, not the unrelated global `folder`.
      raise FileNotFoundError(f"Embedding file for {emotion_label} not found in {fasttext_folder}")

  with open(filepath, 'r') as file:
      return [float(line.strip()) for line in file if line.strip()]


In [11]:
def map_fasttext_to_wav2vec(wav2vec_dict, fasttext_folder='emotion_vectors'):
    """Pair each audio embedding with the word embedding of its emotion label.

    Args:
        wav2vec_dict: dict mapping file name to its audio embedding.
        fasttext_folder: Directory holding the ``<emotion>.txt`` files.

    Returns:
        dict mapping file name to ``(wav2vec_embedding, fasttext_embedding)``,
        where ``fasttext_embedding`` is a fresh ``torch.float32`` tensor.

    Raises:
        FileNotFoundError: Propagated from ``load_fasttext_embedding`` when a
            label has no embedding file.
    """
    # Each label's file is parsed once instead of once per audio file; a new
    # tensor is still built per entry so entries never share storage.
    values_by_label = {}
    fasttext_vector_dict = {}
    for filename, wav2vec_embedding in wav2vec_dict.items():
        emotion_label = get_emotion_vector(filename)
        if emotion_label not in values_by_label:
            values_by_label[emotion_label] = load_fasttext_embedding(emotion_label, fasttext_folder)
        fasttext_embedding = torch.tensor(values_by_label[emotion_label], dtype=torch.float32)
        fasttext_vector_dict[filename] = (wav2vec_embedding, fasttext_embedding)
    return fasttext_vector_dict

In [12]:
# filename_vector_dict = get_embeddings()
# vector_map = map_fasttext_to_wav2vec(filename_vector_dict,fasttext_folder)

In [13]:
def check_or_create_vector_map(pickle_path, fasttext_folder):
    """Return the cached vector map, building and caching it when absent.

    If ``pickle_path`` exists and the unpickled object contains the key
    ``'vector_map'``, that value is returned. Otherwise the map is rebuilt
    with ``get_embeddings`` and ``map_fasttext_to_wav2vec`` and written to
    ``pickle_path`` under the ``'vector_map'`` key.

    Args:
        pickle_path: Location of the pickle cache file.
        fasttext_folder: Directory of emotion word embeddings, used only
            when the map has to be rebuilt.

    Returns:
        The vector map, either loaded from the cache or newly created.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only point pickle_path at files you created yourself.
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as cache_file:
            cached = pickle.load(cache_file)
        if 'vector_map' in cached:
            print("vector_map loaded from pickle file.")
            return cached['vector_map']

    # Cache missing (or lacking the expected key): rebuild and store it.
    fresh_map = map_fasttext_to_wav2vec(get_embeddings(), fasttext_folder)
    with open(pickle_path, 'wb') as cache_file:
        pickle.dump({'vector_map': fresh_map}, cache_file)
    print("vector_map created and saved to pickle file.")

    return fresh_map

# Load the cached vector map, or build it and write the cache on first run.
vector_map = check_or_create_vector_map(pickle_path, fasttext_folder)

vector_map loaded from pickle file.


In [14]:
def split_sets(dictionary, unseen_emotion, train_ratio=0.8, seed=420):
    """Split ``dictionary`` into train/test dicts, holding one emotion out.

    Every key whose ``get_emotion_vector`` label equals ``unseen_emotion`` is
    placed in the test set. The remaining keys are shuffled; the first
    ``int(len(dictionary) * train_ratio)`` of them form the training set and
    the rest fill the test set up to the overall test size.

    Args:
        dictionary: Mapping of file name to sample.
        unseen_emotion: Emotion label to exclude from training.
        train_ratio: Fraction of all samples used for training.
        seed: Value passed to ``random.seed`` before shuffling. This reseeds
            the global ``random`` module.

    Returns:
        ``(train_dict, test_dict)``.

    Raises:
        ValueError: If the held-out samples outnumber the test set size, or
            if a key ends up in both sets.
    """
    # Set the random seed for reproducibility
    random.seed(seed)

    # Separate keys for the unseen emotion and other emotions. A set is used
    # for the membership test so the filter stays linear in the key count.
    unseen_keys = [key for key in dictionary if get_emotion_vector(key) == unseen_emotion]
    unseen_lookup = set(unseen_keys)
    filtered_keys = [key for key in dictionary if key not in unseen_lookup]
    random.shuffle(filtered_keys)

    # Overall split sizes are computed on the full dictionary ...
    total_samples = len(dictionary)
    num_train_samples = int(total_samples * train_ratio)
    num_test_samples = total_samples - num_train_samples

    # ... and the held-out samples count towards the test share.
    num_test_samples_from_filtered = num_test_samples - len(unseen_keys)
    if num_test_samples_from_filtered < 0:
        raise ValueError("Not enough samples in the filtered data to maintain the overall split ratio.")

    train_keys = filtered_keys[:num_train_samples]
    test_keys = filtered_keys[num_train_samples:num_train_samples + num_test_samples_from_filtered]

    train_dict = {key: dictionary[key] for key in train_keys}
    test_dict = {key: dictionary[key] for key in test_keys}
    test_dict.update({key: dictionary[key] for key in unseen_keys})

    # Sanity check: no file may appear in both splits.
    overlapping_keys = set(train_dict.keys()) & set(test_dict.keys())
    if overlapping_keys:
        raise ValueError(f"Overlapping filenames found between training and test sets: {overlapping_keys}")

    return train_dict, test_dict

# Example usage
# NOTE(review): the configured `seed` is not passed here, so split_sets()
# falls back to its own default (seed=420).
train_dict, test_dict = split_sets(vector_map, unseen_emotion)

# Check the counts
print("Training samples:", len(train_dict))
print("Test samples:", len(test_dict))

# Ensure no unseen emotion samples in the training set
# (when unseen_emotion is "" no label matches, so both checks print False)
print("Unseen emotion in training set:", any(get_emotion_vector(key) == unseen_emotion for key in train_dict.keys()))
print("Unseen emotion in test set:", any(get_emotion_vector(key) == unseen_emotion for key in test_dict.keys()))
print("Unseen emotion:", unseen_emotion)


Training samples: 1152
Test samples: 288
Unseen emotion in training set: False
Unseen emotion in test set: False
Unseen emotion: 


In [15]:
class AdvancedEmbeddingMapper(nn.Module):
    """Feed-forward mapper from 768-dimensional inputs to 300-dimensional outputs.

    The forward pass applies ``fc1`` (768 -> 128), ``fc2`` (128 -> 300),
    ``Tanh`` and finally dropout with p=0.3.

    NOTE(review): there is no non-linearity between ``fc1`` and ``fc2``, and
    dropout is applied to the final output, so in training mode a share of
    the output coordinates is zeroed before the loss is computed. Confirm
    both are intended.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, 300)
        # Dropout rate raised for stronger regularization.
        self.dropout = nn.Dropout(0.3)
        self.Tanh = nn.Tanh()

    def forward(self, x):
        """Map ``x`` (last dimension 768) to a tensor with last dimension 300."""
        projected = self.fc2(self.fc1(x))
        return self.dropout(self.Tanh(projected))

In [16]:
class CNNEmbeddingMapper(nn.Module):
  """1-D convolutional mapper from 768 input channels to 300 output channels.

  ``forward`` swaps dimensions 1 and 2 of its input before the first
  convolution, so the input's last dimension must be 768. Three
  conv -> batch-norm -> ReLU -> dropout(0.2) stages (768 -> 512 -> 256 -> 128)
  are followed by a final convolution to 300 channels, and the dimensions
  are swapped back before returning. All convolutions use kernel_size=3
  with padding=1.
  """

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv1d(in_channels=768, out_channels=512, kernel_size=3, padding=1)
    self.bn1 = nn.BatchNorm1d(512)
    self.conv2 = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3, padding=1)
    self.bn2 = nn.BatchNorm1d(256)
    self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
    self.bn3 = nn.BatchNorm1d(128)
    self.conv4 = nn.Conv1d(in_channels=128, out_channels=300, kernel_size=3, padding=1)
    self.dropout = nn.Dropout(0.2)
    self.relu = nn.ReLU()

  def forward(self, x):
    """Run the convolution stack on ``x`` and return the 300-channel result."""
    features = x.transpose(1, 2)
    hidden_stages = (
        (self.conv1, self.bn1),
        (self.conv2, self.bn2),
        (self.conv3, self.bn3),
    )
    for conv, norm in hidden_stages:
      features = self.dropout(self.relu(norm(conv(features))))
    return self.conv4(features).transpose(1, 2)

In [17]:
# class RNNEmbeddingMapper(nn.module):
#   def __init__(self):

In [18]:
def create_dataloader(data_dict, batch_size=2, shuffle=True):
    """Wrap a ``{key: (input, target)}`` dict in a DataLoader.

    The first elements of all values are stacked into one input tensor and
    the second elements into one target tensor, in dict iteration order.

    Args:
        data_dict: Mapping whose values are ``(input_tensor, target_tensor)``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples on each iteration.

    Returns:
        ``DataLoader`` over a ``TensorDataset`` of the stacked tensors.
    """
    pairs = list(data_dict.values())
    X = torch.stack([pair[0] for pair in pairs])
    Y = torch.stack([pair[1] for pair in pairs])
    return DataLoader(TensorDataset(X, Y), batch_size=batch_size, shuffle=shuffle)

# Build the loaders. The test loader keeps dataset order (shuffle=False);
# calculate_cosine_similarity() maps batch positions back to test_dict keys.
train_dataloader = create_dataloader(train_dict)
test_dataloader = create_dataloader(test_dict, shuffle=False)

In [19]:
def objective(trial):
  """Build a model for one tuning trial (unfinished stub).

  NOTE(review): ``define_model`` and ``DEVICE`` are not defined in the
  visible cells, so calling this raises NameError, and nothing is returned
  yet. Presumably meant as a hyper-parameter search objective — confirm,
  then complete or remove.
  """

  model = define_model(trial).to(DEVICE)

In [20]:
# NOTE(review): this rebinds the global `model`, which earlier held the wav2vec
# encoder that get_vector_from_audio() reads; after this cell that function
# would call the mapper instead. Consider a distinct name for one of the two.
model = AdvancedEmbeddingMapper()
#CNNmodel = CNNEmbeddingMapper()

# Define loss function and optimizer
# The optimizer is bound to the parameters of the `model` created above.
criterion = nn.CosineEmbeddingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


def train_model(model, train_dataloader, num_epochs=5, optimizer=None, criterion=None):
    """Train ``model`` so its outputs point in the same direction as the targets.

    Each batch's outputs and targets are flattened to ``(batch, -1)`` and
    compared with the criterion using a label of 1 for every sample. The
    mean batch loss is printed after every epoch.

    Args:
        model: Module to train; it is updated in place.
        train_dataloader: Iterable of ``(batch_x, batch_y)`` pairs that also
            supports ``len()``.
        num_epochs: Number of passes over ``train_dataloader``.
        optimizer: Optimizer to step. When omitted, an ``Adam`` optimizer
            with ``lr=0.0001`` is created over ``model.parameters()``, so the
            model passed in is always the one being updated (the function
            no longer depends on a global optimizer bound to another model).
        criterion: Loss called as ``criterion(outputs, targets, labels)``.
            Defaults to ``nn.CosineEmbeddingLoss()``.

    Returns:
        None.
    """
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=0.0001)
    if criterion is None:
        criterion = nn.CosineEmbeddingLoss()

    # Guard against an empty loader so the epoch summary cannot divide by zero.
    num_batches = max(len(train_dataloader), 1)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for batch_x, batch_y in train_dataloader:
            optimizer.zero_grad()
            outputs = model(batch_x)

            # Flatten so every sample is a single vector.
            outputs = outputs.view(outputs.size(0), -1)
            batch_y = batch_y.view(batch_y.size(0), -1)

            # Target of 1 for every pair: the loss pushes outputs towards
            # their targets.
            labels = torch.ones(outputs.size(0)).to(outputs.device)

            loss = criterion(outputs, batch_y, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / num_batches:.4f}')


# Train the mapper on the training split with the default number of epochs.
train_model(model, train_dataloader)
# NOTE(review): `train_dataloader_cnn` is not defined in the visible cells.
#train_model(CNNmodel, train_dataloader_cnn)

Epoch [1/5], Loss: 0.5112
Epoch [2/5], Loss: 0.4770
Epoch [3/5], Loss: 0.4704
Epoch [4/5], Loss: 0.4591
Epoch [5/5], Loss: 0.4470


In [21]:
def evaluate_model_more(model, test_dataloader, train_dataloader):
    """Print loss and regression metrics for ``model`` on ``test_dataloader``.

    The model is switched to eval mode and run without gradients. For each
    batch, the outputs (with dimension 1 squeezed) and targets are flattened
    to ``(batch, -1)`` and scored with the global ``criterion`` using a label
    of 1 per sample. The mean batch loss, MSE, MAE and R² over all test
    samples are printed.

    Args:
        model: Module to evaluate.
        test_dataloader: Iterable of ``(batch_x, batch_y)`` pairs that also
            supports ``len()``.
        train_dataloader: Not used by the function body.

    Returns:
        None.
    """
    model.eval()
    batch_losses = []
    target_chunks = []
    prediction_chunks = []

    with torch.no_grad():
        for inputs, targets in test_dataloader:
            # Drop the singleton dimension 1 if present.
            predictions = model(inputs).squeeze(1)

            # Label 1 per sample: each pair is scored as "should be similar".
            positive = torch.ones(predictions.size(0)).to(predictions.device)

            predictions = predictions.view(predictions.size(0), -1)
            targets = targets.view(targets.size(0), -1)

            batch_losses.append(criterion(predictions, targets, positive).item())
            target_chunks.append(targets)
            prediction_chunks.append(predictions)

    # Average of the per-batch losses.
    test_loss = sum(batch_losses) / len(test_dataloader)

    all_targets = torch.cat(target_chunks).cpu().numpy()
    all_predictions = torch.cat(prediction_chunks).cpu().numpy()

    mse = mean_squared_error(all_targets, all_predictions)
    mae = mean_absolute_error(all_targets, all_predictions)
    r2 = r2_score(all_targets, all_predictions)

    print(f'Test Loss: {test_loss:.4f}')
    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'Mean Absolute Error (MAE): {mae:.4f}')
    print(f'R-squared (R²): {r2:.4f}')

# Report loss and regression metrics on the test split.
evaluate_model_more(model, test_dataloader, train_dataloader)

Test Loss: 0.3431
Mean Squared Error (MSE): 0.0101
Mean Absolute Error (MAE): 0.0789
R-squared (R²): -2.7004


In [22]:
def cosine_similarity(model_output, target):
  """Return the cosine similarity of two tensors along dimension 0.

  Uses ``nn.CosineSimilarity`` with ``dim=0`` and ``eps=1e-6``; the result
  is a tensor.
  """
  return nn.CosineSimilarity(dim=0, eps=1e-6)(model_output, target)

# def evaluate_cosine():

#   model = torch.tensor(model(batch_x).squeeze(1))
#   target = torch.tensor(load_fasttext_embedding(unseen_emotion, fasttext_folder))

# cosine_similarity(model, target)

In [23]:
def custom_cosine_similarity(tensor1, tensor2, eps=1e-8):
    """Return the cosine similarity of two tensors as a Python float.

    Both tensors are flattened to 1-D first, so they must hold the same
    number of elements.

    Args:
        tensor1: First tensor.
        tensor2: Second tensor.
        eps: Lower bound applied to the product of the two L2 norms. It
            keeps an all-zero input from producing NaN (the result is 0.0
            instead) and leaves ordinary inputs unaffected.

    Returns:
        float: ``dot(t1, t2) / max(||t1|| * ||t2||, eps)``.
    """
    # reshape (unlike view) also accepts non-contiguous tensors such as
    # transposed slices.
    flat1 = tensor1.reshape(-1)
    flat2 = tensor2.reshape(-1)

    dot_product = torch.dot(flat1, flat2)
    norm_product = torch.norm(flat1, p=2) * torch.norm(flat2, p=2)

    return (dot_product / norm_product.clamp_min(eps)).item()

In [24]:
# def evaluate_model_cosine(model, test_dataloader):
#     model.eval()
#     all_cosine_similarities = []

#     with torch.no_grad():
#         for batch_x, batch_y in test_dataloader:
#             outputs = model(batch_x)  # Pass batch_x to the model
#             cosine_sim = custom_cosine_similarity(outputs, batch_y)
#             all_cosine_similarities.extend(cosine_sim.cpu().numpy().tolist())  # Collect cosine similarities as scalar floats
#     print
#     # Compute average cosine similarity
#     avg_cosine_similarity = np.mean(all_cosine_similarities)

#     print(f'Average Cosine Similarity: {avg_cosine_similarity:.4f}')

# # Example usage
# # Assuming you have the model and dataloaders defined
# evaluate_model_cosine(model, test_dataloader)

In [25]:
def find_most_similar_emotion(predicted_vector, emotion_vectors):
    """Find the emotion whose vector is most cosine-similar to a prediction.

    Args:
        predicted_vector: Tensor to compare against every reference vector.
        emotion_vectors: dict mapping emotion name to its reference tensor.

    Returns:
        ``(emotion_name, similarity)`` for the highest similarity as chosen
        by ``np.argmax``; ``similarity`` is a NumPy scalar.
    """
    emotions = list(emotion_vectors.keys())
    similarities = np.array([
        custom_cosine_similarity(predicted_vector, emotion_vectors[name])
        for name in emotions
    ])

    best = np.argmax(similarities)
    return emotions[best], similarities[best]

In [26]:
# def calculate_cosine_similarity(model, test_dataloader, test_dict, emotion_vectors):
#     model.eval()
#     results = {}
#     cosine_similarity_dict = {}
#     correct_predictions = 0

#     with torch.no_grad():
#         indices_list = list(test_dataloader.batch_sampler)

#         for batch_idx, (batch_x, batch_y) in enumerate(test_dataloader):
#             outputs = model(batch_x)  # Pass batch_x to the model
#             batch_indices = indices_list[batch_idx]  # Get the corresponding indices for the current batch

#             for i, output in enumerate(outputs):
#                 global_index = batch_indices[i]
#                 filename = list(test_dict.keys())[global_index]
#                 cosine_sim = custom_cosine_similarity(output, batch_y[i])  # Calculate mean cosine similarity for the current sample
#                 predicted_emotion, similarity_score = find_most_similar_emotion(output, emotion_vectors)
#                 actual_emotion = get_emotion_vector(filename)

#                 results[filename] = {
#                     'cosine_similarity': cosine_sim,
#                     'predicted_emotion': predicted_emotion,
#                     'actual_emotion': actual_emotion
#                 }
#                 if predicted_emotion == actual_emotion:
#                                 correct_predictions += 1
#     print(f'Number of correct labels: {correct_predictions}')
#     return results

# emotion_vectors = load_emotion_vectors(fasttext_folder)
# results = calculate_cosine_similarity(model, test_dataloader, test_dict, emotion_vectors)

# # Print the results
# for filename, result in results.items():
#     print(f'{filename}: Cosine Similarity: {result["cosine_similarity"]}, Predicted Emotion: {result["predicted_emotion"]}, Actual Emotion: {result["actual_emotion"]}')


In [27]:
def calculate_cosine_similarity(model, test_dataloader, test_dict, emotion_vectors):
  """Score every test sample and predict its emotion by nearest reference vector.

  Batches are built directly from the loader's ``batch_sampler`` indices, so
  the dataset index of every sample is known exactly and is used to look up
  its file name in ``test_dict``. This keeps file names and samples aligned
  even for a shuffling loader, where a separate pass over the sampler would
  yield a different order than the loader's own iteration.

  Args:
    model: Module mapping inputs to predicted embedding vectors.
    test_dataloader: DataLoader over a map-style dataset whose sample order
      matches the key order of ``test_dict``.
    test_dict: dict keyed by file name, in dataset order.
    emotion_vectors: dict mapping emotion name to its reference tensor.

  Returns:
    dict mapping file name to a dict with ``'cosine_similarity'`` (prediction
    vs. its own target), ``'predicted_emotion'``, ``'actual_emotion'`` and
    ``'emotion_similarity'`` (prediction vs. the predicted emotion's vector).
    The number of correct predictions is printed.
  """
  model.eval()
  results = {}
  correct_predictions = 0

  # Built once instead of once per sample.
  filenames = list(test_dict.keys())
  dataset = test_dataloader.dataset
  collate = test_dataloader.collate_fn

  with torch.no_grad():
      for batch_indices in test_dataloader.batch_sampler:
          batch_x, batch_y = collate([dataset[index] for index in batch_indices])
          outputs = model(batch_x)

          for i, output in enumerate(outputs):
              filename = filenames[batch_indices[i]]
              cosine_sim = custom_cosine_similarity(output, batch_y[i])
              predicted_emotion, similarity_score = find_most_similar_emotion(output, emotion_vectors)
              actual_emotion = get_emotion_vector(filename)

              results[filename] = {
                  'cosine_similarity': cosine_sim,
                  'predicted_emotion': predicted_emotion,
                  'actual_emotion': actual_emotion,
                  # Same value the search above already computed for the winner.
                  'emotion_similarity': float(similarity_score)
              }
              if predicted_emotion == actual_emotion:
                  correct_predictions += 1

  print(f'Number of correct labels: {correct_predictions}')
  return results
# Load the reference emotion embeddings and score every test sample against them.
emotion_vectors = load_emotion_vectors(fasttext_folder)
results = calculate_cosine_similarity(model, test_dataloader, test_dict, emotion_vectors)

# Print the results
# One line per test file: similarity to its own target, predicted vs. actual
# label, and similarity to the predicted label's vector.
for filename, result in results.items():
    print(f'{filename}: Cosine Similarity: {result["cosine_similarity"]}, Predicted Emotion: {result["predicted_emotion"]}, '
          f'Actual Emotion: {result["actual_emotion"]}, Emotion Similarity: {result["emotion_similarity"]}')

Number of correct labels: 94
03-01-03-01-02-01-06.wav: Cosine Similarity: 0.6345957517623901, Predicted Emotion: fearful, Actual Emotion: happy, Emotion Similarity: 0.7299978733062744
03-01-03-02-02-02-11.wav: Cosine Similarity: 0.5706480145454407, Predicted Emotion: fearful, Actual Emotion: happy, Emotion Similarity: 0.7275991439819336
03-01-02-02-02-02-19.wav: Cosine Similarity: 0.8153132200241089, Predicted Emotion: calm, Actual Emotion: calm, Emotion Similarity: 0.8153132200241089
03-01-02-02-01-02-03.wav: Cosine Similarity: 0.6818336248397827, Predicted Emotion: angry, Actual Emotion: calm, Emotion Similarity: 0.8478820323944092
03-01-01-01-01-02-14.wav: Cosine Similarity: 0.33509811758995056, Predicted Emotion: surprised, Actual Emotion: neutral, Emotion Similarity: 0.6785995960235596
03-01-05-01-01-02-06.wav: Cosine Similarity: 0.7810158729553223, Predicted Emotion: angry, Actual Emotion: angry, Emotion Similarity: 0.7810158729553223
03-01-04-01-02-02-24.wav: Cosine Similarity: 