In [15]:
import os.path

# DATA_DIR = "/tmp/semeval24_task3/"

import json
from pprint import pprint
# JSON split that is loaded into `train_data` (used below to count emotion frequencies).
TRAIN_DATA_FILEPATH = "/tmp/semeval24_task3/SemEval-2024_Task3/official_data/Training_data/text/training.json"
# JSON split that predictions are generated for. NOTE: this currently points at a file inside
# Training_data; the official evaluation file is the commented-out alternative below.
TEST_DATA_FILEPATH = "/tmp/semeval24_task3/SemEval-2024_Task3/official_data/Training_data/text/testing.json"
# TEST_DATA_FILEPATH = "/tmp/semeval24_task3/SemEval-2024_Task3/official_data/Evaluation_Data/Subtask_2_test.json"
from encoder_paths import *

In [16]:
import json

# Parse both JSON splits into memory. `test_data` is later annotated in place with the
# predicted emotion-cause pairs; `train_data` feeds the emotion frequency counts.
with open(TEST_DATA_FILEPATH) as f:
    test_data = json.load(f)
with open(TRAIN_DATA_FILEPATH) as f:
    train_data = json.load(f)

In [17]:
import torch
import pickle

class YourAudioEncoder():
    """Lookup table over precomputed audio embeddings.

    The table is unpickled once at construction time and is indexed by the
    clip name with everything from the first "." onwards removed.
    """

    def __init__(self, audio_embeddings_path):
        """Unpickle the embedding table stored at ``audio_embeddings_path``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(audio_embeddings_path, "rb") as handle:
            self.audio_embeddings = pickle.load(handle)

    def lmao(self, audio_name):
        """Return the stored embedding for ``audio_name`` as a torch tensor.

        Raises ``KeyError`` when the clip is not in the table.
        """
        clip_id, _, _ = audio_name.partition(".")
        # This clip is special-cased to a 768-dim zero vector instead of a lookup
        # (presumably its embedding is missing from the table -- TODO confirm).
        if clip_id == "dia2020utt6":
            return torch.zeros(768)
        # squeeze() drops any size-1 axes of the stored array before conversion.
        return torch.from_numpy(self.audio_embeddings[clip_id].squeeze())
    
class YourVideoEncoder():
    """Lookup table over precomputed video embeddings, pooled to one vector per clip."""

    def __init__(self, video_embeddings_path):
        """Unpickle the embedding table stored at ``video_embeddings_path``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(video_embeddings_path, "rb") as handle:
            self.video_embeddings = pickle.load(handle)

    def lmao(self, video_name):
        """Return the mean-pooled embedding for ``video_name`` as a torch tensor.

        The name is used as the lookup key unchanged (extension included).
        """
        # View the stored array as 16 rows (presumably one per sampled frame --
        # TODO confirm) and average across them.
        per_row = self.video_embeddings[video_name].reshape((16, -1))
        pooled = per_row.mean(axis=0)
        return torch.from_numpy(pooled)

class YourTextEncoder():
    """Lookup table over precomputed text embeddings, keyed by video name."""

    def __init__(self, text_embeddings_path):
        """Unpickle the embedding table stored at ``text_embeddings_path``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(text_embeddings_path, "rb") as handle:
            self.text_embeddings = pickle.load(handle)

    def lmao(self, video_name):
        """Return the stored text embedding for ``video_name`` as a torch tensor."""
        stored = self.text_embeddings[video_name]
        return torch.from_numpy(stored)


In [18]:
class EmotionIndexer:
    """Maps emotion labels to integer indices (and back) and derives class weights.

    Attributes:
        emotion_to_index: label -> index mapping; 'pad' (index 7) marks padding.
        index_to_emotion: inverse of ``emotion_to_index``.
        emotion_freq: per-emotion utterance counts from the last
            ``compute_weights`` call (seven real emotions, 'pad' excluded).
        weights: inverse-frequency weight per emotion, or ``None`` before
            ``compute_weights`` has been called.
    """

    def __init__(self):
        self.emotion_to_index = {
            'joy': 0,
            'sadness': 1,
            'anger': 2,
            'neutral': 3,
            'surprise': 4,
            'disgust': 5,
            'fear': 6,
            'pad': 7,
        }
        # One counter per real emotion; the 'pad' label is deliberately not counted.
        self.emotion_freq = [0]*7
        self.weights = None

        self.index_to_emotion = {index: emotion for emotion, index in self.emotion_to_index.items()}

    def emotion_to_idx(self, emotion):
        """Return the index for ``emotion``, or ``None`` if the label is unknown."""
        return self.emotion_to_index.get(emotion, None)

    def idx_to_emotion(self, index):
        """Return the label for ``index``, or ``None`` if the index is unknown."""
        return self.index_to_emotion.get(index, None)

    def compute_weights(self, data):
        """Count emotion labels in ``data`` and store inverse-frequency weights.

        ``data`` is an iterable of dicts with a 'conversation' list whose
        items carry an 'emotion' label. Counts are rebuilt from scratch on
        every call, so calling this twice (e.g. re-running the cell) does not
        double-count. The counts are printed and stored in ``emotion_freq``;
        the weights are stored in ``weights``.

        Raises:
            KeyError: if an utterance carries a label missing from the mapping.
            IndexError: if an utterance is labelled 'pad' (no counter exists).
        """
        counts = [0] * len(self.emotion_freq)
        for conversation in data:
            for utterance in conversation['conversation']:
                counts[self.emotion_to_index[utterance['emotion']]] += 1
        self.emotion_freq = counts
        print(self.emotion_freq)
        # An emotion that never occurs gets weight 0.0 instead of raising
        # ZeroDivisionError; it cannot contribute to a weighted loss anyway.
        self.weights = [1/freq if freq else 0.0 for freq in counts]

# Example usage
# Module-level indexer shared by later cells; compute_weights() also prints the
# per-emotion utterance counts found in the training split.
indexer = EmotionIndexer()
indexer.compute_weights(train_data)

[2059, 1024, 1472, 5282, 1647, 369, 326]


In [19]:
import torch
import json
import os
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
from torchvision.transforms import functional as F
from PIL import Image
import numpy as np

In [20]:
class ConversationDataset(Dataset):
    """Conversation-level dataset of precomputed audio / video / text embeddings.

    Each item is one conversation, padded with zero vectors or truncated to
    ``max_seq_len`` utterances. All three encoders are queried through their
    ``lmao(name)`` method and must return 1-D torch tensors.
    """

    def __init__(self, json_file, audio_encoder, video_encoder, text_encoder, max_seq_len):
        """Load ``json_file`` and remember the encoders and target sequence length."""
        self.max_seq_len = max_seq_len
        self.data = self.load_data(json_file)
        self.audio_encoder = audio_encoder
        self.video_encoder = video_encoder
        self.text_encoder = text_encoder

    def load_data(self, json_file):
        """Return the parsed contents of ``json_file``."""
        with open(json_file, 'r') as f:
            data = json.load(f)
        return data

    def __len__(self):
        """Number of conversations."""
        return len(self.data)

    def _fit_to_max_len(self, embeddings):
        """Truncate or zero-pad a list of 1-D tensors to ``max_seq_len`` and stack it.

        Raises ``IndexError`` if padding is required but the list is empty,
        because the shape of the padding vector is taken from the first entry.
        """
        embeddings = embeddings[:self.max_seq_len]
        pad_length = self.max_seq_len - len(embeddings)
        if pad_length > 0:
            embeddings = embeddings + [torch.zeros_like(embeddings[0])] * pad_length
        return torch.stack(embeddings)

    def __getitem__(self, idx):
        """Return ``{'audio', 'video', 'text'}`` tensors of shape (max_seq_len, dim).

        No label, mask or cause tensors are produced: the previous version
        computed them from always-empty inputs, never returned them, and
        needed a module-level ``indexer`` to do so, so that dead code was
        removed.
        """
        conversation = self.data[idx]['conversation']
        video_names = [utterance['video_name'] for utterance in conversation]

        # Audio clips use the video name with 'mp4' swapped for 'wav'; text
        # embeddings are looked up by the video name as well.
        audio_embeddings = [self.audio_encoder.lmao(name.replace('mp4', 'wav')) for name in video_names]
        video_embeddings = [self.video_encoder.lmao(name) for name in video_names]
        text_embeddings = [self.text_encoder.lmao(name) for name in video_names]

        return {
            'audio': self._fit_to_max_len(audio_embeddings),
            'video': self._fit_to_max_len(video_embeddings),
            'text': self._fit_to_max_len(text_embeddings),
        }
# Example usage
# Build the three embedding lookups, wrap the test split in a dataset and batch it.

# Define your data paths
# The *_EMBEDDINGS_FILEPATH names used below are not defined in this notebook; presumably
# they come from `from encoder_paths import *` -- TODO confirm. The commented paths are
# earlier alternatives.
# AUDIO_EMBEDDINGS_FILEPATH = "/tmp/semeval24_task3/og_paper_embeddings/audio_embedding_6373.npy"
# VIDEO_EMBEDDINGS_FILEPATH = "/tmp/semeval24_task3/og_paper_embeddings/video_embedding_4096.npy"
# TEXT_EMBEDDINGS_FILEPATH = os.path.join(DATA_DIR, "text_embeddings", "text_embeddings_bert_base.pkl")

audio_encoder = YourAudioEncoder(AUDIO_EMBEDDINGS_FILEPATH)
video_encoder = YourVideoEncoder(VIDEO_EMBEDDINGS_FILEPATH)
text_encoder = YourTextEncoder(TEXT_EMBEDDINGS_FILEPATH)
max_seq_len = 35  # Adjust this according to your needs

# Create the dataset and dataloader
test_dataset = ConversationDataset(TEST_DATA_FILEPATH, audio_encoder, video_encoder, text_encoder, max_seq_len)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Example of iterating through batches
# This loop only unpacks each batch; nothing is computed from the tensors here.
for batch in test_dataloader:
    audio = batch['audio']  # Shape: (batch_size, max_seq_len, audio_embedding_size)
    video = batch['video']  # Shape: (batch_size, max_seq_len, video_embedding_size)
    text = batch['text']    # Shape: (batch_size, max_seq_len, text_embedding_size)

In [21]:
import torch
import torch.nn as nn

class EmotionClassifier(nn.Module):
    """BiLSTM sequence tagger over concatenated audio, video and text embeddings.

    ``input_size`` is the width of the concatenated embedding. The BiLSTM uses
    ``input_size // 2`` units per direction, so an even ``input_size`` is
    needed for its output to match the following linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout, num_emotions, embedding_dropout=0.2):
        super().__init__()

        # Independent dropout per modality, applied before fusion.
        self.audio_dropout = nn.Dropout(embedding_dropout)
        self.video_dropout = nn.Dropout(embedding_dropout)
        self.text_dropout = nn.Dropout(embedding_dropout)

        self.relu = nn.ReLU()

        self.bilstm = nn.LSTM(
            input_size,
            input_size // 2,
            num_layers,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
        )

        # Two-layer head: BiLSTM features -> hidden_size -> num_emotions.
        self.linear = nn.Linear(input_size, hidden_size)
        self.final_linear = nn.Linear(hidden_size, num_emotions)

    def forward(self, audio_encoding, video_encoding, text_encoding):
        """Return per-utterance class probabilities, shape (batch, seq, num_emotions).

        The three inputs are cast to float32, concatenated along dim 2, and the
        result is already softmax-normalised over the last dimension.
        """
        modalities = (
            self.audio_dropout(audio_encoding.float()),
            self.video_dropout(video_encoding.float()),
            self.text_dropout(text_encoding.float()),
        )
        fused = torch.cat(modalities, dim=2)

        sequence_features, _ = self.bilstm(fused)

        hidden = self.relu(self.linear(sequence_features))
        scores = self.final_linear(hidden)
        return torch.softmax(scores, dim=2)

In [22]:
import numpy as np

def generate_positional_embeddings(dimension, count):
    """Build a random table with ``count + 1`` rows of width ``dimension``.

    Row 0 is all zeros; each remaining row is drawn from a normal
    distribution (mean 0.0, std 0.1) using numpy's global random state.
    Returns a float64 array of shape ``(count + 1, dimension)``.
    """
    rows = [np.zeros(dimension)]
    for _ in range(count):
        rows.append(np.random.normal(loc=0.0, scale=0.1, size=dimension))
    return np.stack(rows)

In [23]:
import torch.nn as nn

class EmotionCauseDetector(nn.Module):
    """Binary scorer for (emotion utterance, candidate cause utterance) pairs.

    The two utterance embeddings are concatenated with a positional embedding
    selected by the distance between the utterances, then passed through a
    two-layer MLP that emits one raw score per pair (no sigmoid applied).

    Args:
        utterance_embedding_size: width of each utterance embedding.
        device: device the positional embedding table is created on.
        hidden_dimension: width of the hidden MLP layer.
        positional_embeddings_dimension: width of each positional embedding.
        dropout: dropout probability applied to each of the three inputs.
        *args, **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        utterance_embedding_size,
        device,
        hidden_dimension=4096,
        positional_embeddings_dimension=200,
        dropout=0.2,
        *args, **kwargs,
    ):
        super().__init__()

        self.hidden_dimension = hidden_dimension

        # Table has 201 rows (row 0 is zeros), so distances 0..200 are valid indices.
        positional_embeddings = generate_positional_embeddings(positional_embeddings_dimension, 200)
        # Registered as a buffer so that module.to(...) moves the table together
        # with the weights. persistent=False keeps it out of state_dict, so
        # existing checkpoints still load with strict key matching, and plain
        # attribute assignment of a new tensor keeps working.
        self.register_buffer(
            'positional_embeddings',
            torch.from_numpy(positional_embeddings).to(device).float(),
            persistent=False,
        )

        self.non_neutral_dropout = nn.Dropout(dropout)
        self.candidate_cause_dropout = nn.Dropout(dropout)
        self.distance_dropout = nn.Dropout(dropout)

        self.linear1 = nn.Linear(utterance_embedding_size*2 + positional_embeddings_dimension, hidden_dimension)
        self.linear1_activation = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dimension, 1)

    def forward(self, non_neutral_utterances, candidate_cause_utterances, distances):
        """Score each pair; returns a tensor of shape (num_pairs, 1).

        ``distances`` is an integer tensor used to index the positional
        embedding table; the other two inputs are (num_pairs, embedding_size).
        """
        positional_embedding = self.positional_embeddings[distances].float()

        non_neutral_utterances = self.non_neutral_dropout(non_neutral_utterances)
        candidate_cause_utterances = self.candidate_cause_dropout(candidate_cause_utterances)
        positional_embedding = self.distance_dropout(positional_embedding)

        embeddings = torch.cat(
            (non_neutral_utterances, candidate_cause_utterances, positional_embedding), dim=1
        ).float()

        return self.linear2(
            self.linear1_activation(
                self.linear1(embeddings)
            )
        )

In [24]:
import torch
import torch.nn as nn
from TorchCRF import CRF

class EmotionClassifierCRF(nn.Module):
    """Emotion tagger: linear projection -> BiLSTM -> linear emissions -> CRF.

    The audio, video and text embeddings are concatenated (total width
    ``input_size``), projected to ``hidden_size``, run through a BiLSTM with
    ``hidden_size // 2`` units per direction, and mapped to ``num_emotions``
    emission scores that feed a ``TorchCRF.CRF`` layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout, num_emotions, embedding_dropout=0.2):
        super().__init__()

        # Independent dropout per modality, applied before fusion.
        self.audio_dropout = nn.Dropout(embedding_dropout)
        self.video_dropout = nn.Dropout(embedding_dropout)
        self.text_dropout = nn.Dropout(embedding_dropout)

        self.first_linear = nn.Linear(input_size, hidden_size, dtype=torch.float32)
        self.relu = nn.ReLU()

        self.second_linear_layer = nn.Linear(hidden_size, hidden_size, dtype=torch.float32)
        self.bilstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers,
                              dropout=dropout, bidirectional=True, batch_first=True)

        self.linear = nn.Linear(hidden_size, num_emotions)
        self.crf_model = CRF(num_emotions)

    def generate_emissions(self, audio_encoding, video_encoding, text_encoding):
        """Return unnormalised emission scores of shape (batch, seq, num_emotions)."""
        audio_encoding = self.audio_dropout(audio_encoding.float())
        video_encoding = self.video_dropout(video_encoding.float())
        text_encoding = self.text_dropout(text_encoding.float())

        combined_encoding = torch.cat((audio_encoding, video_encoding, text_encoding), dim=2)

        combined_encoding = self.first_linear(combined_encoding)
        combined_encoding = self.relu(combined_encoding)
        combined_encoding = self.second_linear_layer(combined_encoding)

        lstm_output, _ = self.bilstm(combined_encoding)

        # No softmax here: the CRF consumes raw scores.
        return self.linear(lstm_output)

    def _full_mask(self, emissions):
        """All-True (batch, seq) mask on the same device as ``emissions``."""
        batch_size, seq_len, _ = emissions.shape
        return torch.ones((batch_size, seq_len), dtype=torch.bool, device=emissions.device)

    def loss(self, audio_encoding, video_encoding, text_encoding, emotion_labels, padding):
        """Return the negated CRF output for the given label sequences.

        NOTE(review): ``padding`` is accepted for interface compatibility but
        is not used. As before, every position is treated as valid through an
        all-True mask. Confirm whether padded positions should be masked out.
        """
        emissions = self.generate_emissions(audio_encoding, video_encoding, text_encoding)
        # The mask is built on the emissions' device rather than a hard-coded
        # 'cuda', so the model also works on CPU or on a non-default GPU.
        mask = self._full_mask(emissions)
        return -self.crf_model(emissions, emotion_labels, mask)

    def predict(self, audio_encoding, video_encoding, text_encoding):
        """Return the CRF's Viterbi decoding of the inputs (every position unmasked)."""
        emissions = self.generate_emissions(audio_encoding, video_encoding, text_encoding)
        return self.crf_model.viterbi_decode(emissions, self._full_mask(emissions))

    def init_hidden(self, batch_size=1):
        """Return a random ``(h0, c0)`` pair shaped for ``self.bilstm``.

        Each tensor has shape (num_layers * num_directions, batch_size,
        lstm_hidden_size) and lives on the model's device. The previous
        version read an attribute that was never set and always raised.
        """
        num_directions = 2 if self.bilstm.bidirectional else 1
        shape = (self.bilstm.num_layers * num_directions, batch_size, self.bilstm.hidden_size)
        device = next(self.parameters()).device
        return (torch.randn(shape, device=device), torch.randn(shape, device=device))

In [25]:
from torch.optim import Adam
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from collections import defaultdict
import numpy as np

# Device used for every model and tensor in this cell.
DEVICE = 'cuda:1'
# Utterances predicted as 'neutral' are not treated as emotion utterances.
NEUTRAL_IDX = indexer.emotion_to_idx('neutral')

# Checkpoints are loaded onto the CPU first (map_location) so loading does not depend on
# the device they were saved from; the models are moved to DEVICE afterwards.
emotion_classifier = EmotionClassifierCRF(input_size=768*3, hidden_size=2000, num_emotions=7, dropout=0.3, num_layers=4)
emotion_classifier.load_state_dict(torch.load('/tmp/semeval24_task3/baseline_models/emotion_models/emotion_model_59.pt', map_location='cpu'))

cause_classifier = EmotionClassifier(input_size=768*3, hidden_size=2000, num_emotions=2, dropout=0.3, num_layers=3)
cause_classifier.load_state_dict(torch.load('/tmp/semeval24_task3/baseline_models/cause_models/best_cause_model.pt', map_location='cpu'))

emotion_cause_detector = EmotionCauseDetector(utterance_embedding_size=768*3, device=DEVICE, hidden_dimension=2000)
emotion_cause_detector.load_state_dict(torch.load('/tmp/semeval24_task3/baseline_models/pairing_models/paring_model_best_model.pt', map_location='cpu'))

# eval() switches off dropout (embedding dropout and the LSTMs' inter-layer dropout).
# Without it the models stay in training mode and predictions are randomly perturbed.
emotion_classifier.to(DEVICE).eval()
cause_classifier.to(DEVICE).eval()
emotion_cause_detector.to(DEVICE).eval()

# The pairing model's positional embedding table is not part of its checkpoint; restore the saved one.
positional_embeddings = np.load('/tmp/semeval24_task3/baseline_models/pairing_models/pairing_model_pos_embeds_best_model.npy')
emotion_cause_detector.positional_embeddings = torch.from_numpy(positional_embeddings).to(DEVICE).float()
print(AUDIO_EMBEDDINGS_FILEPATH)
print(TEXT_EMBEDDINGS_FILEPATH)
print(VIDEO_EMBEDDINGS_FILEPATH)
emotion_cause_pairs = defaultdict(list)

# Inference only: no_grad() avoids building autograd graphs for every forward pass.
with torch.no_grad():
    for conversation_idx, conversation in tqdm(enumerate(test_data), total=len(test_data)):
        conversation_id = conversation['conversation_ID']

        # All three embedding tables are addressed through the utterance's video name
        # (audio with 'mp4' swapped for 'wav').
        video_names = [utterance['video_name'] for utterance in conversation['conversation']]

        audio_embeddings = torch.stack([audio_encoder.lmao(name.replace('mp4', 'wav')) for name in video_names])
        video_embeddings = torch.stack([video_encoder.lmao(name) for name in video_names])
        text_embeddings = torch.stack([text_encoder.lmao(name) for name in video_names])

        # Add a batch dimension of 1: each conversation is processed on its own.
        audio_embeddings = audio_embeddings.unsqueeze(0).to(DEVICE)
        video_embeddings = video_embeddings.unsqueeze(0).to(DEVICE)
        text_embeddings = text_embeddings.unsqueeze(0).to(DEVICE)

        # Stage 1: emotion per utterance; keep the non-neutral ones as (position, emotion index).
        predicted_emotions = emotion_classifier.predict(audio_embeddings, video_embeddings, text_embeddings)[0]
        candidate_utterances = [(idx, emotion) for idx, emotion in enumerate(predicted_emotions) if emotion != NEUTRAL_IDX]

        # Stage 2: utterances flagged as potential causes (class 1).
        cause_scores = cause_classifier(audio_embeddings, video_embeddings, text_embeddings).squeeze(0)
        predicted_causes = torch.argmax(cause_scores, dim=1).tolist()
        candidate_causes = [idx for idx, cause in enumerate(predicted_causes) if cause == 1]

        # One fused audio|video|text vector per utterance, built once instead of per pair.
        utterance_features = torch.cat((audio_embeddings[0], video_embeddings[0], text_embeddings[0]), dim=1)

        # Stage 3: score every (emotion utterance, candidate cause) pair.
        for utterance_idx, emotion_idx in candidate_utterances:
            utterance_embedding = utterance_features[utterance_idx].unsqueeze(0)
            for cause_idx in candidate_causes:
                cause_embedding = utterance_features[cause_idx].unsqueeze(0)
                distance = torch.tensor([abs(utterance_idx - cause_idx)], device=DEVICE)

                prediction = emotion_cause_detector(utterance_embedding, cause_embedding, distance)
                prediction = torch.sigmoid(prediction).cpu().item()
                if prediction >= 0.5:
                    # Utterance numbers in the output are 1-based positions in the conversation.
                    emotion_cause_pairs[conversation_id].append((f"{utterance_idx+1}_{indexer.idx_to_emotion(emotion_idx)}", f"{cause_idx+1}"))
        # Accessing the defaultdict also yields an empty list for conversations without pairs.
        test_data[conversation_idx]['emotion-cause_pairs'] = emotion_cause_pairs[conversation_id]

/tmp/semeval24_task3/audio_embeddings/audio_embeddings_microsoft_wavlm-base-plus-sd.pkl
/tmp/semeval24_task3/text_embeddings/text_embeddings_roberta_base_emotion.pkl
/tmp/semeval24_task3/video_embeddings/final_embeddings.pkl


100%|██████████| 138/138 [00:03<00:00, 41.14it/s]


In [26]:
# Write the test split, now carrying an 'emotion-cause_pairs' entry per conversation,
# to the prediction file in the current working directory.
with open('Subtask_2_pred.json', 'w') as f:
    json.dump(test_data, f, indent=4)

In [27]:
# Add the prediction file to the bilstm+crf.zip archive (shell command run from the notebook).
!zip bilstm+crf.zip Subtask_2_pred.json

updating: Subtask_2_pred.json (deflated 90%)


In [28]:
!pwd

/home2/suyash.mathur/semeval24/task3/BiLSTM+CRF
