- Importing Libraries

In [11]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
from sklearn.metrics import f1_score

### Task 1
- Loading validation embeddings for task 1

In [12]:
# Load the embeddings from the validation set.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that come from a trusted source.
val_embeddings_file = "val_embeddings.pkl"
with open(val_embeddings_file, "rb") as embeddings_fh:  # closed on exit, even on error
    val_embeddings_loaded = pickle.load(embeddings_fh)

# Load the labels from the validation set
task1_val_labels = "task1_labels_dev.pkl"
with open(task1_val_labels, "rb") as labels_fh:
    task1_val_labels_loaded = pickle.load(labels_fh)

In [13]:
# Encode the labels by a fixed mapping
mapping = {
    "-1": 0,
    "sadness": 1,
    "joy": 2,
    "fear": 3,
    "anger": 4,
    "surprise": 5,
    "disgust": 6,
    "neutral": 7
}

# Use ONE key order for both lists. Iterating each dict separately only pairs
# labels with the right embeddings if both pickles happen to share the same
# keys in the same insertion order; a mismatch would go unnoticed.
val_keys = list(task1_val_labels_loaded.keys())

# Fail loudly if a labelled key has no embeddings instead of misaligning data.
missing_keys = [key for key in val_keys if key not in val_embeddings_loaded]
if missing_keys:
    raise KeyError(
        f"No validation embeddings found for {len(missing_keys)} labelled key(s), "
        f"e.g. {missing_keys[0]!r}"
    )

# Convert the labels to integers (one array per key)
val_labels = [
    np.array([mapping[str(label)] for label in task1_val_labels_loaded[key]])
    for key in val_keys
]

# Collect the embeddings in the same key order as the labels
val_embeddings = [val_embeddings_loaded[key] for key in val_keys]

### Dataset Class and Dataloader for Validation Set

In [14]:
class EmotionDataset(Dataset):
    """Dataset that pairs the i-th embeddings entry with the i-th labels entry."""

    def __init__(self, embeddings, labels):
        # Both containers are stored as given and later indexed by position.
        self.labels = labels          # List of label arrays
        self.embeddings = embeddings  # List of embedding matrices

    def __len__(self):
        """Return the number of items, taken from the embeddings container."""
        n_items = len(self.embeddings)
        return n_items

    def __getitem__(self, idx):
        """Return ``(embedding_tensor, label_tensor)`` for position ``idx``.

        The labels are converted to ``torch.long``; the embeddings keep the
        dtype that ``torch.tensor`` infers from the stored data.
        """
        features = torch.tensor(self.embeddings[idx])
        targets = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, targets
    
def collate_fn(batch):
    """Pad a batch of ``(embeddings, labels)`` pairs to a common length.

    Embeddings are padded with 0 and labels with -1, both batch-first, so
    padded label positions can be recognised and skipped downstream.

    Returns:
        Tuple ``(embeddings_pad, labels_pad)``.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    seqs, tags = zip(*batch)
    padded_pair = (
        pad(seqs, batch_first=True, padding_value=0),
        pad(tags, batch_first=True, padding_value=-1),  # Use -1 for padding
    )
    return padded_pair

# Wrap the validation lists in a dataset, then batch them in their stored
# order (no shuffling); collate_fn pads each batch to a common length.
val_dataset = EmotionDataset(embeddings=val_embeddings, labels=val_labels)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=False,
)

## Model - M1 (Task 1)

In [15]:
import torch.nn as nn

class RNNModel(nn.Module):
    """RNN followed by a linear layer applied at every time step.

    Args:
        input_size: Feature size of each input step.
        hidden_size: Size of the RNN hidden state.
        output_size: Size of the linear layer's output per step.
        num_layers: Number of stacked RNN layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Attribute names `rnn` and `fc` define the state_dict keys; keep them
        # stable so previously saved checkpoints still load.
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project every step's output with ``fc``."""
        step_outputs, _last_hidden = self.rnn(x)
        return self.fc(step_outputs)
    
# Initialize the model
INPUT_SIZE = 384 # Dimension of the input embeddings
HIDDEN_SIZE = 128 # Dimension of the hidden state
OUTPUT_SIZE = 8 # Number of classes

model = RNNModel(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)

# Load the model from the saved state.
# map_location="cpu" remaps any tensors that were saved from a GPU, so the
# checkpoint also loads on a CPU-only machine; the model itself is never moved
# off the CPU in this notebook.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
TASK_1_MODEL_PATH = "M1_Task1.pth"
model.load_state_dict(torch.load(TASK_1_MODEL_PATH, map_location="cpu"))

<All keys matched successfully>

In [16]:
# Compute F1 metrics
from sklearn.metrics import f1_score

def compute_metrics(predictions, labels, ignore_index=-1):
    """Compute weighted and macro F1 over the non-padded positions.

    Args:
        predictions: Flattened array-like of predicted class ids.
        labels: Flattened array-like of true class ids, same length as
            ``predictions``; positions equal to ``ignore_index`` are padding.
        ignore_index: Label value that marks padded positions (default -1,
            the padding value used when batching the labels).

    Returns:
        Tuple ``(weighted_f1, macro_f1)``.
    """
    # Accept lists as well as arrays: boolean-mask indexing needs ndarrays.
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)

    mask = labels != ignore_index  # Ignore padded labels
    masked_labels = labels[mask]
    masked_predictions = predictions[mask]

    weighted_f1 = f1_score(masked_labels, masked_predictions, average='weighted')
    macro_f1 = f1_score(masked_labels, masked_predictions, average='macro')
    return weighted_f1, macro_f1

In [17]:
# Evaluate the model on the validation set
model.eval()

# Per-batch results go into their own lists. The original reused `val_labels`
# here, which replaced the per-episode label list behind `val_dataset`; after
# that, re-running the dataset cell would build the dataset from a flat array.
val_prediction_batches = []
val_label_batches = []

with torch.no_grad():
    for embeddings, labels in val_dataloader:
        output = model(embeddings)

        # Flatten the output to (positions, classes) and the labels to (positions,)
        output = output.reshape(-1, output.shape[-1])
        labels = labels.reshape(-1)

        # Get the predictions (highest-scoring class per position)
        predictions = output.argmax(dim=-1)

        val_prediction_batches.append(predictions.cpu().numpy())
        val_label_batches.append(labels.cpu().numpy())

val_predictions = np.concatenate(val_prediction_batches)
val_labels_flat = np.concatenate(val_label_batches)

# Padded positions (label -1) are filtered out inside compute_metrics
weighted_f1_val, macro_f1_val = compute_metrics(val_predictions, val_labels_flat)

# Print the F1 scores
print("Validation Weighted F1:", weighted_f1_val)
print("Validation Macro F1:", macro_f1_val)

Validation Weighted F1: 0.8887216999599405
Validation Macro F1: 0.8618793949235899


### Testing Model M1 on a Test Set

- This section assumes the test set is in the same JSON format as the train and validation files.
- The JSON file is loaded here and Sentence-BERT embeddings are computed for its utterances.
- The F1 scores are then computed with the same methodology as above.

In [None]:
import json
from sentence_transformers import SentenceTransformer
import pickle

# Load the sentence-embedding model.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# RNN classifier - confirm that later cells do not expect the RNN under this name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read the test json file
TEST_PATH = "test.json"
with open(TEST_PATH) as f:
    test_data = json.load(f)

# Maps each episode key to the embeddings of that episode's utterances
test_embeddings_dict = {}
i = 0

for i, test_dict in enumerate(test_data, start=1):
    # Pull the fields needed from this entry
    episode_key, utterances = test_dict["episode"], test_dict["utterances"]

    # Encode all utterances of the episode in one call
    embeddings = model.encode(utterances)

    # Print the running count of processed entries (not the episode key)
    print(i)

    # Store the embeddings under the episode key
    test_embeddings_dict[episode_key] = embeddings