# Context LSTM with Aux Input/Output + Transformer

Based on the baseline implementation with the addition of a transformer

## Setup

In [56]:
# libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np

In [None]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [58]:
# set seed for reproducibility
seed = 123
torch.manual_seed(seed)

<torch._C.Generator at 0x20269bd4630>

## Prepare Data

In [68]:
# User Metadata
train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth")
test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth")
validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth")

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_emb_tensor.pth")
test_tweets = torch.load("../Data/Processed_Data/test_tweet_emb_tensor.pth")
validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_emb_tensor.pth")

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth")
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth")
validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth")

In [69]:
# do 20% for speed, update later on
num_samples = int(train_tweets.shape[0] * 0.2) // 2

class_0_indices = torch.nonzero(train_labels == 0).squeeze()
class_1_indices = torch.nonzero(train_labels == 1).squeeze()

print(class_0_indices.shape)
print(class_1_indices.shape)

# Shuffle the indices using torch.randperm
shuffled_class_0_indices = class_0_indices[torch.randperm(class_0_indices.size(0))]
shuffled_class_1_indices = class_1_indices[torch.randperm(class_1_indices.size(0))]

# Select an equal number of samples from each class
class_0_sampled_indices = shuffled_class_0_indices[:num_samples]
class_1_sampled_indices = shuffled_class_1_indices[:num_samples]

print(class_0_sampled_indices.shape)
print(class_1_sampled_indices.shape)

# Combine the selected indices for a balanced dataset
balanced_indices = torch.cat((class_0_sampled_indices, class_1_sampled_indices))

torch.Size([641831])
torch.Size([756634])
torch.Size([139846])
torch.Size([139846])


In [70]:
train_tweets = train_tweets[balanced_indices]
train_metadata = train_metadata[balanced_indices]
train_labels = train_labels[balanced_indices]

print(train_tweets.shape)
print(train_metadata.shape)
print(train_labels.shape)

torch.Size([279692, 200])
torch.Size([279692, 5])
torch.Size([279692])


## Set up LSTM + Transformer

In [100]:
class LSTMTransformerModel(nn.Module):
    
    # initialization
    def __init__(self, input_size, hidden_size, lstm_output_size, metadata_size, output_size, device, 
                 n_heads = 2, dim_feedforward = 2048, dropout = 0.1, lstm_layers=1, transformer_layers=1):
        super(LSTMTransformerModel, self).__init__()

        # set up the device
        self.device = device

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, lstm_layers, batch_first=False)

        # transformer
        self.transformer = nn.Transformer(
            d_model= hidden_size,
            nhead= n_heads,
            num_encoder_layers= transformer_layers,
            num_decoder_layers= transformer_layers,
            dim_feedforward= dim_feedforward,
            dropout= dropout,
            batch_first= False
        )

        # fully connected layers for the end LSTM + Transformer
        self.fc1 = nn.Linear(hidden_size, lstm_output_size) # for the initial LSTM & transformer pass through

        # 2-layer relu for the end for lstm_output & aux_output
        self.fc2_lstm = nn.Linear(lstm_output_size + metadata_size, hidden_size) 
        self.fc2_aux = nn.Linear(lstm_output_size, hidden_size)
 
        self.fc3 = nn.Linear(hidden_size, output_size) # check the dimensions
        self.relu = nn.ReLU()


    def forward(self, src, metadata):
        # send through LSTM
        lstm_out, (h_n, c_n) = self.lstm(src)

        # dummy tensor to start for decoder
        tgt = torch.zeros(lstm_out.shape, device= device)

        # send through transformer
        transformer_out = self.transformer(lstm_out, tgt)

        # take the last output
        lstm_trans_out = self.fc1(transformer_out[:, -1, :])


        # make the aux input/output
        aux_in = torch.hstack((lstm_trans_out, metadata[:, -1, :])).to(torch.float32).to(self.device)
        aux_out = lstm_trans_out

        # pass through 2-layer relu
        main_out = self.fc2_lstm(aux_in)
        main_out = self.fc3(main_out)
        main_out = self.relu(main_out)

        aux_out = self.fc2_aux(aux_out)
        aux_out = self.fc3(aux_out)
        aux_out = self.relu(aux_out)

        return main_out, aux_out



In [106]:
# LSTM + Transformer Hyperparameters
batch_size = 64
input_size = 200 # number of features in input data(200 for 200D embeddings)
hidden_size = 50
lstm_output_size = 32
metadata_size = train_metadata.size(1)
output_size = 1
n_heads = 2 # heads for the transformer
dim_feedforward = 2048 # dimension for feedforward network in transformer
dropout = 0.1
lstm_layers = 1 # number of LSTM Layers
transformer_layers = 1 # number of encoder and decoder layers

In [107]:
# set up training data to be proper size
x_train = train_tweets.unsqueeze(1)
x_train = x_train.to(torch.float32).to(device)
print(x_train.shape)

metadata_train = train_metadata.unsqueeze(1)
metadata_train = metadata_train.to(torch.float32).to(device)
print(metadata_train.shape)

y_train = train_labels.view(-1, 1).to(torch.float32).to(device)
print(y_train.shape)

torch.Size([279692, 1, 200])
torch.Size([279692, 1, 5])
torch.Size([279692, 1])


In [None]:
# Creating dataloader
dataset = TensorDataset(x_train, metadata_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set up model
model = LSTMTransformerModel(input_size, hidden_size, lstm_output_size, metadata_size, output_size, device, 
                                              n_heads, dim_feedforward, dropout, lstm_layers, transformer_layers)
model.to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.0001)  

In [109]:
# Training loop
num_epochs = 15  # Number of epochs to train

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    for inputs, metadata, labels in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        main_out, aux_out = model(inputs, metadata)
        
        # Compute individual loss
        loss = criterion(main_out, labels)
        aux_loss = criterion(aux_out, labels)

        # Total loss
        total_loss = 0.8* loss + 0.2*aux_loss
        
        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/15], Loss: 40.9575
Epoch [2/15], Loss: 0.6931
Epoch [3/15], Loss: 0.6931
Epoch [4/15], Loss: 0.6931
Epoch [5/15], Loss: 0.6931
Epoch [6/15], Loss: 0.6931
Epoch [7/15], Loss: 0.6931
Epoch [8/15], Loss: 0.6931
Epoch [9/15], Loss: 0.6931
Epoch [10/15], Loss: 0.6931
Epoch [11/15], Loss: 0.6931
Epoch [12/15], Loss: 0.6931
Epoch [13/15], Loss: 0.6931
Epoch [14/15], Loss: 0.6931
Epoch [15/15], Loss: 0.6931


In [148]:
# Set the model to evaluation mode 
model.eval()

# Set up testing data
x_test = test_tweets.unsqueeze(1)
x_test = x_test.to(torch.float32).to(device)

metadata_test = test_metadata.unsqueeze(1)
metadata_test = metadata_test.to(torch.float32).to(device)


# set up dataloader
test_dataset = TensorDataset(x_test, metadata_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# set up label list
predicted_labels = []

# Make predictions
with torch.no_grad():  # Disable gradient calculation to save memory and computations
    for inputs, metadata in test_dataloader:
        
        # Forward pass
        main_out, aux_out = model(inputs, metadata)
        
        # predictions
        predictions= (main_out > 0.5).float()
        predicted_labels.append(predictions)

predicted_labels = torch.cat(predicted_labels, dim=0).flatten()
predicted_labels.to(device)

print("Predictions:", predicted_labels)
print(len(predicted_labels))

Predictions: tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0')
199863


In [139]:
# Function to calculate TP, FP, TN, FN
def calculate_confusion_values(y_true, y_pred):
    """
    Calculate TP, FP, TN, FN for binary classification.
    
    Args:
        y_true (tensor): Ground truth labels.
        y_pred (tensor): Predicted labels.
        
    Returns:
        tuple: (TP, FP, TN, FN)
    """
    TP = ((y_true == 1) & (y_pred == 1)).sum().item()  # True Positive: True labels are 1, Predicted labels are 1
    FP = ((y_true == 0) & (y_pred == 1)).sum().item()  # False Positive: True labels are 0, Predicted labels are 1
    TN = ((y_true == 0) & (y_pred == 0)).sum().item()  # True Negative: True labels are 0, Predicted labels are 0
    FN = ((y_true == 1) & (y_pred == 0)).sum().item()  # False Negative: True labels are 1, Predicted labels are 0
    
    return TP, FP, TN, FN

In [151]:
TP, FP, TN, FN = calculate_confusion_values(test_labels.to(device), predicted_labels)

In [154]:
print(TP, FP, TN, FN)

0 0 96930 102933


In [153]:
# accuracy
accuracy = (TP + TN) / (TP + FP + TN + FN) * 100.0
print("Accuracy:", accuracy)

# F1
f1_score = TP / (TP + 0.5 * (FP + FN)) * 100.0
print("F1-Score", f1_score)

# MCC
mcc_score = ((TP * TN) - (FP * FN)) / (((TP +FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5) * 100.0

print("MCC Score:", mcc_score)

Accuracy: 48.498221281577884
F1-Score 0.0


ZeroDivisionError: float division by zero