# Improved Implementation: RoBERTa+BiLSTM

Using best parameters on the full training dataset with metadata, the performance metric scores on the test dataset are as follows: 
1. ACC: 0.8075
2. F1: 0.8039
3. MCC: 0.6306

## Import Libraries

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from tqdm import tqdm
import itertools
import pandas as pd

## Set Device
Set device to CUDA if available, else CPU

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
if device == 'cuda': 
    torch.cuda.empty_cache()

## Loading Data (Test, Train, Validate)

In [4]:
# User Metadata
train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth").to(device)
test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth").to(device)
# validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth").to(device)

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_roberta_emb_tensor.pth")[:,:200].to(device)
test_tweets = torch.load("../Data/Processed_Data/test_tweet_roberta_emb_tensor.pth")[:,:200].to(device)
# validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_roberta_emb_tensor.pth").to(device)

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth").to(device)
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth").to(device)
# validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth").to(device)

In [5]:
print(train_tweets.shape)
print(test_tweets.shape)
# print(validate_tweets.shape)

print(train_metadata.shape)
print(test_metadata.shape)
# print(validate_metadata.shape)

torch.Size([1398465, 200])
torch.Size([199863, 200])
torch.Size([1398465, 5])
torch.Size([199863, 5])


Set seed for Reproducability

In [5]:
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x17376d38190>


## BiLSTM Model

In [6]:
class LSTMModel(nn.Module):
    
    def __init__(self, input_size=200, hidden_size=50, output_size=32, num_layers=1, dropout_rate = 0.0):
        super(LSTMModel, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False, bidirectional = True, dropout = dropout_rate)
        
        # Define a fully connected output layer
        self.fc = nn.Linear(hidden_size * 2, output_size)
        
        
    def forward(self, x):
        # Pass through BiLSTM
        lstm_out, (hn, cn) = self.lstm(x)
        
        output = self.fc(lstm_out[:, -1, :])
        return output

In [7]:
lstm_input_size = 200 #768 # Number of features in input data (768 for Roberta Embeddings)
lstm_hidden_size = 50
lstm_output_size = 32
lstm_number_layers = 1 # Number of LSTM Layers
lstm_dropout = 0

In [None]:
# Creating LSTM Model
lstm_model = LSTMModel(input_size = lstm_input_size, hidden_size = lstm_hidden_size, output_size = lstm_output_size, 
                      num_layers = lstm_number_layers, dropout_rate=lstm_dropout).to(device)

## Prepare Data

Getting lstm_output for datasets

In [8]:
train_tweets = train_tweets.unsqueeze(1).to(torch.float32)
test_tweets = test_tweets.unsqueeze(1).to(torch.float32)

In [9]:
train_lstm_output = lstm_model(train_tweets)
train_lstm_output = train_lstm_output.detach()

test_lstm_output = lstm_model(test_tweets)
test_lstm_output = test_lstm_output.detach()

In [10]:
print(train_lstm_output.shape)
print(test_lstm_output.shape)

torch.Size([1398465, 32])
torch.Size([199863, 32])


Concatenate text embedding with metadata embedding

In [11]:
x_train = torch.hstack((train_lstm_output, train_metadata)).to(torch.float32)
x_test = torch.hstack((test_lstm_output, test_metadata)).to(torch.float32)

In [12]:
y_train = train_labels.view(-1, 1).to(torch.float32)
y_test = test_labels.view(-1, 1).to(torch.float32)

In [13]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

torch.Size([1398465, 37]) torch.Size([1398465, 1])
torch.Size([199863, 37]) torch.Size([199863, 1])


In [14]:
x_train.requires_grad_(False)
y_train.requires_grad_(False)

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]], device='cuda:0')

## Final Layer

In [9]:
class NeuralNetwork(nn.Module):
    
    def __init__(self, input_size, hidden_size_1, hidden_size_2, output_size=1, dropout=0.2):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size_1)  # First fully connected layer
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)  # Second fully connected layer
        self.out = nn.Linear(hidden_size_2, output_size)
        self.dropout = nn.Dropout(p = dropout)
        
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))  # Apply ReLU activation after the first layer
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))  # Output layer
        
        x = self.out(x)
        return x

In [16]:
# Network Parameters
input_size = 37 # Number of input features (tweet embeddings + metadata)
hidden_size_1 = 128  # Number of hidden units in the first fully connected layer
hidden_size_2 = 64  # Number of hidden units in the first fully connected layer
output_size = 1  # Output size (binary classification: 1 output)

## Create Dataloaders

In [17]:
batch_size = 64 # Batch size for training

train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

## Model Training

Initialize model, loss function, and optimizer

In [18]:
torch.manual_seed(seed)
model = NeuralNetwork(input_size, hidden_size_1, hidden_size_2, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.0001)

Train model

In [10]:
def train_model(model, train_loader, optimizer, criterion, epochs=10):
    """
    :param model: bot classifier
    :param train_loader: training dataset
    :param optimizer: Adam as specified on paper
    :param criterion: BCE as specified on paper
    :param epochs: adjust for hyperparameter tuning
    :return: None
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True, dynamic_ncols=True, position=0) 
        
        for i, batch in enumerate(progress_bar):
            embeddings, labels = batch[0].float().to(device), batch[1].float().to(device)
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if i % 10 == 0:
                progress_bar.set_postfix(loss=f"{loss.item():.4f}")

In [25]:
train_model(model, train_loader, optimizer, criterion, epochs=10)

Epoch 1/10: 100%|██████████| 21852/21852 [00:46<00:00, 469.58it/s, loss=3.7123]  
Epoch 2/10: 100%|██████████| 21852/21852 [00:47<00:00, 462.30it/s, loss=0.5805] 
Epoch 3/10: 100%|██████████| 21852/21852 [00:47<00:00, 457.80it/s, loss=0.5366] 
Epoch 4/10: 100%|██████████| 21852/21852 [00:47<00:00, 461.57it/s, loss=0.4693]
Epoch 5/10: 100%|██████████| 21852/21852 [00:45<00:00, 475.20it/s, loss=0.5689]
Epoch 6/10: 100%|██████████| 21852/21852 [00:46<00:00, 469.88it/s, loss=0.5335] 
Epoch 7/10: 100%|██████████| 21852/21852 [00:48<00:00, 453.41it/s, loss=0.4210]
Epoch 8/10: 100%|██████████| 21852/21852 [00:47<00:00, 456.20it/s, loss=0.7942] 
Epoch 9/10: 100%|██████████| 21852/21852 [00:46<00:00, 469.60it/s, loss=0.4894]
Epoch 10/10: 100%|██████████| 21852/21852 [00:46<00:00, 473.38it/s, loss=0.4773]


## Model Evaluation

Evaluate model

In [11]:
def evaluate_model(model, test_loader):
    """
    :param model: bot classifier
    :param test_loader: testing dataset
    :return: None
    """
    model.eval()
    preds, true_labels = [], []
    
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch", dynamic_ncols=True, position=0)
        
        for batch in progress_bar:
            embeddings, labels = batch[0].float().to(device), batch[1].float().to(device)
            outputs = model(embeddings)
            probs = torch.sigmoid(outputs)
            preds.extend((probs > 0.5).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')  # Adjust 'weighted' if needed
    mcc = matthews_corrcoef(true_labels, preds)
    
    return acc, f1, mcc

In [27]:
evaluate_model(model, test_loader)

Evaluating: 100%|██████████| 3123/3123 [00:02<00:00, 1172.49batch/s]


(0.8044960798146731, 0.8017276384516153, 0.6198985535659354)

## Grid search

Batch data fed to LSTM to fit in GPU

In [12]:
def extract_lstm_outputs(model, dataloader):
    model.eval()
    outputs = []
    with torch.no_grad():
        for (batch,) in dataloader:
            batch = batch.to(device)
            out = model(batch)
            outputs.append(out.cpu())
    return torch.cat(outputs, dim=0)

Define parameter grid

In [13]:
param_grid = {
    "learning_rate": [0.0001],
    "lstm_dropout": [0.5],
    "lstm_hidden_size": [50],
    "lstm_output_size": [32],
    "lstm_number_layers": [2],
    "batch_size": [64],
    "dropout": [0.2],
    "hidden_size_1": [256],
    "hidden_size_2": [64],
    "epochs": [10],
    "weight_decay": [0]
}

# Generate all combinations
param_combinations = list(itertools.product(*param_grid.values()))

Run grid search and save results

In [15]:
# Define CSV file path
csv_filename = "../Data/Tuning_Results/hyperparameter_results_Roberta_BiLSTM.csv"

# Load existing results if the CSV exists
if os.path.exists(csv_filename):
    existing_results = pd.read_csv(csv_filename)
else:
    existing_results = pd.DataFrame(columns=["learning_rate", "lstm_dropout", "lstm_hidden_size", "lstm_output_size", "lstm_number_layers", "batch_size", "dropout", "hidden_size_1", "hidden_size_2", "epochs", "weight_decay", "accuracy", "f1_score", "mcc"])

# Convert existing results to a set of tested hyperparameters
tested_params = set(
    tuple(row) for row in existing_results[["learning_rate", "lstm_dropout", "lstm_hidden_size", "lstm_output_size", "lstm_number_layers", "batch_size", "dropout", "hidden_size_1", "hidden_size_2", "epochs", "weight_decay"]].values
)

# Filter out already tested hyperparameters
new_param_combinations = param_combinations#[params for params in param_combinations if params not in tested_params]

print(f"Total hyperparameter sets to test: {len(new_param_combinations)}")

torch.manual_seed(seed)
# Loop through only new hyperparameter combinations
for i, params in enumerate(new_param_combinations):
    print("-----------------------------------------------------------------------------------------------------")
    print(f"Processing parameters {i+1}/{len(new_param_combinations)}")

    lr, lstm_dropout, lstm_hidden_size, lstm_output_size, lstm_number_layers, batch_size, dropout, hidden_size_1, hidden_size_2, epochs, weight_decay = params

    # Initialize lstm model
    lstm_model = LSTMModel(
                    input_size = train_tweets.shape[-1],
                    hidden_size = lstm_hidden_size,
                    output_size = lstm_output_size,
                    num_layers = lstm_number_layers,
                    dropout_rate = lstm_dropout
                ).to(device)

    lstm_train_dataset = TensorDataset(train_tweets)
    lstm_train_loader = DataLoader(lstm_train_dataset, batch_size=batch_size, shuffle=False)

    lstm_test_dataset = TensorDataset(test_tweets)
    lstm_test_loader = DataLoader(lstm_test_dataset, batch_size=batch_size, shuffle=False)

    train_lstm_output = extract_lstm_outputs(lstm_model, lstm_train_loader).detach()
    test_lstm_output = extract_lstm_outputs(lstm_model, lstm_test_loader).detach()

    # save embedding checkpoint
    torch.save(train_lstm_output, f"../Data/Processed_Data/Roberta_LSTM_tweet_embedding.pth")

    # save model checkpoint
    torch.save(lstm_model.state_dict(), f"../Data/Models/Roberta_LSTM_BiLSTM.pth")

    x_train = torch.hstack((train_lstm_output.to(device), train_metadata)).to(torch.float32)
    x_test = torch.hstack((test_lstm_output.to(device), test_metadata)).to(torch.float32)

    # save embedding checkpoint
    torch.save(x_train, '../Data/Processed_Data/Roberta_LSTM_tweet_metadata_embedding.pth')

    y_train = train_labels.view(-1, 1).to(torch.float32)
    y_test = test_labels.view(-1, 1).to(torch.float32)

    x_train.requires_grad_(False)
    y_train.requires_grad_(False)

    # Initialize neural network model
    model = NeuralNetwork(
        input_size=x_train.shape[1],
        hidden_size_1=hidden_size_1,
        hidden_size_2=hidden_size_2,
        dropout=dropout
    ).to(device)

    # Define loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Create Data Loaders
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    test_dataset = TensorDataset(x_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    # Train the model
    train_model(model, train_loader, optimizer, criterion, epochs=epochs)

    # Evaluate the model
    acc, f1, mcc = evaluate_model(model, test_loader)

    # Save model checkpoint
    torch.save(model.state_dict(), f"../Data/Models/Roberta_LSTM_FC.pth")

    # Save results
    df_new = pd.DataFrame([{
        "learning_rate": lr,
        "lstm_dropout": lstm_dropout,
        "lstm_hidden_size": lstm_hidden_size,
        "lstm_output_size": lstm_output_size,
        "lstm_number_layers": lstm_number_layers,
        "batch_size": batch_size,
        "dropout": dropout,
        "hidden_size_1": hidden_size_1,
        "hidden_size_2": hidden_size_2,
        "epochs": epochs,
        "weight_decay": weight_decay,
        "accuracy": acc,
        "f1_score": f1,
        "mcc": mcc
    }])

    df_new.to_csv(csv_filename, mode='a', header=not os.path.exists(csv_filename), index=False)

print("Grid search complete. Results saved to hyperparameter_results_Roberta_BiLSTM.csv.")

Total hyperparameter sets to test: 1
-----------------------------------------------------------------------------------------------------
Processing parameters 1/1


Epoch 1/10: 100%|██████████| 21851/21851 [01:26<00:00, 252.20it/s, loss=58.1512] 
Epoch 2/10: 100%|██████████| 21851/21851 [01:25<00:00, 256.40it/s, loss=0.7587] 
Epoch 3/10: 100%|██████████| 21851/21851 [01:21<00:00, 266.54it/s, loss=0.5353]
Epoch 4/10: 100%|██████████| 21851/21851 [01:21<00:00, 267.01it/s, loss=0.5112]
Epoch 5/10: 100%|██████████| 21851/21851 [01:17<00:00, 283.15it/s, loss=0.6253]
Epoch 6/10: 100%|██████████| 21851/21851 [01:18<00:00, 279.23it/s, loss=0.4604] 
Epoch 7/10: 100%|██████████| 21851/21851 [01:17<00:00, 282.96it/s, loss=0.5014]
Epoch 8/10: 100%|██████████| 21851/21851 [01:19<00:00, 275.69it/s, loss=0.4645]
Epoch 9/10: 100%|██████████| 21851/21851 [01:19<00:00, 275.53it/s, loss=0.7359]
Epoch 10/10: 100%|██████████| 21851/21851 [01:16<00:00, 284.14it/s, loss=0.5299]
Evaluating: 100%|██████████| 3123/3123 [00:04<00:00, 640.94batch/s]


Grid search complete. Results saved to hyperparameter_results_Roberta_BiLSTM.csv.
