CS 7643 Project

Georgia Institute of Technology

Author: Daniel Solon

# Improved Implementation: RoBERTa+DenseNet
Based on the paper "A Deep Learning Approach for Robust Detection of Bots in Twitter Using Transformers", whose best model combines RoBERTa embeddings with metadata to form the input feature vectors fed to a dense network.

Using the best parameters on the full training dataset with metadata, the performance metrics on the test dataset are as follows:
1. ACC: 0.765059065
2. F1: 0.7575362
3. MCC: 0.555427074

## Import Libraries

In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from tqdm import tqdm
import itertools
import pandas as pd

## Set Device
Set device to CUDA if available, else CPU

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

Using device: cuda


In [4]:
# `device` is a torch.device, which never compares equal to a plain string,
# so the backend has to be checked through its `.type` attribute.
if device.type == 'cuda':
    # Release cached, currently unused GPU memory before loading the datasets.
    torch.cuda.empty_cache()

## Loading Data (Test, Train, Validate)

In [5]:
# Every tensor is loaded with map_location=device so that files saved from a GPU
# session can still be read when the notebook falls back to the CPU.
# The trailing .to(device) is then a no-op kept as a safety net.

# User Metadata
train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth", map_location=device).to(device)
test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth", map_location=device).to(device)
# validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth", map_location=device).to(device)

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_roberta_emb_tensor.pth", map_location=device).to(device)
test_tweets = torch.load("../Data/Processed_Data/test_tweet_roberta_emb_tensor.pth", map_location=device).to(device)
# validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_roberta_emb_tensor.pth", map_location=device).to(device)

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth", map_location=device).to(device)
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth", map_location=device).to(device)
# validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth", map_location=device).to(device)

In [6]:
# Sanity check: show the shape of each loaded feature tensor, tweet embeddings
# first and metadata second (train before test within each group).
# The validation tensors are not loaded, so they are not listed here.
for loaded_tensor in (train_tweets, test_tweets, train_metadata, test_metadata):
    print(loaded_tensor.shape)

torch.Size([1398465, 768])
torch.Size([199863, 768])
torch.Size([1398465, 5])
torch.Size([199863, 5])


## DenseNet
Define the dense classifier model. The layers are as defined in the paper: input layer, hidden layer, then output layer. Model parameters are based on Table 2 of the paper; however, some parameter values might not match because the paper's description is ambiguous.

In [7]:
class DenseBotClassifier(nn.Module):
    """
    Feed-forward bot classifier: input block -> hidden block -> linear output.

    The forward pass returns the output of the final linear layer directly;
    no sigmoid is applied inside the model.
    """

    def __init__(self, input_dim=768+5, output_dim=1, hidden_input_dim=1024, hidden_dim=512, hidden_output_dim=256, dropout=0.5):
        """
        :param input_dim: number of input features (RoBERTa embeddings + metadata)
        :param output_dim: number of outputs of the final linear layer; 1 for binary classification
        :param hidden_input_dim: width produced by input_layer and consumed by hidden_layer
        :param hidden_dim: intermediate width inside hidden_layer
        :param hidden_output_dim: width produced by hidden_layer and consumed by output_layer
        :param dropout: dropout probability at the end of input_layer and hidden_layer; adjust for hyperparameter tuning
        """
        super().__init__()

        # Input block: one dense unit followed by dropout.
        self.input_layer = nn.Sequential(
            *self._dense_unit(input_dim, hidden_input_dim),
            nn.Dropout(dropout),
        )

        # Hidden block: two stacked dense units followed by dropout.
        self.hidden_layer = nn.Sequential(
            *self._dense_unit(hidden_input_dim, hidden_dim),
            *self._dense_unit(hidden_dim, hidden_output_dim),
            nn.Dropout(dropout),
        )

        # Final projection to the output dimension.
        self.output_layer = nn.Linear(hidden_output_dim, output_dim)

    @staticmethod
    def _dense_unit(in_features, out_features):
        """Return the (Linear, BatchNorm1d, SELU) triple that makes up one dense unit."""
        return (
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.SELU(),
        )

    def forward(self, x):
        """Run x through the input block, the hidden block and the output projection."""
        return self.output_layer(self.hidden_layer(self.input_layer(x)))

## Prepare Data

Concatenate text embedding with metadata embedding

In [8]:
# Append the metadata columns to the tweet-embedding columns, row by row
# (both inputs are 2-D, so the join is along the feature axis).
train = torch.cat((train_tweets, train_metadata), dim=1)
test = torch.cat((test_tweets, test_metadata), dim=1)

In [9]:
# Confirm the combined feature width (train first, then test).
for combined_features in (train, test):
    print(combined_features.shape)

torch.Size([1398465, 773])
torch.Size([199863, 773])


Create Dataloaders

In [18]:
batch_size = 256  # Batch size for training

# Training batches are reshuffled every epoch; the last incomplete batch is dropped.
train_dataset = TensorDataset(train, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation gains nothing from a random order, so the test set is read
# sequentially, which keeps the batches reproducible. Every sample is kept.
test_dataset = TensorDataset(test, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Model Training

Initialize model, loss function, and optimizer

In [11]:
# Number of input features (tweet embeddings + metadata)
input_size = train.shape[1]

# Baseline model with the default layer widths and dropout.
model = DenseBotClassifier(input_dim=input_size)
model = model.to(device)

# Binary cross-entropy computed directly on the model's raw outputs (logits)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Train model

In [12]:
def train_model(model, train_loader, optimizer, criterion, epochs=10):
    """
    Train the bot classifier in place and report the mean loss of every epoch.

    :param model: bot classifier; its weights are updated in place
    :param train_loader: iterable yielding (features, labels) batches
    :param optimizer: optimizer bound to the model parameters (Adam as specified in the paper)
    :param criterion: loss called as criterion(logits, float labels) (BCE as specified in the paper)
    :param epochs: number of passes over train_loader; adjust for hyperparameter tuning
    :return: None
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True, dynamic_ncols=True, position=0)

        for i, batch in enumerate(progress_bar):
            # Cast to float32 and move to the module-level `device`.
            embeddings, labels = batch[0].float().to(device), batch[1].float().to(device)
            optimizer.zero_grad()
            outputs = model(embeddings).squeeze(1)  # Drop the trailing output dimension to match the labels
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Refresh the progress bar only every 10th batch to keep the display overhead low.
            if i % 10 == 0:
                progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

        # Summarise the epoch with the mean batch loss (skipped for an empty loader).
        if num_batches:
            print(f"Epoch {epoch+1}/{epochs} - mean training loss: {total_loss / num_batches:.4f}")

In [None]:
# Train the baseline model for 10 epochs; the weights are updated in place and nothing is returned.
train_model(model, train_loader, optimizer, criterion, epochs=10)

## Model Evaluation

Evaluate model

In [13]:
def evaluate_model(model, test_loader):
    """
    Score the bot classifier on a labelled dataset.

    :param model: bot classifier; switched to eval mode by this call
    :param test_loader: iterable yielding (features, labels) batches
    :return: tuple (accuracy, weighted F1, Matthews correlation coefficient)
    """
    model.eval()
    preds, true_labels = [], []

    # No gradients are needed for scoring.
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch", dynamic_ncols=True, position=0)

        for batch in progress_bar:
            # Cast to float32 and move to the module-level `device`.
            embeddings, labels = batch[0].float().to(device), batch[1].float().to(device)
            outputs = model(embeddings).squeeze(1)
            probs = torch.sigmoid(outputs)
            # A probability above 0.5 is predicted as the positive class.
            preds.extend((probs > 0.5).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')  # Adjust 'weighted' if needed
    mcc = matthews_corrcoef(true_labels, preds)

    return acc, f1, mcc

In [None]:
# Bare last expression, so the notebook displays the returned (accuracy, weighted F1, MCC) tuple.
evaluate_model(model, test_loader)

## Grid search

Define parameter grid

In [50]:
# Candidate values for each hyperparameter. The key order defines the position
# of each value inside every combination tuple generated below.
param_grid = dict(
    learning_rate=[0.001],
    dropout=[0.5],
    hidden_input_dim=[1024],
    hidden_dim=[512],
    hidden_output_dim=[256],
    batch_size=[256],
    epochs=[10],
    weight_decay=[0.015],
)

# Cartesian product of all candidate lists: one tuple per configuration
candidate_lists = param_grid.values()
param_combinations = [combo for combo in itertools.product(*candidate_lists)]

Run grid search and save results

In [None]:
# Define CSV file path
csv_filename = "../Data/Tuning_Results/hyperparameter_results_RoBERTa_DenseNet.csv"

# Single definition of the CSV layout, shared by the empty-frame fallback,
# the already-tested lookup and the result rows, so they cannot drift apart.
# The hyperparameter order matches the order of the values in each combination tuple.
param_columns = ["learning_rate", "dropout", "hidden_input_dim", "hidden_dim", "hidden_output_dim", "batch_size", "epochs", "weight_decay"]
metric_columns = ["accuracy", "f1_score", "mcc"]

# Create the results directory up front so a finished training run is not lost
# to a missing folder when its row is written.
os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

# Load existing results if the CSV exists
if os.path.exists(csv_filename):
    existing_results = pd.read_csv(csv_filename)
else:
    existing_results = pd.DataFrame(columns=param_columns + metric_columns)

# Convert existing results to a set of tested hyperparameters
tested_params = set(tuple(row) for row in existing_results[param_columns].values)

# Filter out already tested hyperparameters
new_param_combinations = [params for params in param_combinations if params not in tested_params]

print(f"Total hyperparameter sets to test: {len(new_param_combinations)}")

# NOTE(review): configurations are scored on the test split because the validation
# tensors are commented out in the loading cell - confirm this is intended before
# treating the selected configuration's test metrics as an unbiased estimate.

# Loop through only new hyperparameter combinations
for i, params in enumerate(new_param_combinations):
    print("-----------------------------------------------------------------------------------------------------")
    print(f"Processing parameters {i+1}/{len(new_param_combinations)}")

    lr, dropout, hidden_input, hidden, hidden_output, batch_size, epochs, weight_decay = params

    # Create Data Loaders (the evaluation loader is not shuffled: order does not affect the metrics)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = DenseBotClassifier(
        input_dim=train.shape[1],
        output_dim=1,
        hidden_input_dim=hidden_input,
        hidden_dim=hidden,
        hidden_output_dim=hidden_output,
        dropout=dropout
    ).to(device)

    # Define loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Train the model
    train_model(model, train_loader, optimizer, criterion, epochs=epochs)

    # Evaluate the model
    acc, f1, mcc = evaluate_model(model, test_loader)

    # Save model checkpoint
    model_filename = f"../Data/Processed_Data/model_lr{lr}_dropout{dropout}_dim{hidden_input}-{hidden}-{hidden_output}_batch{batch_size}_epochs{epochs}_wd{weight_decay}.pth"
    torch.save(model.state_dict(), model_filename)
    print(f"Model saved: {model_filename}")

    # Save results: one row holding the hyperparameters followed by the metrics
    result_row = dict(zip(param_columns, params))
    result_row.update(zip(metric_columns, (acc, f1, mcc)))
    df_new = pd.DataFrame([result_row])

    # Append immediately so completed runs survive an interruption; the header is
    # written only when the file does not exist yet.
    df_new.to_csv(csv_filename, mode='a', header=not os.path.exists(csv_filename), index=False)

print("Grid search complete. Results saved to hyperparameter_results_RoBERTa_DenseNet.csv.")