# Improved Implementation: RoBERTa+DenseNet
Based on "A Deep Learning Approach for Robust Detection of Bots in Twitter Using Transformers" paper where their best model is based on (RoBERTa + metadata) for the input feature vectors fed to a Dense network.

## Import Libraries

In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from tqdm import tqdm

## Set Device

In [2]:
# Set device to CUDA if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## Loading Data (Test, Train, Validate)

In [3]:
# User Metadata
# train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth").to(device)
# test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth").to(device)
# validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth").to(device)

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_roberta_emb_tensor.pth").to(device)
test_tweets = torch.load("../Data/Processed_Data/test_tweet_roberta_emb_tensor.pth").to(device)
# validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_roberta_emb_tensor.pth").to(device)

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth").to(device)
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth").to(device)
# validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth").to(device)

In [4]:
print(train_tweets.shape)
print(test_tweets.shape)
# print(validate_tweets.shape)

print(train_labels.shape)
print(test_labels.shape)
# print(validate_labels.shape)

torch.Size([199863, 768])


Stratified sampling to expedite model training

In [5]:
# Move labels to CPU for processing
train_labels_cpu = train_labels.cpu().numpy()
# train_labels_cpu = test_labels.cpu().numpy()

# Perform stratified sampling to maintain label distribution
_, sampled_indices = train_test_split(
    range(train_tweets.shape[0]), 
    # range(test_tweets.shape[0]), 
    test_size=0.3, 
    stratify=train_labels_cpu, 
    random_state=42
)

# Convert to tensor and move to device
sampled_indices = torch.tensor(sampled_indices, device=device)

train_tweets = train_tweets[sampled_indices].contiguous()
# train_metadata = train_metadata[sampled_indices].contiguous()
train_labels = train_labels[sampled_indices].contiguous()

print(train_tweets.shape)

torch.Size([59959, 768])


## DenseNet

Define Dense Classifier Model. The layers are as defined on the paper: Input Layer, Hidden Layer, then Output Layer. Model parameters are based on Table 2 from the paper. However, some parameter values might not match because of description ambiguity.

In [8]:
class DenseBotClassifier(nn.Module):
    def __init__(self, input_dim=768, output_dim=1, hidden_input_dim=512, hidden_dim=256, hidden_output_dim=128, dropout=0.3):
        """
        :param input_dim: (RoBERTa embeddings + metadata) dimension
        :param output_dim: 1 for binary classification
        :param input_dim: input_layer input dimension
        :param output_dim: input_layer output dimension
        :param hidden_input_dim: hidden_layer input dimension
        :param hidden_dim: hidden_layer intermediate dimension
        :param hidden_output_dim: hidden_layer output dimension
        :param dropout: adjust for hyperparameter tuning
        """
        super(DenseBotClassifier, self).__init__()
        
        self.input_layer = nn.Sequential(
                            nn.Linear(input_dim, hidden_input_dim),
                            nn.BatchNorm1d(hidden_input_dim),
                            nn.SELU(),
                            nn.Dropout(dropout)
                        )
        
        self.hidden_layer = nn.Sequential(
                            nn.Linear(hidden_input_dim, hidden_dim),
                            nn.BatchNorm1d(hidden_dim),
                            nn.SELU(),
                            nn.Linear(hidden_dim, hidden_output_dim),
                            nn.BatchNorm1d(hidden_output_dim),
                            nn.SELU(),
                            nn.Dropout(dropout)
                        )                    
        
        self.output_layer = nn.Sequential(
                            nn.Linear(hidden_output_dim, output_dim),
                            nn.Sigmoid()
                        )
    
    def forward(self, x):
        input_layer_activation = self.input_layer(x)
        hidden_layer_activation = self.hidden_layer(input_layer_activation)

        return self.output_layer(hidden_layer_activation)

Define model parameters

In [9]:
input_size = 768 # Number of input features (tweet embeddings + metadata)
batch_size = 64  # Batch size for training

## Model Training

Create Dataloaders

In [13]:
train_dataset = TensorDataset(train_tweets, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Initialize model, loss function, and optimizer

In [73]:
model = DenseBotClassifier(input_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.001)  

Train model

In [64]:
def train_model(model, train_loader, optimizer, criterion, epochs=10):
    """
    :param model: bot classifier
    :param train_loader: training dataset
    :param optimizer: Adam as specified on paper
    :param criterion: BCE as specified on paper
    :param epochs: adjust for hyperparameter tuning
    :return: None
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True, dynamic_ncols=True, position=0) 
        
        for batch in progress_bar:
            embeddings, labels = batch[0].to(device), batch[1].float().to(device)
            optimizer.zero_grad()
            outputs = model(embeddings).squeeze(1)  # Ensure correct shape
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            progress_bar.set_postfix(loss=f"{loss.item():.4f}")

In [65]:
train_model(model, train_loader, optimizer, criterion, epochs=10)

Epoch 1/10: 100%|██████████| 937/937 [00:05<00:00, 166.09it/s, loss=0.6137]
Epoch 2/10: 100%|██████████| 937/937 [00:05<00:00, 169.93it/s, loss=0.6172]
Epoch 3/10: 100%|██████████| 937/937 [00:05<00:00, 156.74it/s, loss=0.6463]
Epoch 4/10: 100%|██████████| 937/937 [00:06<00:00, 148.74it/s, loss=0.5849]
Epoch 5/10: 100%|██████████| 937/937 [00:06<00:00, 146.27it/s, loss=0.6163]
Epoch 6/10: 100%|██████████| 937/937 [00:07<00:00, 130.53it/s, loss=0.5923]
Epoch 7/10: 100%|██████████| 937/937 [00:07<00:00, 129.13it/s, loss=0.5618]
Epoch 8/10: 100%|██████████| 937/937 [00:07<00:00, 124.56it/s, loss=0.5924]
Epoch 9/10: 100%|██████████| 937/937 [00:07<00:00, 127.38it/s, loss=0.6270]
Epoch 10/10: 100%|██████████| 937/937 [00:07<00:00, 125.83it/s, loss=0.6269]


## Model Evaluation

Create Dataloaders

In [13]:
test_dataset = TensorDataset(test_tweets, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Evaluate model

In [74]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

def evaluate_model(model, test_loader):
    """
    :param model: bot classifier
    :param test_loader: testing dataset
    :return: None
    """
    model.eval()
    preds, true_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
            embeddings, labels = batch[0].to(device), batch[1].float().to(device)
            outputs = model(embeddings)
            preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')  # Adjust 'weighted' if needed
    mcc = matthews_corrcoef(true_labels, preds)
    
    print(f'Test Accuracy: {acc:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'MCC: {mcc:.4f}')

In [71]:
evaluate_model(model, train_loader)

Test Accuracy: 0.4850
F1 Score: 0.3168
MCC: 0.0000
