CS 7643 Project

Georgia Institute of Technology

Author: Carmen Yu

# Baseline Implementation
Based on “Deep Neural Networks for Bot Detection” paper by Kudugunta et al. which uses a Contextual LSTM (200D GloVE) model. Our goal is to match the following performance metric scores reported on “TwiBot-20 A Comprehensive Twitter Bot Detection Benchmark” by Feng et al. paper for this model. Our scores are as follows:
1. ACC = 82.9513
2. F1 = 84.0390
3. MCC = 65.9867 

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [26]:
# if training on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Loading Data (Test, Train, Validate)

In [2]:
# User Metadata
train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth")
test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth")
validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth")

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_emb_tensor.pth")
test_tweets = torch.load("../Data/Processed_Data/test_tweet_emb_tensor.pth")
validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_emb_tensor.pth")

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth")
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth")
validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth")

In [3]:
print(train_tweets.shape)
print(test_tweets.shape)
print(validate_tweets.shape)

torch.Size([1398465, 200])
torch.Size([199863, 200])
torch.Size([401540, 200])


In [4]:
# For Reproducability
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x2976c160630>

In [5]:
# Using only 50% of training for time constraints
num_samples = int(train_tweets.shape[0] * 0.5) // 2

class_0_indices = torch.nonzero(train_labels == 0).squeeze()
class_1_indices = torch.nonzero(train_labels == 1).squeeze()

print(class_0_indices.shape)
print(class_1_indices.shape)

# Shuffle the indices using torch.randperm
shuffled_class_0_indices = class_0_indices[torch.randperm(class_0_indices.size(0))]
shuffled_class_1_indices = class_1_indices[torch.randperm(class_1_indices.size(0))]

# Select an equal number of samples from each class
class_0_sampled_indices = shuffled_class_0_indices[:num_samples]
class_1_sampled_indices = shuffled_class_1_indices[:num_samples]

print(class_0_sampled_indices.shape)
print(class_1_sampled_indices.shape)

# Combine the selected indices for a balanced dataset
balanced_indices = torch.cat((class_0_sampled_indices, class_1_sampled_indices))

torch.Size([641831])
torch.Size([756634])
torch.Size([349616])
torch.Size([349616])


In [6]:
train_tweets = train_tweets[balanced_indices]
train_metadata = train_metadata[balanced_indices]
train_labels = train_labels[balanced_indices]

print(train_tweets.shape)

torch.Size([699232, 200])


In [7]:
print((test_labels == True).sum() / test_labels.shape[0])
print((test_labels == False).sum() / test_labels.shape[0])

tensor(0.5150)
tensor(0.4850)


In [8]:
print((train_labels == True).sum() / train_labels.shape[0])
print((train_labels == False).sum() / train_labels.shape[0])

tensor(0.5000)
tensor(0.5000)



## Passing tweets through LSTM

In [9]:
# LSTM Model
class LSTMModel(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)
        
        # Define a fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)
        
        
    def forward(self, x):
        # Pass the input through the LSTM layer
        lstm_out, (h_n, c_n) = self.lstm(x)
        
        # Get the output from the last time step
        output = self.fc(lstm_out[:, -1, :])  # Only take the last output for prediction
        return output

In [10]:
# LSTM Model Parameters
input_size = 200 # Number of features in input data(200 for 200D embeddings)
hidden_size = 50
output_size = 32
number_layers = 1 # Number of LSTM Layers

# Creating LSTM Model
lstm_model = LSTMModel(input_size = input_size, hidden_size = hidden_size, output_size = output_size, 
                      num_layers = number_layers)

In [None]:
train_input_data = train_tweets.unsqueeze(1)
train_input_data = train_input_data.to(torch.float32).to(device)

test_input_data = test_tweets.unsqueeze(1)
test_input_data = test_input_data.to(torch.float32)

validate_input_data = validate_tweets.unsqueeze(1)
validate_input_data = validate_input_data.to(torch.float32)

# Getting lstm_output for datasets
train_lstm_output = lstm_model(train_input_data)
train_lstm_output = train_lstm_output.detach()

test_lstm_output = lstm_model(test_input_data)
test_lstm_output = test_lstm_output.detach()

validate_lstm_output = lstm_model(validate_input_data)
validate_lstm_output = validate_lstm_output.detach()

In [12]:
print(train_lstm_output.shape)

torch.Size([699232, 32])


## Set aux output (output from LSTM)

In [28]:
aux_train = train_lstm_output.to(device)
aux_test = test_lstm_output.to(device)

## Concatenating with aux input (user metadata)

In [27]:
x_train = torch.hstack((train_lstm_output, train_metadata)).to(torch.float32).to(device)

x_test = torch.hstack((test_lstm_output, test_metadata)).to(torch.float32).to(device)

x_validate = torch.hstack((validate_lstm_output, validate_metadata)).to(torch.float32).to(device)

In [29]:
y_train = train_labels.view(-1, 1).to(torch.float32).to(device)

y_test = test_labels.view(-1, 1).to(torch.float32).to(device)

y_validate = validate_labels.view(-1, 1).to(torch.float32).to(device)

In [30]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_validate.shape, y_validate.shape)

torch.Size([699232, 37]) torch.Size([699232, 1])
torch.Size([199863, 37]) torch.Size([199863, 1])
torch.Size([401540, 37]) torch.Size([401540, 1])


In [31]:
x_train.requires_grad_(False)
y_train.requires_grad_(False)

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]], device='cuda:0')

## Creating Neural Network

In [32]:
class NeuralNetwork(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, aux_input_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # First fully connected layer
        self.fc2 = nn.Linear(hidden_size, output_size)  # Second fully connected layer

        # layers for aux output
        self.fc1_aux = nn.Linear(aux_input_size, hidden_size)
        self.fc2_aux = nn.Linear(hidden_size, output_size)
        
        
    def forward(self, x, aux):
        x = torch.relu(self.fc1(x))  # Apply ReLU activation after the first layer
        x = self.fc2(x)  # Output layer

        aux = torch.relu(self.fc1_aux((aux)))
        aux = self.fc2_aux(aux)

        return x, aux

In [44]:
# Network Parameters
input_size = 37 #40  # Number of input features (tweet embeddings + metadata)
hidden_size = 128  # Number of hidden units in the first fully connected layer
output_size = 1  # Output size (binary classification: 1 output)
batch_size = 64  # Batch size for training
aux_input_size = 32 # Number of input features for aux output

In [45]:
# Creating dataloader
dataset = TensorDataset(x_train, aux_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [46]:
# Initialize mode, loss function, and optimizer
model = NeuralNetwork(input_size, hidden_size, output_size, aux_input_size)
model.to(device)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.0001)  

In [None]:
# Training model
num_epochs = 15  # Number of epochs to train

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    for inputs, aux_inputs, labels in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs, aux_outputs = model(inputs, aux_inputs)
        
        # Compute individual loss
        loss = criterion(outputs, labels)
        aux_loss = criterion(aux_outputs, labels)

        # Total loss
        total_loss = 0.8* loss + 0.2*aux_loss
        
        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/15], Loss: 61.3112
Epoch [2/15], Loss: 36.1531
Epoch [3/15], Loss: 33.5535
Epoch [4/15], Loss: 32.9373
Epoch [5/15], Loss: 31.2158
Epoch [6/15], Loss: 32.0792
Epoch [7/15], Loss: 32.8778
Epoch [8/15], Loss: 32.8107
Epoch [9/15], Loss: 30.8312
Epoch [10/15], Loss: 28.7027
Epoch [11/15], Loss: 29.0252
Epoch [12/15], Loss: 25.7905
Epoch [13/15], Loss: 30.2366
Epoch [14/15], Loss: 26.7204
Epoch [15/15], Loss: 24.8851


## Metrics

In [48]:
# Set the model to evaluation mode 
model.eval()

# Make predictions
with torch.no_grad():  # Disable gradient calculation to save memory and computations
    predictions_main, predictions_aux = model(x_test, aux_test)
    
predictions_binary = (predictions_main > 0).float()
print("Predictions:", predictions_binary)

Predictions: tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]], device='cuda:0')


In [49]:
# Function to calculate TP, FP, TN, FN
def calculate_confusion_values(y_true, y_pred):
    """
    Calculate TP, FP, TN, FN for binary classification.
    
    Args:
        y_true (tensor): Ground truth labels.
        y_pred (tensor): Predicted labels.
        
    Returns:
        tuple: (TP, FP, TN, FN)
    """
    TP = ((y_true == 1) & (y_pred == 1)).sum().item()  # True Positive: True labels are 1, Predicted labels are 1
    FP = ((y_true == 0) & (y_pred == 1)).sum().item()  # False Positive: True labels are 0, Predicted labels are 1
    TN = ((y_true == 0) & (y_pred == 0)).sum().item()  # True Negative: True labels are 0, Predicted labels are 0
    FN = ((y_true == 1) & (y_pred == 0)).sum().item()  # False Negative: True labels are 1, Predicted labels are 0
    
    return TP, FP, TN, FN

In [40]:
TP, FP, TN, FN = calculate_confusion_values(y_test, predictions_binary)

In [54]:
# Accuracy
accuracy = (TP + TN) / (TP + FP + TN + FN) * 100.0
print("Accuracy:", accuracy)

# F1
f1_score = TP / (TP + 0.5 * (FP + FN)) * 100.0
print("F1-Score", f1_score)

# MCC
mcc_score = ((TP * TN) - (FP * FN)) / (((TP +FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5) * 100.0

print("MCC Score:", mcc_score)

Accuracy: 82.9513216553339
F1-Score 84.039084896292
MCC Score: 65.9867112059793


In [52]:
print(len(predictions_main.unique()))

194148


In [55]:
print(TP, FP, TN, FN)

89705 20846 76084 13228
