# Imports

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np

# Loading Data (Test, Train, Validate)

In [2]:

# User Metadata
train_metadata = torch.load("../Data/Processed_Data/train_metadata_tensor.pth")
test_metadata = torch.load("../Data/Processed_Data/test_metadata_tensor.pth")
validate_metadata = torch.load("../Data/Processed_Data/validate_metadata_tensor.pth")

# Tweets Data
train_tweets = torch.load("../Data/Processed_Data/train_tweet_emb_tensor.pth")
test_tweets = torch.load("../Data/Processed_Data/test_tweet_emb_tensor.pth")
validate_tweets = torch.load("../Data/Processed_Data/validate_tweet_emb_tensor.pth")

# Labels
train_labels = torch.load("../Data/Processed_Data/train_label_tensor.pth")
test_labels = torch.load("../Data/Processed_Data/test_label_tensor.pth")
validate_labels = torch.load("../Data/Processed_Data/validate_label_tensor.pth")

In [3]:
print(train_tweets.shape)
print(test_tweets.shape)
print(validate_tweets.shape)

torch.Size([1778865, 200])
torch.Size([251066, 200])
torch.Size([512329, 200])


In [113]:
# Using only 10% of training for time constraints
num_samples = int(train_tweets.shape[0] * 0.3)

random_indices = torch.randperm(train_tweets.shape[0])[:num_samples]

train_tweets = train_tweets[random_indices]
train_metadata = train_metadata[random_indices]
train_labels = train_labels[random_indices]

print(train_tweets.shape)

torch.Size([533659, 200])


# Passing tweets through LSTM

In [114]:
# LSTM Model
class LSTMModel(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)
        
        # Define a fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        # Pass the input through the LSTM layer
        lstm_out, (h_n, c_n) = self.lstm(x)
        
        # Get the output from the last time step
        output = self.fc(lstm_out[:, -1, :])  # Only take the last output for prediction
        return output

In [115]:
# LSTM Model Parameters

input_size = 200 # Number of features in input data(200 for 200D embeddings)
hidden_size = 50
output_size = 32
number_layers = 1 # Number of LSTM Layers

# Creating LSTM Model

lstm_model = LSTMModel(input_size = input_size, hidden_size = hidden_size, output_size = output_size, 
                      num_layers = number_layers)

In [116]:
train_input_data = train_tweets.unsqueeze(1)
train_input_data = train_input_data.to(torch.float32)

test_input_data = test_tweets.unsqueeze(1)
test_input_data = test_input_data.to(torch.float32)

validate_input_data = validate_tweets.unsqueeze(1)
validate_input_data = validate_input_data.to(torch.float32)

# Getting lstm_output for datasets
train_lstm_output = lstm_model(train_input_data)
train_lstm_output = train_lstm_output.detach()

test_lstm_output = lstm_model(test_input_data)
test_lstm_output = test_lstm_output.detach()

validate_lstm_output = lstm_model(validate_input_data)
validate_lstm_output = validate_lstm_output.detach()

In [117]:
print(train_lstm_output.shape)

torch.Size([533659, 32])


# Concatenating with aux input (user metadata)

In [118]:
x_train = torch.hstack((train_lstm_output, train_metadata)).to(torch.float32)

x_test = torch.hstack((test_lstm_output, test_metadata)).to(torch.float32)

x_validate = torch.hstack((validate_lstm_output, validate_metadata)).to(torch.float32)

In [119]:
y_train = train_labels.view(-1, 1).to(torch.float32)

y_test = test_labels.view(-1, 1).to(torch.float32)

y_validate = validate_labels.view(-1, 1).to(torch.float32)

In [120]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_validate.shape, y_validate.shape)

torch.Size([533659, 40]) torch.Size([533659, 1])
torch.Size([251066, 40]) torch.Size([251066, 1])
torch.Size([512329, 40]) torch.Size([512329, 1])


In [121]:
x_train.requires_grad_(False)
y_train.requires_grad_(False)

tensor([[1.],
        [1.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]])

# Creating Neural Network

In [122]:
class NeuralNetwork(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # First fully connected layer
        self.fc2 = nn.Linear(hidden_size, output_size)  # Second fully connected layer
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))  # Apply ReLU activation after the first layer
        x = self.fc2(x)  # Output layer
        return x

In [123]:
# Network Parameters

input_size = 40  # Number of input features (tweet embeddings + metadata)
hidden_size = 50  # Number of hidden units in the first fully connected layer
output_size = 1  # Output size (binary classification: 1 output)
batch_size = 64  # Batch size for training

In [124]:
# Creating dataloader

dataset = TensorDataset(x_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [133]:
# Initialize mode, loss function, and optimizer
model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.001)  

In [134]:
# Step 5: Training loop
num_epochs = 10  # Number of epochs to train

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    for inputs, labels in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/10], Loss: 333.7279
Epoch [2/10], Loss: 195.5378
Epoch [3/10], Loss: 93.2539
Epoch [4/10], Loss: 17.6485
Epoch [5/10], Loss: 0.6456
Epoch [6/10], Loss: 0.7144
Epoch [7/10], Loss: 0.6380
Epoch [8/10], Loss: 0.7227
Epoch [9/10], Loss: 0.6481
Epoch [10/10], Loss: 0.7950


# Metrics

In [135]:
# Set the model to evaluation mode 
model.eval()

# Make predictions
with torch.no_grad():  # Disable gradient calculation to save memory and computations
    predictions = model(x_test)
    
predictions_binary = (predictions > 0).float()
print("Predictions:", predictions_binary)

Predictions: tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])


In [136]:
# Function to calculate TP, FP, TN, FN
def calculate_confusion_values(y_true, y_pred):
    """
    Calculate TP, FP, TN, FN for binary classification.
    
    Args:
        y_true (tensor): Ground truth labels.
        y_pred (tensor): Predicted labels.
        
    Returns:
        tuple: (TP, FP, TN, FN)
    """
    TP = ((y_true == 1) & (y_pred == 1)).sum().item()  # True Positive: True labels are 1, Predicted labels are 1
    FP = ((y_true == 0) & (y_pred == 1)).sum().item()  # False Positive: True labels are 0, Predicted labels are 1
    TN = ((y_true == 0) & (y_pred == 0)).sum().item()  # True Negative: True labels are 0, Predicted labels are 0
    FN = ((y_true == 1) & (y_pred == 0)).sum().item()  # False Negative: True labels are 1, Predicted labels are 0
    
    return TP, FP, TN, FN

In [137]:
TP, FP, TN, FN = calculate_confusion_values(y_test, predictions_binary)

In [138]:
# accuracy
accuracy = (TP + TN) / (TP + FP + TN + FN) * 100.0
print("Accuracy:", accuracy)

# F1
f1_score = TP / (TP + 0.5 * (FP + FN)) * 100.0
print("F1-Score", f1_score)

# MCC
mcc_score = ((TP * TN) - (FP * FN)) / (((TP +FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5) * 100.0

print("MCC Score:", mcc_score)

Accuracy: 77.0749524029538
F1-Score 73.03022777431552
MCC Score: 53.128694254098384


In [139]:
print(len(predictions.unique()))

118166


In [140]:
print(TP, FP, TN, FN)

77928 26950 115581 30607
