In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaModel, RobertaTokenizer
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Custom Attention Layer in PyTorch
class CustomAttention(nn.Module):
    """Attention pooling that collapses the sequence axis into one vector.

    A score is computed for every position, passed through ReLU, normalised
    with a softmax over the sequence axis (dim=1), and used as weights for a
    weighted sum of the inputs.

    Args:
        input_dim: Size of the last (feature) dimension of the input.
    """

    def __init__(self, input_dim):
        super(CustomAttention, self).__init__()
        self.W = nn.Parameter(torch.randn(input_dim, 1))
        # NOTE(review): b has shape (input_dim,), so adding it to the
        # (batch, seq, 1) scores broadcasts them to (batch, seq, input_dim).
        # Kept as-is so existing state dicts remain loadable -- confirm whether
        # a single scalar bias was intended.
        self.b = nn.Parameter(torch.zeros(input_dim))

    def forward(self, x, mask=None):
        """Pool `x` over dim=1 using learned attention weights.

        Args:
            x: Tensor of shape (batch, seq, input_dim).
            mask: Optional (batch, seq) tensor; non-zero / True marks positions
                that may receive attention. Masked-out positions (e.g. padding)
                get zero weight. When omitted, every position participates.

        Returns:
            Tensor of shape (batch, input_dim).
        """
        e = F.relu(torch.matmul(x, self.W) + self.b)
        if mask is not None:
            # Push masked scores to the most negative finite value so their
            # softmax weight underflows to 0. A finite fill (rather than -inf)
            # avoids NaNs if a row happens to be fully masked.
            keep = mask.to(device=e.device, dtype=torch.bool).unsqueeze(-1)
            e = e.masked_fill(~keep, torch.finfo(e.dtype).min)
        a = torch.softmax(e, dim=1)
        output = torch.sum(x * a, dim=1)
        return output

# Define the PyTorch Model
class MyModel(nn.Module):
    """RoBERTa encoder with a CNN branch and an attention-pooling branch.

    The token-level RoBERTa output feeds two parallel branches:
      * a 1-D convolution (768 -> 64 channels) followed by a global max-pool;
      * `CustomAttention` pooling over the sequence.
    The two pooled vectors are concatenated (64 + 768) and passed through two
    linear layers ending in a sigmoid, giving one probability per example.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.cnn = nn.Conv1d(in_channels=768, out_channels=64, kernel_size=3, padding=1)
        self.attention = CustomAttention(input_dim=768)
        self.fc1 = nn.Linear(768 + 64, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per example, shape (batch,).

        Args:
            input_ids: Token ids passed to RoBERTa.
            attention_mask: Mask passed to RoBERTa and to the attention
                pooling so padded positions do not contribute to the pooled
                representation.
        """
        # [0] selects the first element of the RoBERTa output (token-level states).
        roberta_output = self.roberta(input_ids, attention_mask=attention_mask)[0]
        # Conv1d expects channels on dim=1, hence the transpose.
        cnn_output = self.cnn(roberta_output.transpose(1, 2))
        # Global max-pool: the kernel spans the whole sequence axis.
        cnn_output = F.max_pool1d(cnn_output, kernel_size=cnn_output.shape[2]).squeeze(2)
        # Pass the padding mask so <pad> tokens get zero attention weight.
        attention_output = self.attention(roberta_output, mask=attention_mask)
        combined = torch.cat((cnn_output, attention_output), 1)
        x = F.relu(self.fc1(combined))
        x = torch.sigmoid(self.fc2(x)).squeeze(1)
        return x

In [None]:
# Data Preparation
class TweetDataset(Dataset):
    """Dataset that tokenizes a single tweet on each lookup.

    Args:
        tweets: Indexable collection of tweet texts.
        labels: Indexable collection of labels aligned with `tweets`.
        tokenizer: Object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded tweet is padded / truncated to.
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets, self.labels = tweets, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the text, token ids, attention mask and float label for `item`."""
        text = str(self.tweets[item])
        target = self.labels[item]

        # Tokenizer settings: fixed-length output returned as PyTorch tensors.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'tweet_text': text}
        # Flatten the encoded tensors to 1-D before returning them.
        sample['input_ids'] = torch.flatten(encoded['input_ids'])
        sample['attention_mask'] = torch.flatten(encoded['attention_mask'])
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample




In [None]:
# Function to preprocess text data
def preprocess_text(text):
    """Return `text` unchanged; placeholder hook for tweet cleaning.

    Applied to every tweet before tokenization. It is currently the identity
    function, so any cleaning logic added here takes effect for both the
    training and the test split.

    Args:
        text: Raw tweet text.

    Returns:
        The (currently unmodified) text.
    """
    # Implement text cleaning here (e.g., removing URLs, non-alphanumeric characters, etc.)
    return text

# Load and preprocess data
df_train = pd.read_csv(".../combined_tweets_train_data.csv")
df_test = pd.read_csv(".../combined_tweets_test_data.csv")

# The source column names carry a leading space (' Tweet Text', ' Informativeness').
df_train['tweet'] = df_train[' Tweet Text'].apply(preprocess_text)
df_test['tweet'] = df_test[' Tweet Text'].apply(preprocess_text)

# Encode labels as binary
encoded_dict = {"Urgent": 0, "Not Urgent": 1}
df_train['event'] = df_train[' Informativeness'].map(encoded_dict)
df_test['event'] = df_test[' Informativeness'].map(encoded_dict)


def _assert_labels_mapped(df, split_name):
    """Raise if any label in `df` was not covered by `encoded_dict`.

    `Series.map` silently produces NaN for values missing from the mapping;
    those NaNs would otherwise reach the loss function as NaN targets.
    """
    unmapped = df.loc[df['event'].isna(), ' Informativeness']
    if not unmapped.empty:
        raise ValueError(
            f"{len(unmapped)} rows in the {split_name} split have labels outside "
            f"{list(encoded_dict)}: {unmapped.unique()[:5].tolist()}"
        )


_assert_labels_mapped(df_train, "train")
_assert_labels_mapped(df_test, "test")

# Tokenization / batching settings shared by both splits.
MAX_LEN = 100
BATCH_SIZE = 32

# Create datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset = TweetDataset(
    tweets=df_train.tweet.to_numpy(),
    labels=df_train.event.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_dataset = TweetDataset(
    tweets=df_test.tweet.to_numpy(),
    labels=df_test.event.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# DataLoader
# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)




In [None]:
# Seed PyTorch's global RNG before any random initialisation so that the
# randomly initialised layers and the DataLoader shuffle order are repeatable
# across runs. (Some GPU kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Model Initialization and transfer to device
model = MyModel().to(device)

# Optimizer and Loss Function
# BCELoss expects probabilities, which matches the sigmoid at the end of
# MyModel.forward.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.BCELoss()

Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score

num_epochs = 10
for epoch in range(num_epochs):
    model.train()

    # Running totals, reset at the start of every epoch.
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        # Move the tensors needed for this step onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: probabilities and the loss against the labels.
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch summary.
        total_loss += loss.item()
        predictions = outputs.gt(0.5).float()  # threshold probabilities at 0.5
        correct_predictions += predictions.eq(labels).sum().item()
        total_predictions += labels.shape[0]

    # Loss is averaged over batches; accuracy over individual examples.
    avg_loss = total_loss / len(train_loader)
    avg_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}")

Epoch 1/10 - Loss: 0.3819, Accuracy: 0.8261
Epoch 2/10 - Loss: 0.2902, Accuracy: 0.8776
Epoch 3/10 - Loss: 0.2424, Accuracy: 0.8996
Epoch 4/10 - Loss: 0.1935, Accuracy: 0.9220
Epoch 5/10 - Loss: 0.1535, Accuracy: 0.9385
Epoch 6/10 - Loss: 0.1236, Accuracy: 0.9530
Epoch 7/10 - Loss: 0.1004, Accuracy: 0.9618
Epoch 8/10 - Loss: 0.0861, Accuracy: 0.9675
Epoch 9/10 - Loss: 0.0737, Accuracy: 0.9714
Epoch 10/10 - Loss: 0.0663, Accuracy: 0.9751


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import torch

# Switch to evaluation mode before scoring the test set.
model.eval()

predictions = []   # predicted probabilities, one per test example
true_labels = []   # matching ground-truth labels

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)

        # Bring results back to the CPU as plain Python floats.
        predictions += outputs.cpu().tolist()
        true_labels += labels.cpu().tolist()

# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
binary_predictions = [int(p > 0.5) for p in predictions]



In [None]:
# Assuming predictions are single probability values for the positive class
# (label 1, which the label encoding above assigns to "Not Urgent").
auc_roc = roc_auc_score(true_labels, predictions)
print(f'AUC-ROC: {auc_roc}')

# Classification report and confusion matrix
print(classification_report(true_labels, binary_predictions))

# Rows are true labels, columns are predicted labels; the order follows the
# 0 / 1 label encoding.
class_names = ['Urgent', 'Not Urgent']
cm = confusion_matrix(true_labels, binary_predictions, labels=[0, 1])

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test set)')
plt.show()




AUC-ROC: 0.9340515918312533
              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89      3339
         1.0       0.88      0.76      0.82      2248

    accuracy                           0.86      5587
   macro avg       0.87      0.85      0.85      5587
weighted avg       0.86      0.86      0.86      5587

AUC-ROC: 0.9340515918312533
