In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaModel, RobertaTokenizer
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Custom Attention Layer in PyTorch
class CustomAttention(nn.Module):
    """Attention pooling along dim 1 of the input.

    Each position is scored with a learned projection (``W``) plus bias
    (``b``), passed through ReLU, normalised with a softmax over dim 1 and
    used to take a weighted sum of the input along that dimension.

    Args:
        input_dim: size of the last dimension of the tensors given to
            ``forward``.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.W = nn.Parameter(torch.randn(input_dim, 1))
        # NOTE(review): the bias has ``input_dim`` entries, so adding it to the
        # (..., 1) projection broadcasts the scores to one column per feature.
        # Left unchanged so parameter shapes stay the same — confirm intended.
        self.b = nn.Parameter(torch.zeros(input_dim))

    def forward(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over dim 1.

        Args:
            x: input tensor; as used by ``MyModel`` this is
                (batch, seq_len, input_dim).
            mask: optional tensor of shape ``x.shape[:2]``; positions where it
                is 0/False are excluded from the softmax. ``None`` (default)
                attends over every position.

        Returns:
            Tensor with dim 1 of ``x`` reduced away.
        """
        e = F.relu(torch.matmul(x, self.W) + self.b)
        if mask is not None:
            keep = mask.to(dtype=torch.bool).unsqueeze(-1)
            # Use the dtype's most negative finite value rather than -inf so a
            # fully masked row yields uniform weights instead of NaN.
            e = e.masked_fill(~keep, torch.finfo(e.dtype).min)
        a = torch.softmax(e, dim=1)
        return torch.sum(x * a, dim=1)


# Define the PyTorch Model
class MyModel(nn.Module):
    """RoBERTa encoder with a CNN branch and an attention-pooling branch.

    The token-level encoder output feeds (a) a Conv1d followed by a global
    max-pool and (b) ``CustomAttention``. Both pooled vectors are concatenated
    and passed through two linear layers that emit one raw logit per class.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Read the encoder width from its config instead of hard-coding it.
        hidden_size = self.roberta.config.hidden_size
        cnn_channels = 64
        self.cnn = nn.Conv1d(in_channels=hidden_size, out_channels=cnn_channels,
                             kernel_size=3, padding=1)
        self.attention = CustomAttention(input_dim=hidden_size)
        self.fc1 = nn.Linear(hidden_size + cnn_channels, 256)
        self.fc2 = nn.Linear(256, num_classes)  # Change for multiclass

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of tokenised inputs.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: mask forwarded to the encoder; also used to keep
                padded positions out of the attention pooling.

        Returns:
            Raw logits (no softmax/sigmoid applied), one row per input.
        """
        # First element of the encoder output: the token-level hidden states.
        roberta_output = self.roberta(input_ids, attention_mask=attention_mask)[0]

        # Conv1d expects channels before the sequence axis, hence the transpose.
        # NOTE(review): this max-pool still spans padded positions — confirm
        # whether they should be excluded here as well.
        cnn_output = self.cnn(roberta_output.transpose(1, 2))
        cnn_output = F.max_pool1d(cnn_output, kernel_size=cnn_output.shape[2]).squeeze(2)

        # Padding must not receive attention weight, so pass the mask along.
        attention_output = self.attention(roberta_output, mask=attention_mask)

        combined = torch.cat((cnn_output, attention_output), 1)
        x = F.relu(self.fc1(combined))
        x = self.fc2(x)  # No sigmoid activation
        return x

# Data Preparation
class TweetDataset(Dataset):
    """Map-style dataset that tokenises one tweet per item.

    Args:
        tweets: indexable collection of tweets (each item is passed through
            ``str`` before tokenisation).
        labels: indexable collection of integer class ids, aligned with
            ``tweets``.
        tokenizer: callable tokenizer; it is invoked with Hugging Face style
            keyword arguments and must return ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: length every sequence is padded/truncated to.

    Raises:
        ValueError: if ``tweets`` and ``labels`` differ in length.
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        if len(tweets) != len(labels):
            raise ValueError(
                f"tweets and labels must have the same length "
                f"(got {len(tweets)} and {len(labels)})"
            )
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label for ``item``."""
        tweet = str(self.tweets[item])
        label = self.labels[item]
        # Call the tokenizer directly; ``encode_plus`` is deprecated in favour
        # of ``__call__`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'tweet_text': tweet,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)  # Change for multiclass
        }



In [None]:
# Function to preprocess text data
def preprocess_text(text):
    """Placeholder cleaning hook: currently hands ``text`` back untouched.

    Args:
        text: the value to clean.

    Returns:
        The very same object that was passed in.
    """
    # TODO: implement text cleaning here (e.g., removing URLs,
    # non-alphanumeric characters, etc.)
    cleaned = text
    return cleaned

# Read both CSV splits, then run every tweet through preprocess_text.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (".../train2.csv", ".../test2.csv")
)

# preprocess_text is applied element-wise to the 'tweet' column of each split.
df_train = df_train.assign(tweet=df_train['tweet'].apply(preprocess_text))
df_test = df_test.assign(tweet=df_test['tweet'].apply(preprocess_text))

# Encode labels for multiclass.
# Every event name needs its own integer id, numbered 0..N-1 without gaps,
# because the model emits len(encoded_dict) logits.
encoded_dict = {
    "admiration": 0,
    "appreciation": 1,
    "business": 2,
    "casualty": 3,
    "climate and environmental issues": 4,
    "communication": 5,
    "damage": 6,
    "die": 7,
    "disaster preparedness": 8,
    "education": 9,
    "empathy": 10,
    "health": 11,
    "humanitarian assistance": 12,
    "immigration": 13,
    "information dissemination": 14,
    "inquiry": 15,
    "life": 16,
    "memories": 17,
    "news": 18,  # was 17, which silently merged "news" into "memories"
    "others": 19,
    "personal matters": 20,
    "politics": 21,
    "resources": 22,
    "safety": 23,
    "sport": 24,
    "spiritual": 25,
    "transportation": 26,
    "travel": 27,
    "warning": 28
}

# Fail fast if the manual numbering ever gets a duplicate or a gap again.
if sorted(encoded_dict.values()) != list(range(len(encoded_dict))):
    raise ValueError("encoded_dict ids must be unique and contiguous starting at 0")


def _encode_events(frame, split_name):
    """Return the ``event`` column of ``frame`` as integer class ids.

    Args:
        frame: DataFrame with a string ``event`` column.
        split_name: label used in the error message (e.g. "train").

    Raises:
        ValueError: if any value of ``event`` is not a key of ``encoded_dict``
            (``Series.map`` would otherwise turn it into NaN silently).
    """
    encoded = frame['event'].map(encoded_dict)
    unmapped = frame.loc[encoded.isna(), 'event']
    if not unmapped.empty:
        examples = unmapped.unique()[:10].tolist()
        raise ValueError(
            f"{len(unmapped)} {split_name} rows have an 'event' value missing from "
            f"encoded_dict (was this cell already run?): {examples}"
        )
    return encoded.astype('int64')


df_train['event'] = _encode_events(df_train, "train")
df_test['event'] = _encode_events(df_test, "test")

# Create datasets
# Tokenisation and batching settings shared by the train and test splits;
# defined once so the two splits cannot drift apart.
MAX_LEN = 100
BATCH_SIZE = 32

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset = TweetDataset(
    tweets=df_train['tweet'].to_numpy(),
    labels=df_train['event'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_dataset = TweetDataset(
    tweets=df_test['tweet'].to_numpy(),
    labels=df_test['event'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# DataLoader: only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Build the classifier with one logit per entry of encoded_dict and move it
# onto the selected device.
num_classes = len(encoded_dict)
model = MyModel(num_classes)
model = model.to(device)

# Adam over all model parameters, trained against cross-entropy on raw logits.
loss_fn = nn.CrossEntropyLoss()  # Change for multiclass
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)



Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop: one full pass over train_loader per epoch, reporting the mean
# batch loss and the running training accuracy at the end of each epoch.
num_epochs = 10  # Set the number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss, correct_predictions, total_examples = 0, 0, 0

    for batch in train_loader:
        # Move the tensors of this batch onto the model's device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # The predicted class is the index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        correct_predictions += (predicted == labels).sum().item()
        total_examples += labels.shape[0]

    # Loss is averaged over batches, accuracy over individual examples.
    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_examples
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')


Epoch 1/10, Loss: 1.0858, Accuracy: 0.7121
Epoch 2/10, Loss: 0.4206, Accuracy: 0.8748
Epoch 3/10, Loss: 0.2788, Accuracy: 0.9146
Epoch 4/10, Loss: 0.2032, Accuracy: 0.9371
Epoch 5/10, Loss: 0.1569, Accuracy: 0.9523
Epoch 6/10, Loss: 0.1231, Accuracy: 0.9613
Epoch 7/10, Loss: 0.1043, Accuracy: 0.9672
Epoch 8/10, Loss: 0.0849, Accuracy: 0.9725
Epoch 9/10, Loss: 0.0843, Accuracy: 0.9725
Epoch 10/10, Loss: 0.0725, Accuracy: 0.9758


In [None]:
# Evaluation: collect arg-max predictions and gold labels over test_loader
# with gradients disabled and the model in eval mode.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        # Index of the largest logit per row is the predicted class id.
        predicted = outputs.argmax(dim=1)
        predictions += predicted.cpu().tolist()
        true_labels += labels.cpu().tolist()

# Per-class precision / recall / F1 for the test split.
print(classification_report(true_labels, predictions))



              precision    recall  f1-score   support

           0       0.99      1.00      1.00       137
           1       0.96      0.95      0.95       159
           2       1.00      0.99      0.99       302
           3       0.95      0.97      0.96       303
           4       0.99      1.00      0.99        97
           5       0.83      0.74      0.78        88
           6       0.87      0.81      0.84       411
           7       0.88      0.82      0.85       131
           8       0.85      0.78      0.81       447
           9       1.00      1.00      1.00       154
          10       0.85      0.74      0.79      1030
          11       0.99      1.00      1.00       145
          12       0.82      0.89      0.85       422
          13       0.97      0.98      0.98       124
          14       0.87      0.91      0.89      2110
          15       0.95      0.78      0.86        80
          16       1.00      1.00      1.00       106
          17       0.90    