In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [27]:
from datasets import load_dataset
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from torch.utils.data import random_split
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
label_columns = ['related', 'PII', 'request', 'offer', 'aid_related', 'medical_help',
                 'medical_products', 'search_and_rescue', 'security', 'military',
                 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
                 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related',
                  'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops',
                 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm',
                 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']
num_labels = len(label_columns)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_for_bert(data):
    # Tokenize the messages and prepare the labels
    input_ids = []
    attention_masks = []
    labels = []

    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i]['message'],
            add_special_tokens=True,
            max_length=128,  # Adjust as needed
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append([data[i][label] for label in label_columns])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float)

    return input_ids, attention_masks, labels

# Apply preprocessing
train_input_ids, train_attention_masks, train_labels = preprocess_for_bert(dataset['train'])
val_input_ids, val_attention_masks, val_labels = preprocess_for_bert(dataset['validation'])
test_input_ids, test_attention_masks, test_labels = preprocess_for_bert(dataset['test'])

In [31]:
from torch.utils.data import TensorDataset

batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
validation_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
validation_sampler = torch.utils.data.SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [20]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [46]:
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 4  # Adjust as needed
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
loss_fn = torch.nn.BCEWithLogitsLoss()



In [36]:
def multilabel_accuracy(preds, labels, threshold=0.5):
    preds = torch.sigmoid(preds)
    preds = (preds > threshold).float()
    correct = (preds == labels).float()
    acc = correct.sum() / correct.numel()
    return acc
    accuracy = multilabel_accuracy(outputs.logits, batch_labels)

In [50]:
def validate(model, dataloader, loss_fn, device):
    model.eval()
    total_loss, total_accuracy = 0, 0

    for batch in dataloader:
        inputs, attention_masks, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=inputs, attention_mask=attention_masks)
            loss = loss_fn(outputs.logits, labels)
            acc = multilabel_accuracy(outputs.logits, labels)

        total_loss += loss.item()
        total_accuracy += acc.item()

    avg_loss = total_loss / len(dataloader)
    avg_acc = total_accuracy / len(dataloader)
    return avg_loss, avg_acc





Validation Loss: 0.13183386660880925, Validation Accuracy: 0.9556428667921457


In [52]:
for epoch in range(epochs):
    model.train()
    total_loss, total_accuracy = 0, 0

    for batch in train_dataloader:
        inputs, attention_masks, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()

        outputs = model(input_ids=inputs, attention_mask=attention_masks)


        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        acc = multilabel_accuracy(outputs.logits, labels)

        total_loss += loss.item()
        total_accuracy += acc.item()


    avg_train_loss = total_loss / len(train_loader)
    avg_train_acc = total_accuracy / len(train_loader)


    print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_train_loss:.4f}, Accuracy: {avg_train_acc:.4f}")


    val_loss, val_accuracy = validate(model, validation_dataloader, loss_fn, device)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')



Epoch 1/4 - Loss: 0.0587, Accuracy: 0.9778
Validation Loss: 0.1622785369616858, Validation Accuracy: 0.9509352490028239
Epoch 2/4 - Loss: 0.0470, Accuracy: 0.9813
Validation Loss: 0.17869779267796657, Validation Accuracy: 0.9503000919122874
Epoch 3/4 - Loss: 0.0375, Accuracy: 0.9842
Validation Loss: 0.18305416822633042, Validation Accuracy: 0.9510393616575632
Epoch 4/4 - Loss: 0.0290, Accuracy: 0.9866
Validation Loss: 0.1906284407267104, Validation Accuracy: 0.9527156367805434


In [54]:
def test(model, dataloader, loss_fn, device):
    model.eval()
    total_loss, total_accuracy = 0, 0

    for batch in dataloader:
        inputs, attention_masks, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=inputs, attention_mask=attention_masks)
            loss = loss_fn(outputs.logits, labels)
            acc = multilabel_accuracy(outputs.logits, labels)

        total_loss += loss.item()
        total_accuracy += acc.item()

    avg_loss = total_loss / len(dataloader)
    avg_acc = total_accuracy / len(dataloader)
    return avg_loss, avg_acc

# Example usage after completing all training epochs
test_loss, test_accuracy = test(model, test_dataloader, loss_fn, device)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


Test Loss: 0.15467721171902887, Test Accuracy: 0.9618018262314074
