# Setup

In [122]:
import os
import pandas as pd
import torch

## Load Dataset

- `data/crisisbench/all_data_en`: all combined english dataset used for the experiments
    - `crisis_consolidated_humanitarian_filtered_lang_en_dev.tsv`
    - `crisis_consolidated_humanitarian_filtered_lang_en_test.tsv`
    - `crisis_consolidated_humanitarian_filtered_lang_en_train.tsv`


In [123]:
df = {}

df["train"] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_train.tsv", sep="\t")
print(f"df_train: N = {len(df['train'])}")
df['dev'] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_dev.tsv", sep="\t")
print(f"df_dev: N = {len(df['dev'])}")
df['test'] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_test.tsv", sep="\t")
print(f"df_test: N = {len(df['test'])}")

df_train: N = 61164
df_dev: N = 8935
df_test: N = 17356


In [124]:
train_class_label = set(df["train"]["class_label"])
dev_class_label = set(df['dev']["class_label"])
test_class_label = set(df['test']["class_label"])

assert len(train_class_label) == len(dev_class_label) and  len(train_class_label) == len(test_class_label) 

train_class_label

{'affected_individual',
 'caution_and_advice',
 'displaced_and_evacuations',
 'donation_and_volunteering',
 'infrastructure_and_utilities_damage',
 'injured_or_dead_people',
 'missing_and_found_people',
 'not_humanitarian',
 'requests_or_needs',
 'response_efforts',
 'sympathy_and_support'}

## Target Labels

### Time-critical

- 'affected_individual'
  -  CrisisLexT26 (Affected individuals): Deaths, injuries, missing, found, or displaced `people`, and/or personal updates.
  - examples 
    - Although one person confirmed  dead by police, BBC understands death toll at least three. #clutha #helicoptercrash htt…
    - 4WABC-TV: FDNY confirms that there are fatalities in Metro North derailment. Other news outlets reporting 4 deaths.

- 'caution_and_advice'
  - CrisisLexT26 (Caution and advice) : If a message conveys/reports information about some `warning` or a piece of `advice` about a possible hazard of an incident.
  - examples
    - Be informed always. . . #RubyPH http://t.co/u1x521x0Is
    - RT @ChileanProbs: 8.3 earthquake in the north of Chile! Tsunami alert up north, Peru and Ecuador!
    - @JimFreund: Apparently we have no exclusivity.  The tornado watch is for all SE NY.  http://1.usa.gov/mSPGdf	en	1	caution_and_advice
    - Japan issues tsunami alert after Chile quake, expecting no damage: Japan has issued a tsunami alert following ... http://t.co/GerjHpPaNN

- 'displaced_and_evacuations'
    - People who have relocated due to the crisis, even for a short time (includes evacuations)
    - examples
      - RT @rociolewis: @TheEllenShow Chile has gone through a recent earthquake and now a fire, thousands are homeless. Please spread the word foräó_
      - RT @AnasMallick: More than 5 dozen #Earthquake victims, mostly women and children, shifted to #Karachi from #Awaran.
      - Hurricane Odile hits Baja California - Click2Houston

- 'infrastructure_and_utilities_damage'
  - Houses, buildings, roads damaged or utilities such as water, electricity, interrupted
  - Buildings or roads damaged or operational; utilities/services interrupted or restored
  - Reports of damaged buildings, roads, bridges, or utilities/services interrupted or restored.

- 'injured_or_dead_people'
  - Reports of casualties and/or injured `people` due to the crisis.
  - Injured and dead
  - If a message reports the information about `casualties` or damage done by an incident.

- 'missing_and_found_people'
  - `Missing`, trapped, or found people—Questions and/or reports about missing or found people.
  - People `missing` or found.
  - If a message reports about the missing or found person effected by an incident or seen a celebrity visit on ground zero

### Support and Relief

- 'requests_or_needs'
  - Needs of those affecte
  - Something (e.g. food, water, shelter) or someone (e.g. volunteers, doctors) is needed
  - examples
    - These have warned that diphtheria, cholera and malaria could spread in an epidemic of "apocalyptic proportions" if medical, food, water and other types of aid are not allowed in, along with trained personnel to administer the support.

- 'donation_and_volunteering'
  - Reports of urgent needs or donations of shelter and/or supplies such as food, water, clothing, money, medical supplies or blood; and volunteering services
  - Needs, requests, or offers of money, blood, shelter, supplies, and/or services by volunteers or professionals.
  - Donations of money
  - If a message speaks about money raised, donation offers, goods/services offered or asked by the victims of an incident.
  - Donations of supplies and/or volunteer work
  - Money requested, donated or spent
  - Needs or donations of shelter and/or supplies such as food, water, clothing, medical supplies or blood
  - Services needed or offered by volunteers or professionals
  - examples
    - "You know me : I don't like giving away money. But Nepal needs our help. Donate to @decappeal today"

- 'response_efforts'
  - Affected populations receiving food, water, shelter, medication, etc. from humanitarian/emergency response organizations
  - All info about responders. Affected populations receiving food, water, shelter, medication, etc. from humanitarian/emergency response organizations.

### Non-informative

- 'not_humanitarian'
  - Not applicable
  -  Not related to this crisis
  - Refers to the crisis, but does not contain useful information that helps you understand the situation; 2. Not related to the Typhoon, or not relevant for emergency/humanitarian response; 3. Related to the crisis, but not informative: if it refers to the crisis, but does not contain useful information that helps understand the situation.
  - examples
    - Had a long night. Time to sleep and rest for a while. I survived #RubyPH!	
    - #Baltimore is on fire and #Nepal death toll is rising....yet I still don't think people are paying attention
    - A subtle pressure in the Force drew Jacen's attention to his aide, Orlopp. He turned to find the Jenet just looking up f
    - IAF Planes Bring Back 546 Indians From Quake-hit Nepal | The New Indian Express http://t.co/8BPG5NCT2W | http://t.co/69mLhfefhr #AllTheNews
    - HERO ALERT! please share á¼¼Dá½Š8âœ¨ https://t.co/UED0PojAPx #motorcycle https://t.co/6saBdgri4c	en	NA	not_humanitarian

- 'sympathy_and_support'
  - To hear about the state of Sardinia where I spent the majority of my summers, is extremely saddening. Hope they can get through it.#sardinia

In [125]:
# Informative needed for handling urgent incidents 
time_critical = ['affected_individual', 'caution_and_advice', 'displaced_and_evacuations', 'infrastructure_and_utilities_damage', 'injured_or_dead_people', 'missing_and_found_people']

# Helping the survivor
support_and_relief = ['requests_or_needs', 'donation_and_volunteering', 'response_efforts']
 
# Not solving the problem
non_informative = ['not_humanitarian', 'sympathy_and_support']

In [126]:
mapping = {}
for x in time_critical:
    mapping[x] = 'time_critical'
for x in support_and_relief:
    mapping[x] = 'support_and_relief'
for x in non_informative:
    mapping[x] = 'non_informative'

df["train"]['class_label_group'] = df["train"]['class_label'].map(mapping)
df["dev"]['class_label_group'] = df["dev"]['class_label'].map(mapping)
df["test"]['class_label_group'] = df["test"]['class_label'].map(mapping)

In [127]:
train_class_label = set(df["train"]["class_label_group"])
dev_class_label = set(df['dev']["class_label_group"])
test_class_label = set(df['test']["class_label_group"])

assert len(train_class_label) == len(dev_class_label) and  len(train_class_label) == len(test_class_label) 

train_class_label

{'non_informative', 'support_and_relief', 'time_critical'}

## Preprocessing


### Remove meaningless text

"Prior to the classification experiment, we preprocess tweets to remove symbols, emoticons, invisible and non-ASCII characters, punctuations (replaced with whitespace), numbers, URLs, and hashtag signs"

####  URL removal

"All URLs were removed from tweets, since the text of URL strings does not necessarily convey any relevant information, and can therefore be removed [39]."

- Roy, D.; Mitra, M.; Ganguly, D. To Clean or Not to Clean: Document Preprocessing and Reproducibility. J. Data Inf. Qual. (JDIQ)
2018, 10, 18.

In [128]:
print(df['train'].loc[1, 'text'])

God bless you... https://t.co/AnEy1ydkkz


In [129]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'http\S+', '', regex=True)

In [130]:
print(df['train'].loc[1, 'text'])

God bless you... 


#### Remove hashtag

In [131]:
print(df['train'].loc[4, 'text'])
print(df['train'].loc[5, 'text'])

Rescue effort expands in India, Pakistan as flood death toll tops 350   #india #asia
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert. #PrayForChiäó_


In [132]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'#\w+', '', regex=True).str.strip()

In [133]:
print(df['train'].loc[4, 'text'])
print(df['train'].loc[5, 'text'])

Rescue effort expands in India, Pakistan as flood death toll tops 350
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove username

In [134]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[3, 'text'])
print(df['train'].loc[5, 'text'])

RT @perreaux: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
I'm really just excited for new undies and pinkberry @mollymcnultzxo
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


In [135]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'@\w+', '', regex=True).str.strip()

In [136]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[3, 'text'])
print(df['train'].loc[5, 'text'])

RT : Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
I'm really just excited for new undies and pinkberry
RT : I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove RT

In [137]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[5, 'text'])

RT : Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
RT : I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


In [138]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'\bRT\b', '', regex=True).str.strip()

In [139]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[5, 'text'])

: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove symbols, emoticons, invisible and non-ASCII characters, punctuation

In [140]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[7, 'text'])
print(df['train'].loc[11, 'text'])
print(df['train'].loc[12, 'text'])
print(df['train'].loc[13, 'text'])
print(df['train'].loc[14, 'text'])
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])
print(df['train'].loc[18, 'text'])

: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
It���s a good thing that the government have done everything to avert any lost of lives from the onslaught of typhoon hagupit in the country.
Hurricane Irma on collision course with Florida; 4 reported killed: 10 points
News Corp Papers Compare The ABC To ISIS
Traveling on Humanitarian Medical Mission to Puerto Rico ἟5἟7 hosted by
Gym time!! Back to work!!
STORMS A COMIN!!!!! I miss Fridays at your place.
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES.
: Found helicopters hovering above but none reached the ground for help where many are still waiting for food and shelter.â€¦


In [141]:
import re

def clean_tweet(text):
    # remove non-ASCII characters,
    # 1) non-ASCII 제거 (이모티콘/특수문자/한글 등 모두 제거)
    text = text.encode("ascii", "ignore").decode()

    # remove emoticons (e.g., :), :-D, XD)
    text = re.sub(r'[:;=8xX][-~]?[)(DPpOo/\\]+', ' ', text)

    # remove numbers
    text = re.sub(r'\d+', ' ', text)

    # remove punctuations (replaced with whitespace)
    text = re.sub(r'[.,!?;:/()\"\'\[\]{}<>@#~`+=*&^%$|-]', ' ', text)

    # remove invisible characters
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)

    # remove duplicate spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].apply(clean_tweet)

In [142]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[7, 'text'])
print(df['train'].loc[11, 'text'])
print(df['train'].loc[12, 'text'])
print(df['train'].loc[13, 'text'])
print(df['train'].loc[14, 'text'])
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])
print(df['train'].loc[18, 'text'])

Cracked wine casks damaged historical buildings and coffee shops This Napa earthquake is the biggest first world disaster
Its a good thing that the government have done everything to avert any lost of lives from the onslaught of typhoon hagupit in the country
Hurricane Irma on collision course with Florida reported killed points
News Corp Papers Compare The ABC ToISIS
Traveling on Humanitarian Medical Mission to Puerto Rico hosted by
Gym time Back to work
STORMS A COMIN I miss Fridays at your place
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES
Found helicopters hovering above but none reached the ground for help where many are still waiting for food and shelter


### Text lowercasing

All tweets were converted to lowercase; according to Hickman et al. [37], lowercasing tends to be beneficial because it reduces data dimensionality, thereby increasing statistical power, and usually does not reduce validity.

- Hickman, L.; Thapa, S.; Tay, L.; Cao, M.; Srinivasan, P. Text Preprocessing for Text Mining in Organizational Research: Review
and Recommendations. Organ. Res. Methods 2022, 25, 114–146.

In [143]:
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])

STORMS A COMIN I miss Fridays at your place
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES


In [144]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.lower()

In [145]:
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])

storms a comin i miss fridays at your place
libtards ruin everything and blame everyone but themselves


## Remove Empty text

In [146]:
for d in ['train', 'dev', 'test']:
    mask = df[d]["text"].str.strip() == ""
    count = mask.sum()
    print(d, count)

train 75
dev 14
test 21


In [147]:
for d in ['train', 'dev', 'test']:
    prev_length = len(df[d])
    df[d] = df[d][df[d]["text"].str.strip() != ""].reset_index(drop=True)
    print(f"{prev_length - len(df[d])} rows removed")

75 rows removed
14 rows removed
21 rows removed


## Save preprocessed data

In [148]:
for d in ['train', 'dev', 'test']:
    output_path = f"./data/crisisbench/preprocessed_data_{d}.csv"
    df[d].to_csv(output_path, index=False)
    print("Saved:", output_path)

Saved: ./data/crisisbench/preprocessed_data_train.csv
Saved: ./data/crisisbench/preprocessed_data_dev.csv
Saved: ./data/crisisbench/preprocessed_data_test.csv


# Deep Learning

## Setup

In [152]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split

from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [150]:
import random

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

generator = torch.Generator()
_ = generator.manual_seed(SEED)

In [151]:
def load_data():
    df = {}
    for d in ['train', 'dev', 'test']:
        output_path = f"./data/crisisbench/preprocessed_data_{d}.csv"
        df[d] = pd.read_csv(output_path).loc[:, ['text', 'class_label_group']]
        print("Loading:", output_path)
    return df

df = load_data()

Loading: ./data/crisisbench/preprocessed_data_train.csv
Loading: ./data/crisisbench/preprocessed_data_dev.csv
Loading: ./data/crisisbench/preprocessed_data_test.csv


In [96]:
df['train'].head()
print(f"df_train: N={len(df['train'])}")

df_train: N=61164


In [97]:
df['dev'].head()
print(f"df_dev: N={len(df['dev'])}")

df_dev: N=8935


In [None]:
df['test'].head()
print(f"df_test: N={len(df['test'])}")

                                                text   class_label_group
0  staff at our feeding centre say chronic malnou...  support_and_relief
1      you comin down for the summer semesters right     non_informative
2         yea it s upstate i m like a few hours away     non_informative
3  teach every pakistani that it is not enough to...     non_informative
4  stay with for live cvg as typhoon hagupit slam...       time_critical
df_test: N=17356


## CNN

### Import Libraries

In [1]:

import math
from collections import Counter
from typing import List, Tuple, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


### Hyperparameters

In [5]:

MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 64 # depends on tweet length
EMBED_DIM = 50
FILTER_SIZES = (3, 4, 5)
NUM_FILTERS = 100
DROPOUT = 0.5 # tune
BATCH_SIZE = 64 # tune 
LR = 1e-3
NUM_EPOCHS = 10
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

GLOVE_PATH = "./data/crisisbench/glove_word_embeddings.txt"



### Tokenizer and Vocab

In [8]:
def simple_tokenize(text: str) -> List[str]:
    """
    Splits on whitespace
    """
    return text.strip().split()

def build_vocab(
    texts: List[str],
    max_size: int,
    min_freq: int = 1
) -> Dict[str, int]:
    """
    Build a word -> index vocab from training texts.
    Reserves index 0 for PAD and 1 for UNK.
    """
    counter = Counter()
    for text in texts:
        tokens = simple_tokenize(text)
        counter.update(tokens)

    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for word, freq in counter.most_common():
        if freq < min_freq:
            continue
        if len(vocab) >= max_size:
            break
        vocab[word] = len(vocab)

    return vocab


def encode_text(
    text: str,
    vocab: Dict[str, int],
    max_len: int
) -> List[int]:
    tokens = simple_tokenize(text)
    ids = [vocab.get(tok, vocab[UNK_TOKEN]) for tok in tokens][:max_len]
    if len(ids) < max_len:
        ids += [vocab[PAD_TOKEN]] * (max_len - len(ids))
    return ids


### Dataset & DataLoader

In [9]:
class TextDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        vocab: Dict[str, int],
        max_len: int,
    ):
        assert len(texts) == len(labels)
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        text = self.texts[idx]
        label = self.labels[idx]
        input_ids = encode_text(text, self.vocab, self.max_len)
        return torch.tensor(input_ids, dtype=torch.long), label


def create_dataloaders(
    train_texts: List[str],
    train_labels: List[int],
    val_texts: List[str],
    val_labels: List[int],
    max_vocab_size: int,
    max_seq_len: int,
    batch_size: int,
) -> Tuple[DataLoader, DataLoader, Dict[str, int], int]:
    vocab = build_vocab(train_texts, max_vocab_size)
    num_classes = len(set(train_labels))

    train_dataset = TextDataset(train_texts, train_labels, vocab, max_seq_len)
    val_dataset = TextDataset(val_texts, val_labels, vocab, max_seq_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, vocab, num_classes


### Load GloVe & build embedding matrix

In [10]:
def load_glove_embeddings(
    glove_path: str,
    embed_dim: int,
) -> Dict[str, torch.Tensor]:
    """
    Load GloVe file into a dict: word -> vector (torch.Tensor).
    Expects each line: word val1 val2 ... valD
    """
    embeddings = {}
    with open(glove_path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != embed_dim + 1:
                # ignore malformed lines
                continue
            word = parts[0]
            vec = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float32)
            embeddings[word] = vec
    return embeddings


def build_embedding_matrix(
    vocab: Dict[str, int],
    glove_embeddings: Dict[str, torch.Tensor],
    embed_dim: int,
) -> torch.Tensor:
    """
    Create an embedding matrix of shape [vocab_size, embed_dim]
    where row i is the vector for the word with index i.
    Words not found in GloVe are randomly initialized (small normal).
    """
    vocab_size = len(vocab)
    embedding_matrix = torch.empty(vocab_size, embed_dim, dtype=torch.float32)

    # Initialize OOV embeddings to small random values
    torch.nn.init.normal_(embedding_matrix, mean=0.0, std=0.05)

    # Set PAD embedding to zeros
    pad_idx = vocab[PAD_TOKEN]
    embedding_matrix[pad_idx] = torch.zeros(embed_dim, dtype=torch.float32)

    oov_count = 0
    for word, idx in vocab.items():
        if word in (PAD_TOKEN, UNK_TOKEN):
            continue
        vec = glove_embeddings.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec
        else:
            oov_count += 1

    print(f"GloVe OOV words: {oov_count}/{vocab_size}")
    return embedding_matrix

### Text CNN model (with optional pretrained embeddings)

In [11]:
class TextCNN(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        pad_idx: int = 0,
        num_filters: int = 100,
        filter_sizes: Tuple[int, ...] = (3, 4, 5),
        dropout: float = 0.5,
        pretrained_embeddings: torch.Tensor | None = None,
        freeze_embeddings: bool = False,
    ):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=pad_idx,
        )

        if pretrained_embeddings is not None:
            if pretrained_embeddings.shape != (vocab_size, embed_dim):
                raise ValueError(
                    f"Pretrained embeddings shape {pretrained_embeddings.shape} "
                    f"does not match (vocab_size, embed_dim)=({vocab_size}, {embed_dim})"
                )
            self.embedding.weight.data.copy_(pretrained_embeddings)
            if freeze_embeddings:
                self.embedding.weight.requires_grad = False

        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_dim,
                out_channels=num_filters,
                kernel_size=fs,
            )
            for fs in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        embedded = self.embedding(input_ids)          # [B, L, D]
        embedded = embedded.transpose(1, 2)           # [B, D, L]

        conv_outputs = []
        for conv in self.convs:
            x = conv(embedded)                        # [B, F, L']
            x = F.relu(x)
            x = F.max_pool1d(x, x.size(2)).squeeze(2) # [B, F]
            conv_outputs.append(x)

        cat = torch.cat(conv_outputs, dim=1)          # [B, F * len(filter_sizes)]
        cat = self.dropout(cat)
        logits = self.fc(cat)                         # [B, num_classes]
        return logits

### Training & Evaluation

In [12]:
def train_one_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
) -> Tuple[float, float]:
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * input_ids.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / total
    accuracy = correct / total if total > 0 else 0.0
    return avg_loss, accuracy


def evaluate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device,
) -> Tuple[float, float]:
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for input_ids, labels in dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            logits = model(input_ids)
            loss = criterion(logits, labels)

            total_loss += loss.item() * input_ids.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / total
    accuracy = correct / total if total > 0 else 0.0
    return avg_loss, accuracy

### Main CNN Train Script

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_df = df['train'].dropna(subset=['text'])
train_texts = train_df['text'].tolist()
train_label_strs = train_df['class_label_group']

val_df = df['dev'].dropna(subset=['text'])
val_texts = val_df['text'].tolist()
val_label_strs = val_df['class_label_group']

all_label_strs = sorted(set(train_label_strs) | set(val_label_strs))
label2id = {label: i for i, label in enumerate(all_label_strs)}
id2label = {i: label for label, i in label2id.items()}

train_labels = [label2id[l] for l in train_label_strs]
val_labels   = [label2id[l] for l in val_label_strs]
# Create loaders and vocab
train_loader, val_loader, vocab, num_classes = create_dataloaders(
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
    max_vocab_size=MAX_VOCAB_SIZE,
    max_seq_len=MAX_SEQ_LEN,
    batch_size=BATCH_SIZE,
)

print(f"Vocab size: {len(vocab)}, Num classes: {num_classes}")

# Load GloVe embeddings
print("Loading GloVe embeddings...")
glove_embeds = load_glove_embeddings(GLOVE_PATH, EMBED_DIM)
embedding_matrix = build_embedding_matrix(vocab, glove_embeds, EMBED_DIM)

# Initialize model with pretrained embeddings
print("Model Initialization...")
model = TextCNN(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    num_classes=num_classes,
    pad_idx=vocab[PAD_TOKEN],
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    dropout=DROPOUT,
    pretrained_embeddings=embedding_matrix,
    freeze_embeddings=False,   # set True if you want to freeze GloVe
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

best_val_acc = 0.0
print("Training...")
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, criterion, device
    )
    val_loss, val_acc = evaluate(
        model, val_loader, criterion, device
    )

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_textcnn_glove.pt")

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )

print(f"Best validation accuracy: {best_val_acc:.4f}")

Vocab size: 20000, Num classes: 3
Loading GloVe embeddings...
GloVe OOV words: 882/20000
Epoch 01 | Train Loss: 0.4595, Train Acc: 0.8258 | Val Loss: 0.3805, Val Acc: 0.8552
Epoch 02 | Train Loss: 0.3572, Train Acc: 0.8675 | Val Loss: 0.3513, Val Acc: 0.8696
Epoch 03 | Train Loss: 0.3088, Train Acc: 0.8865 | Val Loss: 0.3488, Val Acc: 0.8734
Epoch 04 | Train Loss: 0.2718, Train Acc: 0.9027 | Val Loss: 0.3552, Val Acc: 0.8731
Epoch 05 | Train Loss: 0.2348, Train Acc: 0.9148 | Val Loss: 0.3815, Val Acc: 0.8666
Epoch 06 | Train Loss: 0.2045, Train Acc: 0.9268 | Val Loss: 0.3986, Val Acc: 0.8658
Epoch 07 | Train Loss: 0.1774, Train Acc: 0.9370 | Val Loss: 0.4382, Val Acc: 0.8646
Epoch 08 | Train Loss: 0.1547, Train Acc: 0.9453 | Val Loss: 0.4868, Val Acc: 0.8606
Epoch 09 | Train Loss: 0.1365, Train Acc: 0.9517 | Val Loss: 0.5132, Val Acc: 0.8562
Epoch 10 | Train Loss: 0.1208, Train Acc: 0.9579 | Val Loss: 0.5814, Val Acc: 0.8569
Best validation accuracy: 0.8734


### Calculate Metrics

In [27]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Helper to get predictions + labels from a DataLoader
def get_all_preds_and_labels(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for input_ids, labels in dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            logits = model(input_ids)          # [B, num_classes]
            preds = logits.argmax(dim=1)      # [B]

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return all_preds, all_labels

# compute accuracy, precision, recall, F1 (macro)
def compute_classification_metrics(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
    average: str = "macro",   # "macro", "micro", or "weighted"
):
    preds, labels = get_all_preds_and_labels(model, dataloader, device)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average=average,
        zero_division=0,
    )

    metrics = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return metrics

model.load_state_dict(torch.load("best_textcnn_glove.pt", map_location=device))

# For validation metrics
val_metrics = compute_classification_metrics(model, val_loader, device, average="macro")
print("Validation metrics:")
for k, v in val_metrics.items():
    print(f"{k}: {v:.4f}")

Validation metrics:
accuracy: 0.8734
precision: 0.8467
recall: 0.8108
f1: 0.8272


## Transformer (Deberta-V3)

### Build Label Mappings

convert class_label_group values from string to int (0,1,2)

In [163]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

texts = {}
labels = {}

for d in ["train", "dev", "test"]:
    df_tmp = df['train'].dropna(subset=['text'])
    texts[d] = df_tmp['text'].tolist()
    label_strs = df_tmp['class_label_group']

    all_label_strs = sorted(set(label_strs))
    print(f"original labels={all_label_strs}")
    label2id = {label: i for i, label in enumerate(all_label_strs)}
    id2label = {i: label for label, i in label2id.items()}

    labels[d] = [label2id[l] for l in label_strs]

original labels=['non_informative', 'support_and_relief', 'time_critical']
original labels=['non_informative', 'support_and_relief', 'time_critical']
original labels=['non_informative', 'support_and_relief', 'time_critical']


In [164]:
for d in ["train", "dev", "test"]:
    print(set(labels[d]))

{0, 1, 2}
{0, 1, 2}
{0, 1, 2}


### Tokenization

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 64 # adjust based on the maximum input size?

# 1. Tokenize directly (no map, no progress bars)
train_encodings = tokenizer(
    texts['train'],
    padding="max_length",
    truncation=True,
    max_length=max_length,
)

val_encodings = tokenizer(
    texts['dev'],
    padding="max_length",
    truncation=True,
    max_length=max_length,
)

# 2. Build HF Datasets from encoded inputs + labels
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "label": train_labels,
})

val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "label": val_labels,
})

tokenized_datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
})

# 3. Set format for PyTorch
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Check appropriate token size
tmp_train = tokenizer(texts['train'], truncation=False, add_special_tokens=True)
lens_train = [len(ids) for ids in tmp_train["input_ids"]]

tmp_dev = tokenizer(texts['dev'], truncation=False, add_special_tokens=True)
lens_dev = [len(ids) for ids in tmp_dev["input_ids"]]

lengths = lens_train + lens_dev

print("median:", np.median(lengths))
print("mean:", np.mean(lengths))
print("95th percentile:", np.percentile(lengths, 95))
print("99th percentile:", np.percentile(lengths, 99))
print("max:", np.max(lengths))

### Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average="macro",
        zero_division=0,
    )
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

training_args = TrainingArguments(
    output_dir="./deberta-v3-crisis",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
eval_results = trainer.evaluate(tokenized_datasets["validation"])
print(eval_results)
