# Preprocess Data

In [1]:
import os
import pandas as pd
import torch

## Load Dataset

- `data/crisisbench/all_data_en`: all combined english dataset used for the experiments
    - `crisis_consolidated_humanitarian_filtered_lang_en_dev.tsv`
    - `crisis_consolidated_humanitarian_filtered_lang_en_test.tsv`
    - `crisis_consolidated_humanitarian_filtered_lang_en_train.tsv`


In [2]:
df = {}

df["train"] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_train.tsv", sep="\t")
print(f"df_train: N = {len(df['train'])}")
df['dev'] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_dev.tsv", sep="\t")
print(f"df_dev: N = {len(df['dev'])}")
df['test'] = pd.read_csv("./data/crisisbench/all_data_en/crisis_consolidated_humanitarian_filtered_lang_en_test.tsv", sep="\t")
print(f"df_test: N = {len(df['test'])}")

df_train: N = 61164
df_dev: N = 8935
df_test: N = 17356


In [3]:
print(f"total N ={len(df['train']) + len(df['dev']) + len(df['test'])}")

total N =87455


In [3]:
train_class_label = set(df["train"]["class_label"])
dev_class_label = set(df['dev']["class_label"])
test_class_label = set(df['test']["class_label"])

assert len(train_class_label) == len(dev_class_label) and  len(train_class_label) == len(test_class_label) 

train_class_label

{'affected_individual',
 'caution_and_advice',
 'displaced_and_evacuations',
 'donation_and_volunteering',
 'infrastructure_and_utilities_damage',
 'injured_or_dead_people',
 'missing_and_found_people',
 'not_humanitarian',
 'requests_or_needs',
 'response_efforts',
 'sympathy_and_support'}

## Target Labels

### Time-critical

- 'affected_individual'
  -  CrisisLexT26 (Affected individuals): Deaths, injuries, missing, found, or displaced `people`, and/or personal updates.
  - examples 
    - Although one person confirmed  dead by police, BBC understands death toll at least three. #clutha #helicoptercrash htt…
    - 4WABC-TV: FDNY confirms that there are fatalities in Metro North derailment. Other news outlets reporting 4 deaths.

- 'caution_and_advice'
  - CrisisLexT26 (Caution and advice) : If a message conveys/reports information about some `warning` or a piece of `advice` about a possible hazard of an incident.
  - examples
    - Be informed always. . . #RubyPH http://t.co/u1x521x0Is
    - RT @ChileanProbs: 8.3 earthquake in the north of Chile! Tsunami alert up north, Peru and Ecuador!
    - @JimFreund: Apparently we have no exclusivity.  The tornado watch is for all SE NY.  http://1.usa.gov/mSPGdf	en	1	caution_and_advice
    - Japan issues tsunami alert after Chile quake, expecting no damage: Japan has issued a tsunami alert following ... http://t.co/GerjHpPaNN

- 'displaced_and_evacuations'
    - People who have relocated due to the crisis, even for a short time (includes evacuations)
    - examples
      - RT @rociolewis: @TheEllenShow Chile has gone through a recent earthquake and now a fire, thousands are homeless. Please spread the word foräó_
      - RT @AnasMallick: More than 5 dozen #Earthquake victims, mostly women and children, shifted to #Karachi from #Awaran.
      - Hurricane Odile hits Baja California - Click2Houston

- 'infrastructure_and_utilities_damage'
  - Houses, buildings, roads damaged or utilities such as water, electricity, interrupted
  - Buildings or roads damaged or operational; utilities/services interrupted or restored
  - Reports of damaged buildings, roads, bridges, or utilities/services interrupted or restored.

- 'injured_or_dead_people'
  - Reports of casualties and/or injured `people` due to the crisis.
  - Injured and dead
  - If a message reports the information about `casualties` or damage done by an incident.

- 'missing_and_found_people'
  - `Missing`, trapped, or found people—Questions and/or reports about missing or found people.
  - People `missing` or found.
  - If a message reports about the missing or found person effected by an incident or seen a celebrity visit on ground zero

### Support and Relief

- 'requests_or_needs'
  - Needs of those affected
  - Something (e.g. food, water, shelter) or someone (e.g. volunteers, doctors) is needed
  - examples
    - These have warned that diphtheria, cholera and malaria could spread in an epidemic of "apocalyptic proportions" if medical, food, water and other types of aid are not allowed in, along with trained personnel to administer the support.

- 'donation_and_volunteering'
  - Reports of urgent needs or donations of shelter and/or supplies such as food, water, clothing, money, medical supplies or blood; and volunteering services
  - Needs, requests, or offers of money, blood, shelter, supplies, and/or services by volunteers or professionals.
  - Donations of money
  - If a message speaks about money raised, donation offers, goods/services offered or asked by the victims of an incident.
  - Donations of supplies and/or volunteer work
  - Money requested, donated or spent
  - Needs or donations of shelter and/or supplies such as food, water, clothing, medical supplies or blood
  - Services needed or offered by volunteers or professionals
  - examples
    - "You know me : I don't like giving away money. But Nepal needs our help. Donate to @decappeal today"

- 'response_efforts'
  - Affected populations receiving food, water, shelter, medication, etc. from humanitarian/emergency response organizations
  - All info about responders. Affected populations receiving food, water, shelter, medication, etc. from humanitarian/emergency response organizations.

### Non-informative

- 'not_humanitarian'
  - Not applicable
  -  Not related to this crisis
  - Refers to the crisis, but does not contain useful information that helps you understand the situation; 2. Not related to the Typhoon, or not relevant for emergency/humanitarian response; 3. Related to the crisis, but not informative: if it refers to the crisis, but does not contain useful information that helps understand the situation.
  - examples
    - Had a long night. Time to sleep and rest for a while. I survived #RubyPH!	
    - #Baltimore is on fire and #Nepal death toll is rising....yet I still don't think people are paying attention
    - A subtle pressure in the Force drew Jacen's attention to his aide, Orlopp. He turned to find the Jenet just looking up f
    - IAF Planes Bring Back 546 Indians From Quake-hit Nepal | The New Indian Express http://t.co/8BPG5NCT2W | http://t.co/69mLhfefhr #AllTheNews
    - HERO ALERT! please share á¼¼Dá½Š8âœ¨ https://t.co/UED0PojAPx #motorcycle https://t.co/6saBdgri4c	en	NA	not_humanitarian

- 'sympathy_and_support'
  - To hear about the state of Sardinia where I spent the majority of my summers, is extremely saddening. Hope they can get through it.#sardinia

In [4]:
# Informative needed for handling urgent incidents 
time_critical = ['affected_individual', 'caution_and_advice', 'displaced_and_evacuations', 'infrastructure_and_utilities_damage', 'injured_or_dead_people', 'missing_and_found_people']

# Helping the survivor
support_and_relief = ['requests_or_needs', 'donation_and_volunteering', 'response_efforts']
 
# Not solving the problem
non_informative = ['not_humanitarian', 'sympathy_and_support']

In [5]:
mapping = {}
for x in time_critical:
    mapping[x] = 'time_critical'
for x in support_and_relief:
    mapping[x] = 'support_and_relief'
for x in non_informative:
    mapping[x] = 'non_informative'

df["train"]['class_label_group'] = df["train"]['class_label'].map(mapping)
df["dev"]['class_label_group'] = df["dev"]['class_label'].map(mapping)
df["test"]['class_label_group'] = df["test"]['class_label'].map(mapping)

In [6]:
train_class_label = set(df["train"]["class_label_group"])
dev_class_label = set(df['dev']["class_label_group"])
test_class_label = set(df['test']["class_label_group"])

assert len(train_class_label) == len(dev_class_label) and  len(train_class_label) == len(test_class_label) 

train_class_label

{'non_informative', 'support_and_relief', 'time_critical'}

In [7]:
numerical_mapping = {}
for x in time_critical:
    numerical_mapping[x] = 0 # 'time_critical'
for x in support_and_relief:
    numerical_mapping[x] = 1 # 'support_and_relief'
for x in non_informative:
    numerical_mapping[x] = 2 # 'non_informative'

df["train"]['class_label_group_num'] = df["train"]['class_label'].map(numerical_mapping)
df["dev"]['class_label_group_num'] = df["dev"]['class_label'].map(numerical_mapping)
df["test"]['class_label_group_num'] = df["test"]['class_label'].map(numerical_mapping)

In [8]:
train_class_label = set(df["train"]["class_label_group_num"])
dev_class_label = set(df['dev']["class_label_group_num"])
test_class_label = set(df['test']["class_label_group_num"])

assert len(train_class_label) == len(dev_class_label) and  len(train_class_label) == len(test_class_label) 

train_class_label

{0, 1, 2}

## Cleansing Tweet content

### Remove meaningless text

"Prior to the classification experiment, we preprocess tweets to remove symbols, emoticons, invisible and non-ASCII characters, punctuations (replaced with whitespace), numbers, URLs, and hashtag signs"

####  URL removal

"All URLs were removed from tweets, since the text of URL strings does not necessarily convey any relevant information, and can therefore be removed [39]."

- Roy, D.; Mitra, M.; Ganguly, D. To Clean or Not to Clean: Document Preprocessing and Reproducibility. J. Data Inf. Qual. (JDIQ)
2018, 10, 18.

In [9]:
print(df['train'].loc[1, 'text'])

God bless you... https://t.co/AnEy1ydkkz


In [10]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'http\S+', '', regex=True)

In [11]:
print(df['train'].loc[1, 'text'])

God bless you... 


#### Remove hashtag

In [12]:
print(df['train'].loc[4, 'text'])
print(df['train'].loc[5, 'text'])

Rescue effort expands in India, Pakistan as flood death toll tops 350   #india #asia
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert. #PrayForChiäó_


In [13]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'#\w+', '', regex=True).str.strip()

In [14]:
print(df['train'].loc[4, 'text'])
print(df['train'].loc[5, 'text'])

Rescue effort expands in India, Pakistan as flood death toll tops 350
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove username

In [15]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[3, 'text'])
print(df['train'].loc[5, 'text'])

RT @perreaux: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
I'm really just excited for new undies and pinkberry @mollymcnultzxo
RT @leanielsen: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


In [16]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'@\w+', '', regex=True).str.strip()

In [17]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[3, 'text'])
print(df['train'].loc[5, 'text'])

RT : Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
I'm really just excited for new undies and pinkberry
RT : I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove RT

In [18]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[5, 'text'])

RT : Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
RT : I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


In [19]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.replace(r'\bRT\b', '', regex=True).str.strip()

In [20]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[5, 'text'])

: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
: I hope everyone in Chile stays safe and are okay. Surrounding countries should watch out for the Tsunami alert.


#### Remove symbols, emoticons, invisible and non-ASCII characters, punctuation

In [21]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[7, 'text'])
print(df['train'].loc[11, 'text'])
print(df['train'].loc[12, 'text'])
print(df['train'].loc[13, 'text'])
print(df['train'].loc[14, 'text'])
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])
print(df['train'].loc[18, 'text'])

: Cracked wine casks, damaged historical  buildings and coffee shops. This Napa earthquake is the biggest first world disaster â€¦
It���s a good thing that the government have done everything to avert any lost of lives from the onslaught of typhoon hagupit in the country.
Hurricane Irma on collision course with Florida; 4 reported killed: 10 points
News Corp Papers Compare The ABC To ISIS
Traveling on Humanitarian Medical Mission to Puerto Rico ἟5἟7 hosted by
Gym time!! Back to work!!
STORMS A COMIN!!!!! I miss Fridays at your place.
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES.
: Found helicopters hovering above but none reached the ground for help where many are still waiting for food and shelter.â€¦


In [22]:
import re

def clean_tweet(text):
    # remove non-ASCII characters,
    # 1) non-ASCII 제거 (이모티콘/특수문자/한글 등 모두 제거)
    text = text.encode("ascii", "ignore").decode()

    # remove emoticons (e.g., :), :-D, XD)
    text = re.sub(r'[:;=8xX][-~]?[)(DPpOo/\\]+', ' ', text)

    # remove numbers
    text = re.sub(r'\d+', ' ', text)

    # remove punctuations (replaced with whitespace)
    text = re.sub(r'[.,!?;:/()\"\'\[\]{}<>@#~`+=*&^%$|-]', ' ', text)

    # remove invisible characters
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)

    # remove duplicate spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].apply(clean_tweet)

In [23]:
print(df['train'].loc[2, 'text'])
print(df['train'].loc[7, 'text'])
print(df['train'].loc[11, 'text'])
print(df['train'].loc[12, 'text'])
print(df['train'].loc[13, 'text'])
print(df['train'].loc[14, 'text'])
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])
print(df['train'].loc[18, 'text'])

Cracked wine casks damaged historical buildings and coffee shops This Napa earthquake is the biggest first world disaster
Its a good thing that the government have done everything to avert any lost of lives from the onslaught of typhoon hagupit in the country
Hurricane Irma on collision course with Florida reported killed points
News Corp Papers Compare The ABC ToISIS
Traveling on Humanitarian Medical Mission to Puerto Rico hosted by
Gym time Back to work
STORMS A COMIN I miss Fridays at your place
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES
Found helicopters hovering above but none reached the ground for help where many are still waiting for food and shelter


### Text lowercasing

All tweets were converted to lowercase; according to Hickman et al. [37], lowercasing tends to be beneficial because it reduces data dimensionality, thereby increasing statistical power, and usually does not reduce validity.

- Hickman, L.; Thapa, S.; Tay, L.; Cao, M.; Srinivasan, P. Text Preprocessing for Text Mining in Organizational Research: Review
and Recommendations. Organ. Res. Methods 2022, 25, 114–146.

In [24]:
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])

STORMS A COMIN I miss Fridays at your place
LIBTARDS RUIN EVERYTHING AND BLAME EVERYONE BUT THEMSELVES


In [25]:
for d in ['train', 'dev', 'test']:
    df[d]['text'] = df[d]['text'].str.lower()

In [26]:
print(df['train'].loc[16, 'text'])
print(df['train'].loc[17, 'text'])

storms a comin i miss fridays at your place
libtards ruin everything and blame everyone but themselves


## Remove Empty text

In [27]:
for d in ['train', 'dev', 'test']:
    mask = df[d]["text"].str.strip() == ""
    count = mask.sum()
    print(d, count)

train 75
dev 14
test 21


In [28]:
for d in ['train', 'dev', 'test']:
    prev_length = len(df[d])
    df[d] = df[d][df[d]["text"].str.strip() != ""].reset_index(drop=True)
    print(f"{prev_length - len(df[d])} rows removed")

75 rows removed
14 rows removed
21 rows removed


## Save preprocessed data

In [30]:
df['train'].head()

Unnamed: 0,id,event,source,text,lang,lang_conf,class_label,class_label_group,class_label_group_num
0,18582,disaster_events,drd-figureeight-multimedia,approximately km long firebreaks have been con...,en,1.0,infrastructure_and_utilities_damage,time_critical,0
1,592616302138658817,2015_nepal_earthquake,crisisnlp-volunteers,god bless you,en,,not_humanitarian,non_informative,2
2,503643491143282688,2014_california_earthquake,crisisnlp-cf,cracked wine casks damaged historical building...,en,,infrastructure_and_utilities_damage,time_critical,0
3,323833109051228160,2013_boston_bombings-ontopic,crisislext6,i m really just excited for new undies and pin...,en,1.0,not_humanitarian,non_informative,2
4,508333923886067712,2014_pakistan_floods,crisisnlp-cf,rescue effort e ands in india pakistan as floo...,en,1.0,injured_or_dead_people,time_critical,0


In [29]:
for d in ['train', 'dev', 'test']:
    output_path = f"./data/crisisbench/preprocessed_data_{d}.csv"
    df[d].to_csv(output_path, index=False)
    print("Saved:", output_path)

Saved: ./data/crisisbench/preprocessed_data_train.csv
Saved: ./data/crisisbench/preprocessed_data_dev.csv
Saved: ./data/crisisbench/preprocessed_data_test.csv
