#### Preprocess once + Save processed CSV

In [4]:
# ============================================
# 00_preprocess_disaster_tweets.ipynb
# Purpose: Preprocess dataset and save a clean CSV
# ============================================

import pandas as pd
import re
import ast
import os

# 1) Load the raw dataset.
# The path is relative to the notebook's working directory.
RAW_CSV_PATH = "DisasterTweets.csv"
df = pd.read_csv(RAW_CSV_PATH)

# Quick sanity check on the loaded frame before any processing.
print("Raw data shape:", df.shape)
df.head()


Raw data shape: (2559, 13)


Unnamed: 0,Name,UserName,Timestamp,Verified,Tweets,Comments,Retweets,Likes,Impressions,Tags,Tweet Link,Tweet ID,Disaster
0,Drought Center,@DroughtCenter,2024-02-29T13:30:07.000Z,False,US Drought Monitor 2-29-24\n\nHappy Leap Day! ...,0,17,13,18000,"['#droughtmonitor', '#drought', '#drought2024'...",https://twitter.com/DroughtCenter/status/17631...,1.76e+18,Disaster
1,Prabhakar Goud Kurmimdla,@PrabhakarGoud_K,2024-02-27T05:20:43.000Z,False,Synonym is #Drought,0,0,1,13,['#Drought'],https://twitter.com/PrabhakarGoud_K/status/176...,1.76e+18,Disaster
2,Humanity First International,@HFI1995,2024-03-03T07:03:34.000Z,False,"Across South America in the last two months, #...",0,9,19,419,"['#floods', '#landslides', '#drought', '#wildf...",https://twitter.com/HFI1995/status/17641848294...,1.76e+18,Disaster
3,NCWQ Worldwide News And Disasters Explorer,@RTheExplorer1,2024-02-29T10:20:18.000Z,False,Wildfires Going On In Texas #wildfires #texa...,0,0,1,34,"['#wildfires', '#texasfires']",https://twitter.com/RTheExplorer1/status/17631...,1.76e+18,Disaster
4,BestDealsEver,@MilwaukeeHotBuy,2024-02-28T17:58:01.000Z,False,START YOUR STAND UP COMEDY CAREER FOR $11.99 ...,0,0,0,210,"['#thevoice', '#rhonj', '#taylorswift', '#mia'...",https://twitter.com/MilwaukeeHotBuy/status/176...,1.76e+18,Disaster


#### Parse Tags Column

In [7]:
def parse_tags(x):
    """Convert one raw ``Tags`` cell into a list of tags.

    The raw column stores each tag list as the text of a Python literal,
    e.g. ``"['#drought', '#flood']"``. The cell is parsed with
    ``ast.literal_eval`` (which only evaluates literals, never arbitrary code).

    Parameters
    ----------
    x : object
        Raw cell value. Anything that is not a ``str`` (NaN, None, numbers,
        ...) is treated as "no tags".

    Returns
    -------
    list
        The parsed tags. Always a list, so callers can iterate safely:
        * list / tuple literal -> its elements as a list
        * a single quoted string -> one-element list
        * missing, unparsable, or any other literal -> ``[]``
    """
    # Non-strings cover NaN/None as well; checking the type first also avoids
    # calling pd.isna on array-like values, whose truth value is ambiguous.
    if not isinstance(x, str):
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return []

    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, str):
        # A lone quoted tag: keep it instead of letting callers iterate its characters.
        return [parsed]
    # Numbers, dicts, None, ...: not a tag collection.
    return []

# Parse every raw Tags cell into a Python list, stored in a new Tags_list column.
df["Tags_list"] = df["Tags"].map(parse_tags)

#### Clean text

In [10]:
def clean_text(text):
    """Normalise one tweet into lowercase, whitespace-separated tokens.

    The input is coerced with ``str()`` and lowercased, then a fixed sequence
    of regex substitutions is applied. Order matters: URLs and @mentions are
    removed before the generic punctuation filter would break them apart.

    Parameters
    ----------
    text : object
        Raw tweet text; any value is accepted because it is passed to ``str()``.

    Returns
    -------
    str
        Cleaned text containing only ``a-z``, ``0-9``, ``#`` and single spaces,
        with no leading or trailing whitespace.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r"http\S+|www\.\S+", " "),   # drop URLs
        (r"@\w+", " "),               # drop @mentions
        (r"#", " #"),                 # split glued hashtags: "#a#b" -> " #a #b"
        (r"[^a-z0-9#\s]", " "),       # keep only lowercase letters, digits, '#', whitespace
        (r"\s+", " "),                # collapse whitespace runs
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every tweet into a new processed_text column.
# NOTE(review): astype(str) presumably turns missing tweets into the literal
# text "nan" — confirm whether blank tweets need separate handling.
tweets_as_text = df["Tweets"].astype(str)
df["processed_text"] = tweets_as_text.map(clean_text)

# Side-by-side preview of raw vs. cleaned text.
df[["Tweets", "processed_text"]].head()

Unnamed: 0,Tweets,processed_text
0,US Drought Monitor 2-29-24\n\nHappy Leap Day! ...,us drought monitor 2 29 24 happy leap day for ...
1,Synonym is #Drought,synonym is #drought
2,"Across South America in the last two months, #...",across south america in the last two months #f...
3,Wildfires Going On In Texas #wildfires #texa...,wildfires going on in texas #wildfires #texasf...
4,START YOUR STAND UP COMEDY CAREER FOR $11.99 ...,start your stand up comedy career for 11 99 pu...


#### Target labels

In [13]:
# Keyword rules used to derive a training label for each tweet.
# Dict order is the priority order: the first label with any matching keyword wins.
LABEL_RULES = {
    "flood": [
        "#flood", "#floods", "flood", "flooding", "inundation", "landslide", "landslides",
    ],
    "wildfire": [
        "#wildfire", "#wildfires", "wildfire", "forest fire", "bushfire", "fires",
    ],
    "earthquake": [
        "#earthquake", "earthquake", "quake", "aftershock", "seismic",
    ],
    "drought": [
        "#drought", "drought", "water shortage", "dry spell",
    ],
    "storm": [
        "#storm", "#hurricane", "#cyclone", "#typhoon",
        "storm", "hurricane", "cyclone", "typhoon", "tornado",
    ],
}


def make_label(processed_text, tags_list):
    """Assign a disaster label from keyword matches in the text and tags.

    The cleaned text and the lowercased tags are joined into one string, and
    each keyword is tested with a plain substring check (``keyword in haystack``).

    NOTE(review): because matching is by substring and first-rule-wins,
    "brainstorm" is labelled "storm" and a tweet mentioning both floods and
    drought is labelled "flood". Confirm this is the intended labelling policy.

    Parameters
    ----------
    processed_text : str
        Cleaned tweet text.
    tags_list : iterable
        Tags for the tweet; each item is converted with ``str()`` and lowercased.

    Returns
    -------
    str
        The first key of ``LABEL_RULES`` with a matching keyword, else ``"other"``.
    """
    lowered_tags = " ".join(str(tag).lower() for tag in tags_list)
    haystack = processed_text + " " + lowered_tags

    matching_labels = (
        label
        for label, keywords in LABEL_RULES.items()
        if any(keyword in haystack for keyword in keywords)
    )
    return next(matching_labels, "other")

# Label every row from its cleaned text and parsed tags.
# Zipping the two columns avoids row-wise DataFrame.apply(axis=1), which builds
# a Series per row and does not return a Series when the frame is empty.
df["label"] = [
    make_label(text, tags)
    for text, tags in zip(df["processed_text"], df["Tags_list"])
]

# Class balance of the derived labels.
print(df["label"].value_counts())

label
drought       664
wildfire      569
flood         539
earthquake    495
storm         286
other           6
Name: count, dtype: int64


#### Save a clean training file 

In [16]:
# Keep only the two columns written to the training file: cleaned text and label.
processed_df = df.loc[:, ["processed_text", "label"]].copy()

# Create the output folder if needed; exist_ok makes re-running this cell safe.
os.makedirs("data/processed", exist_ok=True)
OUT_PATH = "data/processed/disaster_tweets_preprocessed.csv"

# index=False: the row index is not written to the file.
processed_df.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
print("Processed data shape:", processed_df.shape)
processed_df.head()

Saved: data/processed/disaster_tweets_preprocessed.csv
Processed data shape: (2559, 2)


Unnamed: 0,processed_text,label
0,us drought monitor 2 29 24 happy leap day for ...,drought
1,synonym is #drought,drought
2,across south america in the last two months #f...,flood
3,wildfires going on in texas #wildfires #texasf...,wildfire
4,start your stand up comedy career for 11 99 pu...,storm


'zip' is not recognized as an internal or external command,
operable program or batch file.
