This notebook will perform our temporal train/test split of the dataset, after merging the two existing datasets into one and filtering out entries in the Disaster Declaration Summaries (DDS) that do not exist in the Mission Assignments (MA)

In [519]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [520]:
pd.set_option('display.max_columns', 80)

In [521]:
ma_filepath = 'mission_assignments.parquet'
dds_filepath = 'disaster_declaration_summaries.parquet'
train_filepath = 'combined_training_set.parquet'
test_filepath = 'combined_test_set.parquet'


In [522]:
# Load initial datasets

df_dds = pd.read_parquet(dds_filepath)
df_ma = pd.read_parquet(ma_filepath)
print(df_dds.shape, df_ma.shape)

(68485, 28) (40340, 39)


In [523]:
df_dds['designatedIncidentTypes'].head(10)

0       R
1       R
2       R
3    None
4    None
5    None
6    None
7    None
8    None
9       R
Name: designatedIncidentTypes, dtype: object

Adding lists and dictionaries for later use.

In [524]:
#dictionary to convert state/territory designators to full word strings
state_dict = {'AL':'Alabama','AK':'Alaska','AZ':'Arizona','AR':'Arkansas','CA':'California','CO':'Colorado','CT':'Connecticut',
             'DE':'Delaware', 'FL':'Florida','GA':'Georgia','HI':'Hawaii','ID':'Idaho','IL':'Illinois','IN':'Indiana','IA':'Iowa',
             'KS':'Kansas','KY':'Kentucky','LA':'Louisiana','ME':'Maine','MD':'Maryland','MA':'Massachusetts','MI':'Michigan',
             'MN':'Minnesota','MS':'Mississippi','MO':'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada','NH':'New Hampshire',
             'NM':'New Mexico','NY':'New York','NJ':'New Jersey','NC':'North Carolina','ND':'North Dakota','OH':'Ohio',
             'OK':'Oklahoma','OR':'Oregon','PA':'Pennsylvania','RI':'Rhode Island','SC':'South Carolina','SD':'South Dakota',
             'TN':'Tennessee','TX':'Texas','UT':'Utah','VT':'Vermont','VA_state':'Virginia','WA':'Washington','WV':'West Virginia',
             'WI':'Wisconsin','WY':'Wyoming','DC':'Washington, DC','GU':'Guam','PR':'Puerto Rico','AS':'American Samoa',
             'MP':'Northern Mariana Islands','FM':'Federated States of Micronesia','MH':'Marshall Islands','PW':'Palau'}

state_list = ['AL','AZ','AR','CA','CO','CT','DE', 'FL','GA','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI',
             'MN','MS','MO','MT','NE','NV','NH','NM','NY','NJ','NC','ND','OH','OK','OR','PA','RI','SC','SD',
             'TN','TX','UT','VT','VA','WA','WV','WI','WY']

#set using only natural disasters that could be potentially caused by climate change
natural_disaster = ['Fire','Flood','Severe Storm','Straight-Line Winds','Winter Storm','Hurricane','Tornado','Tropical Storm',
                   'Mud/Landslide','Snowstorm','Coastal Storm','Severe Ice Storm','Typhoon','Freezing','Drought','Fishing Losses',
                   'Tropical Depression']

#manmade or other disasters that would not be caused by climate change
nonweather_disaster = ['Earthquake','Other','Biological','Dam/Levee Break','Volcanic Eruption','Toxic Substances','Chemical',
                      'Terrorist','Human Cause','Tsunami','Civil Unrest','Nuclear','Explosion','Tidal Wave']

#dictionary to convert disaster codes to strings representing each type of disaster
disaster_dict = {'0':'Not applicable','1':'Explosion','2':'Straight-Line Winds','3':'Tidal Wave','4':'Tropical Storm',
                '5':'Winter Storm','A':'Tsunami','B':'Biological','C':'Coastal Storm','D':'Drought','E':'Earthquake',
                'F':'Flood','G':'Freezing','H':'Hurricane','I':'Terrorist','J':'Typhoon','K':'Dam/Levee Break','L':'Chemical',
                'M':'Mud/Landslide','N':'Nuclear','O':'Severe Ice Storm','P':'Fishing Losses','Q':'Crop Losses','R':'Fire',
                'S':'Snowstorm','T':'Tornado','U':'Civil Unrest', 'V':'Volcanic Eruption','W':'Severe Storm','X':'Toxic Substances',
                'Y':'Human Cause','Z':'Other', '8':'Tropical Depression'}

agencyid_dict = {'CISA':'DHS-CISA','DHSMGMT':'DHS-MGMT','USDANRCS':'USDA-NRCS','GSA-':'GSA','VA-':'VA','EPA-':'EPA','DOT-':'DOT',
                'CNCS-':'CNCS','FCC-':'FCC','DOED':'DOE','DHUD':'HUD','DOD-':'DOD','VA -':'VA','USDAOCIO':'USDA-OCIO','FPS':'DHS-FPS',
                'TSA':'DHS-TSA','ICE':'DHS-ICE','USCIS':'DHS-CIS','DLA':'DOD-DLA','CBP':'DHS-CBP','NPS':'DOI-NPS','NPPD':'DHS-CISA',
                'CDC':'HHS-CDC','USAF':'DOD-USAF','OSHA':'DOL-OSHA','DHS-MGT':'DHS-MGMT','USGS':'DOI-USGS','USCG':'DHS-USCG',
                'USDJ':'DOJ','DHS-MGA':'DHS-IA','FLETC':'DHS-FLETC','DHS-FLET':'DHS-FLETC','USFS':'USDA-FS','HHS -PSC':'HHS-PSC'}

In [525]:
df_dds['designatedIncidentTypes'].sample(30)

44565       None
32098       None
24232       None
66448          W
56780       None
20373       None
35158       None
62940        W,F
58913          W
43675       None
29144       None
63993    4,W,H,T
7649        None
21157       None
47683       None
47316          T
62250          W
33905       None
62082       None
54413          F
58001       None
12549       None
11003       None
54403        F,T
12095       None
38746       None
25730       None
58727          W
39761       None
51356        F,T
Name: designatedIncidentTypes, dtype: object

Data cleaning for MA includes 

In [526]:
df_ma=df_ma[(df_ma['declarationType']!='SU')&(df_ma['maAmendNumber']==0)&(df_ma['supportFunction']<=15)]

df_ma['supportFunction'].fillna(value=0,inplace=True)

# df_ma['stt'].replace({'VA':'VA_state'},inplace=True)

df_ma['agencyId'].replace(agencyid_dict,inplace=True)

column_list_ma = ['incidentId','stt','incidentType','region','maType','maPriority','supportFunction','agencyId', 'maId',
              'declarationType', 'assistanceRequested', 'statementOfWork']
df_ma = df_ma.reindex(columns=column_list_ma)

df_ma.drop_duplicates(inplace=True)

df_ma.rename(columns={'incidentType': 'incidentTypeMA'},
             inplace=True)

# df_ma.drop(columns=[
#     'lastRefresh',
#     'hash',
#     'id',],
#     inplace=True)

df_ma.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ma['supportFunction'].fillna(value=0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ma['agencyId'].replace(agencyid_dict,inplace=True)


(7044, 12)

In [527]:
df_ma['agencyId'].value_counts()

agencyId
DOD          743
GSA          485
HHS          411
EPA          360
COE-SAD      348
            ... 
DC-CSOSA       1
DOC-NTIA       1
DOC-BIS        1
USDA-OCIO      1
USDA-OCP       1
Name: count, Length: 109, dtype: int64

Data cleaning for DDS includes keeping of specific columns and filtering of year and declaration type

In [528]:
# select columns necessary for data analysis, add empty columns for each natural disaster type

column_list_dds = ['femaDeclarationString','state','incidentType','incidentBeginDate','fipsStateCode','region',
               'designatedIncidentTypes','declarationTitle', 'incidentId','declarationType']

df_dds = df_dds.reindex(
    columns=column_list_dds,
    fill_value=0)

# Add time information to DDS

df_dds['incidentBeginDate']=pd.to_datetime(df_dds['incidentBeginDate'])
df_dds['year'] = df_dds['incidentBeginDate'].dt.year
df_dds['month'] = df_dds['incidentBeginDate'].dt.month
df_dds['day'] = df_dds['incidentBeginDate'].dt.day

# Filter out values before 2012

df_dds=df_dds[(df_dds['year']>=2012) & (df_dds['declarationType']!='FM')]
# df_dds.drop(columns=[
#     'lastRefresh',
#     'hash',
#     'id',],
#     inplace=True)
print(df_dds.shape)

#ensures that incident type is reflected in designated incident types
df_dds['designatedIncidentTypes'].fillna(df_dds['incidentType'], inplace = True)

df_dds.drop_duplicates(inplace=True)
df_dds.reset_index(inplace = True,
                   drop=True)

df_dds.shape


(26041, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dds['designatedIncidentTypes'].fillna(df_dds['incidentType'], inplace = True)


(1123, 13)

In [529]:
dds_column_list = df_dds.columns.to_list()
dds_column_list

['femaDeclarationString',
 'state',
 'incidentType',
 'incidentBeginDate',
 'fipsStateCode',
 'region',
 'designatedIncidentTypes',
 'declarationTitle',
 'incidentId',
 'declarationType',
 'year',
 'month',
 'day']

In [530]:
df_ma.rename(columns={'stt':'state'},inplace=True)
ma_column_list = df_ma.columns.to_list()
ma_column_list

['incidentId',
 'state',
 'incidentTypeMA',
 'region',
 'maType',
 'maPriority',
 'supportFunction',
 'agencyId',
 'maId',
 'declarationType',
 'assistanceRequested',
 'statementOfWork']

In [531]:
print(df_dds['incidentId'].nunique(), df_ma['incidentId'].nunique())

662 326


In [532]:
overlapping_columns = list(set(ma_column_list).intersection(set(dds_column_list)))
overlapping_columns

['region', 'incidentId', 'declarationType', 'state']

In [574]:
MA_disaster_combined=df_ma.merge(
    df_dds, 
    how='left',
    on=overlapping_columns,
    validate='m:m')

In [575]:
MA_disaster_combined.shape

(7699, 21)

In [576]:
MA_disaster_combined.drop_duplicates(inplace=True)
MA_disaster_combined.shape


(7699, 21)

In [577]:
MA_disaster_combined[MA_disaster_combined['designatedIncidentTypes'].isna()]

Unnamed: 0,incidentId,state,incidentTypeMA,region,maType,maPriority,supportFunction,agencyId,maId,declarationType,assistanceRequested,statementOfWork,femaDeclarationString,incidentType,incidentBeginDate,fipsStateCode,designatedIncidentTypes,declarationTitle,year,month,day
469,2023052201,GU,Tropical Storm,9,FOS,Normal,4.0,USDA-FS,4715DRGUUSDA-FS01,DR,"Activate ESF 4 USFS to the RRCC, IOF JFO or ot...","As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
470,2023052201,GU,Tropical Storm,9,FOS,Normal,11.0,USDA-APH,4715DRGUUSDA-APH02,DR,"Activate ESF 11 USDA liaison(s) to the RRCC, ...","As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
471,2023052201,GU,Tropical Storm,9,FOS,High,11.0,USDA-APH,4715DRGUUSDA-APH01,DR,USDA liaison(s) to the NRCC to perform duties ...,"As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
472,2023052201,GU,Tropical Storm,9,DFA,High,8.0,HHS-ASPR,4715DRGUHHS-ASPR04,DR,HHS to provide DMAT team in support of MAWAR\r...,In support of Guam as directed by and in coord...,,,NaT,,,,,,
473,2023052201,GU,Tropical Storm,9,FOS,High,8.0,HHS-ASPR,4715DRGUHHS-ASPR03,DR,"Activate HHS to RRCC, IOF, JFO, RFO, or other ...","As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,2024101001,FL,Tropical Depression,4,FOS,High,3.0,COE-LRD,4834DRFLCOE-LRD02,DR,USACE pre-position type 2 temporary emergency ...,"As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
7172,2024101001,FL,Tropical Depression,4,DFA,High,3.0,COE-LRD,4834DRFLCOE-LRD01,DR,Temporary Emergency Power: Need federal suppor...,"In support of the State of Florida request, as...",,,NaT,,,,,,
7173,2024101001,FL,Tropical Depression,4,FOS,Lifesaving,9.0,COE-HQ,4834DRFLCOE-HQ01,DR,This is a re-issuance of MA 3622EM-FL-COE-HQ-0...,"As directed by and in coordination with FEMA, ...",,,NaT,,,,,,
7176,2024101001,FL,Tropical Depression,4,DFA,High,3.0,COE-SAD,4834DRFLCOE-SAD03,DR,Request to run on Blue Roof-\r\nIn support of ...,"In support of the Florida request, as directed...",,,NaT,,,,,,


In [578]:
MA_disaster_combined['designatedIncidentTypes'].fillna(MA_disaster_combined['incidentTypeMA'], inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  MA_disaster_combined['designatedIncidentTypes'].fillna(MA_disaster_combined['incidentTypeMA'], inplace = True)


In [579]:
MA_disaster_combined['designatedIncidentTypes'] = MA_disaster_combined['designatedIncidentTypes'].str.split(',').apply(
    lambda lst: [s.strip() for s in lst] if isinstance(lst, list) else lst).apply(
    lambda lst: [disaster_dict.get(s, s) for s in lst] if isinstance(lst, list) else lst)

MA_disaster_combined['designatedIncidentTypes'] = MA_disaster_combined.apply(
    lambda row: list(set([str(row['incidentType'])] + row['designatedIncidentTypes'])), axis=1
    )

MA_disaster_combined['designatedIncidentTypes'] = MA_disaster_combined['designatedIncidentTypes'].apply(
    lambda lst: ','.join(lst) if isinstance(lst, list) else str(lst))

In [581]:
MA_disaster_combined[['incidentTypeMA','designatedIncidentTypes']].sample(30)

Unnamed: 0,incidentTypeMA,designatedIncidentTypes
3988,Biological,Biological
5143,Hurricane,Hurricane
1418,Hurricane,Hurricane
7504,Severe Storm,"Tornado,Straight-Line Winds,Flood,Severe Storm"
2160,Biological,Biological
3637,Biological,Biological
1562,Hurricane,Hurricane
3484,Biological,Biological
7693,Tropical Storm,"Tropical Storm,Hurricane,Severe Storm"
5659,Hurricane,Hurricane


In [582]:
MA_disaster_combined.isna().sum()

incidentId                   0
state                        0
incidentTypeMA               0
region                       0
maType                       0
maPriority                   0
supportFunction              0
agencyId                     0
maId                         0
declarationType              0
assistanceRequested          0
statementOfWork              0
femaDeclarationString      156
incidentType               156
incidentBeginDate          156
fipsStateCode              156
designatedIncidentTypes      0
declarationTitle           156
year                       156
month                      156
day                        156
dtype: int64

In [583]:
MA_disaster_combined['incidentId'].nunique()

326

In [584]:
MA_disaster_combined.dropna(inplace=True)
MA_disaster_combined.isna().sum()

incidentId                 0
state                      0
incidentTypeMA             0
region                     0
maType                     0
maPriority                 0
supportFunction            0
agencyId                   0
maId                       0
declarationType            0
assistanceRequested        0
statementOfWork            0
femaDeclarationString      0
incidentType               0
incidentBeginDate          0
fipsStateCode              0
designatedIncidentTypes    0
declarationTitle           0
year                       0
month                      0
day                        0
dtype: int64

In [585]:
MA_disaster_combined['incidentId'].nunique()


321

In [586]:
MA_disaster_combined[MA_disaster_combined['year']<2024].to_parquet(train_filepath)
MA_disaster_combined[MA_disaster_combined['year']>=2024].to_parquet(test_filepath)