This notebook will perform our temporal train/test split of the dataset, after merging the two existing datasets into one and filtering out entries in the Disaster Declaration Summaries (DDS) that do not exist in the Mission Assignments (MA)

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


In [3]:
pd.set_option('display.max_columns', 80)

In [4]:
ma_filepath = 'mission_assignments.parquet'
dds_filepath = 'disaster_declaration_summaries.parquet'
train_filepath = 'combined_training_set_nontime.parquet'
test_filepath = 'combined_test_set_nontime.parquet'
random_state = 42

In [5]:
# Load initial datasets

df_dds = pd.read_parquet(dds_filepath)
df_ma = pd.read_parquet(ma_filepath)
print(df_dds.shape, df_ma.shape)

(68485, 28) (40340, 39)


In [6]:
df_dds['designatedIncidentTypes'].head(10)

0       R
1       R
2       R
3    None
4    None
5    None
6    None
7    None
8    None
9       R
Name: designatedIncidentTypes, dtype: object

Adding lists and dictionaries for later use.

In [7]:
#dictionary to convert state/territory designators to full word strings
state_dict = {'AL':'Alabama','AK':'Alaska','AZ':'Arizona','AR':'Arkansas','CA':'California','CO':'Colorado','CT':'Connecticut',
             'DE':'Delaware', 'FL':'Florida','GA':'Georgia','HI':'Hawaii','ID':'Idaho','IL':'Illinois','IN':'Indiana','IA':'Iowa',
             'KS':'Kansas','KY':'Kentucky','LA':'Louisiana','ME':'Maine','MD':'Maryland','MA':'Massachusetts','MI':'Michigan',
             'MN':'Minnesota','MS':'Mississippi','MO':'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada','NH':'New Hampshire',
             'NM':'New Mexico','NY':'New York','NJ':'New Jersey','NC':'North Carolina','ND':'North Dakota','OH':'Ohio',
             'OK':'Oklahoma','OR':'Oregon','PA':'Pennsylvania','RI':'Rhode Island','SC':'South Carolina','SD':'South Dakota',
             'TN':'Tennessee','TX':'Texas','UT':'Utah','VT':'Vermont','VA_state':'Virginia','WA':'Washington','WV':'West Virginia',
             'WI':'Wisconsin','WY':'Wyoming','DC':'Washington, DC','GU':'Guam','PR':'Puerto Rico','AS':'American Samoa',
             'MP':'Northern Mariana Islands','FM':'Federated States of Micronesia','MH':'Marshall Islands','PW':'Palau'}

state_list = ['AL','AZ','AR','CA','CO','CT','DE', 'FL','GA','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI',
             'MN','MS','MO','MT','NE','NV','NH','NM','NY','NJ','NC','ND','OH','OK','OR','PA','RI','SC','SD',
             'TN','TX','UT','VT','VA','WA','WV','WI','WY']

#set using only natural disasters that could be potentially caused by climate change
natural_disaster = ['Fire','Flood','Severe Storm','Straight-Line Winds','Winter Storm','Hurricane','Tornado','Tropical Storm',
                   'Mud/Landslide','Snowstorm','Coastal Storm','Severe Ice Storm','Typhoon','Freezing','Drought','Fishing Losses',
                   'Tropical Depression']

#manmade or other disasters that would not be caused by climate change
nonweather_disaster = ['Earthquake','Other','Biological','Dam/Levee Break','Volcanic Eruption','Toxic Substances','Chemical',
                      'Terrorist','Human Cause','Tsunami','Civil Unrest','Nuclear','Explosion','Tidal Wave']

#dictionary to convert disaster codes to strings representing each type of disaster
disaster_dict = {'0':'Not applicable','1':'Explosion','2':'Straight-Line Winds','3':'Tidal Wave','4':'Tropical Storm',
                '5':'Winter Storm','A':'Tsunami','B':'Biological','C':'Coastal Storm','D':'Drought','E':'Earthquake',
                'F':'Flood','G':'Freezing','H':'Hurricane','I':'Terrorist','J':'Typhoon','K':'Dam/Levee Break','L':'Chemical',
                'M':'Mud/Landslide','N':'Nuclear','O':'Severe Ice Storm','P':'Fishing Losses','Q':'Crop Losses','R':'Fire',
                'S':'Snowstorm','T':'Tornado','U':'Civil Unrest', 'V':'Volcanic Eruption','W':'Severe Storm','X':'Toxic Substances',
                'Y':'Human Cause','Z':'Other', '8':'Tropical Depression'}

agencyid_dict = {'CISA':'DHS-CISA','DHSMGMT':'DHS-MGMT','USDANRCS':'USDA-NRCS','GSA-':'GSA','VA-':'VA','EPA-':'EPA','DOT-':'DOT',
                'CNCS-':'CNCS','FCC-':'FCC','DOED':'DOE','DHUD':'HUD','DOD-':'DOD','VA -':'VA','USDAOCIO':'USDA-OCIO','FPS':'DHS-FPS',
                'TSA':'DHS-TSA','ICE':'DHS-ICE','USCIS':'DHS-CIS','DLA':'DOD-DLA','CBP':'DHS-CBP','NPS':'DOI-NPS','NPPD':'DHS-CISA',
                'CDC':'HHS-CDC','USAF':'DOD-USAF','OSHA':'DOL-OSHA','DHS-MGT':'DHS-MGMT','USGS':'DOI-USGS','USCG':'DHS-USCG',
                'USDJ':'DOJ','DHS-MGA':'DHS-IA','FLETC':'DHS-FLETC','DHS-FLET':'DHS-FLETC','USFS':'USDA-FS','HHS -PSC':'HHS-PSC'}

In [8]:
df_dds['designatedIncidentTypes'].sample(30)

39915     None
2356      None
41130        F
59627        W
68477     None
4596      None
47342        T
35694     None
17321      5,S
63078    W,F,T
49362     None
6821      None
55723        W
51986      R,W
34427     None
64659      W,F
2178      None
55140        F
10423     None
53010        W
65695        W
45847     None
18730     None
37673     None
33205        Z
20807     None
61318        W
19623     None
34070     None
15466     None
Name: designatedIncidentTypes, dtype: object

Data cleaning for MA includes 

In [9]:
df_ma=df_ma[(df_ma['declarationType']!='SU')&(df_ma['maAmendNumber']==0)&(df_ma['supportFunction']<=15)]

df_ma['supportFunction'].fillna(value=0,inplace=True)

# df_ma['stt'].replace({'VA':'VA_state'},inplace=True)

df_ma['agencyId'].replace(agencyid_dict,inplace=True)

column_list_ma = ['incidentId','stt','incidentType','region','maType','maPriority','supportFunction','agencyId', 'maId',
              'declarationType', 'assistanceRequested', 'statementOfWork']
df_ma = df_ma.reindex(columns=column_list_ma)

df_ma.drop_duplicates(inplace=True)

df_ma.rename(columns={'incidentType': 'incidentTypeMA'},
             inplace=True)

# df_ma.drop(columns=[
#     'lastRefresh',
#     'hash',
#     'id',],
#     inplace=True)

df_ma.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ma['supportFunction'].fillna(value=0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ma['agencyId'].replace(agencyid_dict,inplace=True)


(7044, 12)

In [10]:
df_ma['agencyId'].value_counts()

agencyId
DOD          743
GSA          485
HHS          411
EPA          360
COE-SAD      348
            ... 
DC-CSOSA       1
DOC-NTIA       1
DOC-BIS        1
USDA-OCIO      1
USDA-OCP       1
Name: count, Length: 109, dtype: int64

Data cleaning for DDS includes keeping of specific columns and filtering of year and declaration type

In [11]:
# select columns necessary for data analysis, add empty columns for each natural disaster type

column_list_dds = ['femaDeclarationString','state','incidentType','incidentBeginDate','fipsStateCode','region',
               'designatedIncidentTypes','declarationTitle', 'incidentId','declarationType']

df_dds = df_dds.reindex(
    columns=column_list_dds,
    fill_value=0)

# Add time information to DDS

df_dds['incidentBeginDate']=pd.to_datetime(df_dds['incidentBeginDate'])
df_dds['year'] = df_dds['incidentBeginDate'].dt.year
df_dds['month'] = df_dds['incidentBeginDate'].dt.month
df_dds['day'] = df_dds['incidentBeginDate'].dt.day

# Filter out values before 2012

df_dds=df_dds[(df_dds['year']>=2012) & (df_dds['declarationType']!='FM')]
# df_dds.drop(columns=[
#     'lastRefresh',
#     'hash',
#     'id',],
#     inplace=True)
print(df_dds.shape)

#ensures that incident type is reflected in designated incident types
df_dds['designatedIncidentTypes'].fillna(df_dds['incidentType'], inplace = True)

df_dds.drop_duplicates(inplace=True)
df_dds.reset_index(inplace = True,
                   drop=True)

df_dds.shape


(26041, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dds['designatedIncidentTypes'].fillna(df_dds['incidentType'], inplace = True)


(1123, 13)

In [12]:
dds_column_list = df_dds.columns.to_list()
dds_column_list

['femaDeclarationString',
 'state',
 'incidentType',
 'incidentBeginDate',
 'fipsStateCode',
 'region',
 'designatedIncidentTypes',
 'declarationTitle',
 'incidentId',
 'declarationType',
 'year',
 'month',
 'day']

In [13]:
df_ma.rename(columns={'stt':'state'},inplace=True)
ma_column_list = df_ma.columns.to_list()
ma_column_list

['incidentId',
 'state',
 'incidentTypeMA',
 'region',
 'maType',
 'maPriority',
 'supportFunction',
 'agencyId',
 'maId',
 'declarationType',
 'assistanceRequested',
 'statementOfWork']

In [14]:
print(df_dds['incidentId'].nunique(), df_ma['incidentId'].nunique())

662 326


In [15]:
overlapping_columns = list(set(ma_column_list).intersection(set(dds_column_list)))
overlapping_columns

['declarationType', 'state', 'region', 'incidentId']

In [48]:
MA_disaster_combined=df_ma.merge(
    df_dds, 
    how='left',
    on=overlapping_columns,
    validate='m:m')

In [49]:
MA_disaster_combined.shape

(7699, 21)

In [50]:
MA_disaster_combined.drop_duplicates(inplace=True)
MA_disaster_combined.shape


(7699, 21)

In [51]:
MA_disaster_combined['designatedIncidentTypes'].fillna(MA_disaster_combined['incidentTypeMA'], inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  MA_disaster_combined['designatedIncidentTypes'].fillna(MA_disaster_combined['incidentTypeMA'], inplace = True)


In [52]:
MA_disaster_combined['designatedIncidentTypes'] = MA_disaster_combined['designatedIncidentTypes'].str.split(',').apply(
    lambda lst: [s.strip() for s in lst] if isinstance(lst, list) else lst).apply(
    lambda lst: [disaster_dict.get(s, s) for s in lst] if isinstance(lst, list) else lst).apply(
    lambda lst: ','.join(lst) if isinstance(lst, list) else str(lst))

In [53]:
MA_disaster_combined[['incidentTypeMA','designatedIncidentTypes']].sample(30)

Unnamed: 0,incidentTypeMA,designatedIncidentTypes
7481,Hurricane,"Tropical Storm,Mud/Landslide,Flood"
6993,Tropical Depression,"Tropical Depression,Hurricane"
2101,Biological,Biological
1810,Fire,Fire
3874,Biological,Biological
4673,Hurricane,Hurricane
53,Tropical Storm,"Straight-Line Winds,Tropical Storm,Tropical De..."
6166,Flood,Flood
3641,Biological,Biological
6453,Flood,Flood


In [54]:
MA_disaster_combined.isna().sum()

incidentId                   0
state                        0
incidentTypeMA               0
region                       0
maType                       0
maPriority                   0
supportFunction              0
agencyId                     0
maId                         0
declarationType              0
assistanceRequested          0
statementOfWork              0
femaDeclarationString      156
incidentType               156
incidentBeginDate          156
fipsStateCode              156
designatedIncidentTypes      0
declarationTitle           156
year                       156
month                      156
day                        156
dtype: int64

In [55]:
print(MA_disaster_combined[MA_disaster_combined['incidentTypeMA'].isna()]['incidentId'].value_counts())
print(MA_disaster_combined[MA_disaster_combined['incidentTypeMA'].isna()]['incidentId'].nunique())

ids_without_year = MA_disaster_combined[MA_disaster_combined['incidentTypeMA'].isna()]['incidentId'].tolist()
ids_without_year

Series([], Name: count, dtype: int64)
0


[]

In [57]:
MA_disaster_combined['incidentId'].nunique()

326

In [58]:
MA_disaster_combined.dropna(inplace=True)

In [59]:
MA_disaster_combined['incidentId'].nunique()


321

In [60]:
split_info = StratifiedShuffleSplit(n_splits=1,
                                    test_size=.2,
                                    random_state=random_state)

grouping_col = 'incidentId'
stratifying_col = 'incidentType'

temp_df = MA_disaster_combined.drop_duplicates(subset=[grouping_col]).copy()
group_stratify_map = temp_df[stratifying_col]

group_counts = group_stratify_map.value_counts()
rare_stratify_values = group_counts[group_counts == 1].index.tolist()

forced_train_group_ids = temp_df[temp_df[stratifying_col].isin(rare_stratify_values)][grouping_col].values
safe_groups_df = temp_df[~temp_df[stratifying_col].isin(rare_stratify_values)]
safe_group_ids = safe_groups_df[grouping_col].values
safe_stratify_map = safe_groups_df[stratifying_col].values

train_safe_group_ids, test_group_ids, _, _ = train_test_split(
    safe_group_ids,
    safe_stratify_map,
    test_size=0.2,
    random_state=42,
    # This split is now safe because all classes in safe_stratify_map have count >= 2
    stratify=safe_stratify_map 
)

# 5. Combine the forced rare groups with the safely split training groups
final_train_group_ids = np.concatenate([train_safe_group_ids, forced_train_group_ids])

# 6. Apply the final masks to the original full DataFrame
train_mask = MA_disaster_combined[grouping_col].isin(final_train_group_ids)
test_mask = MA_disaster_combined[grouping_col].isin(test_group_ids)

df_train = MA_disaster_combined[train_mask]
df_test = MA_disaster_combined[test_mask]

print(f"Total groups in Train set: {len(final_train_group_ids)}")
print(f"Total groups in Test set: {len(test_group_ids)}")
print(f"Number of forced rare groups: {len(forced_train_group_ids)}")

# group_stratify_map = MA_disaster_combined.drop_duplicates(subset=[grouping_col])[stratifying_col].values
# group_ids = MA_disaster_combined.drop_duplicates(subset=[grouping_col])[grouping_col].values



# train_group_ids, test_group_ids, _, _ = train_test_split(
#     group_ids,
#     group_stratify_map,
#     test_size=0.2,
#     random_state=42,
#     stratify=group_stratify_map  # Stratify the group IDs based on S
# )

# train_mask = MA_disaster_combined[grouping_col].isin(train_group_ids)
# test_mask = MA_disaster_combined[grouping_col].isin(test_group_ids)

# # D. Final Train/Test DataFrames
# df_train = df[train_mask]
# df_test = df[test_mask]
# # stratify_info = MA_disaster_combined['incidentType'].values

# # for train_index, test_index in split_info.split(MA_disaster_combined, stratify_info):
# #     # Use .iloc to slice the DataFrame based on indices
# #     df_train = MA_disaster_combined.iloc[train_index]
# #     df_test = MA_disaster_combined.iloc[test_index]


Total groups in Train set: 257
Total groups in Test set: 64
Number of forced rare groups: 4


In [61]:
print(df_train.shape, df_test.shape)

(6765, 21) (778, 21)


In [62]:
print(df_train['incidentType'].value_counts(), df_test['incidentType'].value_counts())

incidentType
Hurricane            2702
Biological           1930
Tropical Storm        465
Flood                 462
Fire                  437
Severe Storm          350
Typhoon               115
Tornado                74
Other                  50
Severe Ice Storm       40
Coastal Storm          36
Mud/Landslide          25
Volcanic Eruption      25
Earthquake             22
Dam/Levee Break        12
Chemical                7
Snowstorm               5
Winter Storm            4
Terrorist               4
Name: count, dtype: int64 incidentType
Hurricane           485
Flood                92
Earthquake           57
Fire                 56
Severe Storm         50
Tornado              12
Winter Storm          6
Other                 5
Severe Ice Storm      5
Snowstorm             4
Tropical Storm        4
Typhoon               2
Name: count, dtype: int64


In [63]:
df_train.to_parquet(train_filepath)
df_test.to_parquet(test_filepath)