In [31]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
import time  # To track the elapsed time
import matplotlib.colors as mcolors
import concurrent.futures
import re
!pip install fuzzywuzzy
from fuzzywuzzy import process 
import matplotlib.patches as mpatches
from google.cloud import storage



# Functions

In [2]:
# Function to shift the precipitation data columns based on the incidentBeginDate rather than incidentEndDate
def shift_precipitation_data(df, column, shift_days):
    """ Shift the given column by shift_days and return the modified dataframe. """
    df[column] = df[column].shift(-shift_days)
    return df

In [3]:
# Function to merge precipitation data for a given county FIPS
def merge_precip_data(filtered_disasters_county, county_fips):
    # Convert 'incidentBeginDate' to a date format (without time) for matching
    filtered_disasters_county['incidentBeginDate'] = pd.to_datetime(filtered_disasters_county['incidentBeginDate']).dt.date
    
    # Load MSWEP precipitation data for this county
    mswep_file = os.path.join(mswep_dir, f'{county_fips}_precip_processed.csv')
    if os.path.exists(mswep_file):
        mswep_data = pd.read_csv(mswep_file)
        mswep_data['county'] = mswep_data['county'].astype(str).str.zfill(5)  # Ensure county FIPS is a 5-character string
        
        # Convert 'date' to datetime format and extract only the date (without time)
        mswep_data['date'] = pd.to_datetime(mswep_data['date']).dt.date
        
        # Shift necessary MSWEP columns
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_sum', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_sum_percentile_modeled', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_1d', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_1d_percentile_modeled', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_3d', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_3d_percentile_modeled', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_5d', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_5d_percentile_modeled', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_7d', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_7d_percentile_modeled', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_14d', 29)
        mswep_data = shift_precipitation_data(mswep_data, 'MSWEP_precipitation_30d_max_14d_percentile_modeled', 29)

        # Merge MSWEP data with disaster data based on 'incidentBeginDate'
        filtered_disasters_county = pd.merge(filtered_disasters_county, mswep_data, left_on=['incidentBeginDate', 'fullFIPS'], right_on=['date', 'county'], how='left')

        # Drop unwanted columns from the merged DataFrame
        filtered_disasters_county = filtered_disasters_county.drop(columns=['county', 'date', 'year'], errors='ignore')
      
    # Load ERA5 precipitation data for this county
    era5_file = os.path.join(era5_dir, f'{county_fips}_precip_processed.csv')
    if os.path.exists(era5_file):
        era5_data = pd.read_csv(era5_file)
        era5_data['county'] = era5_data['county'].astype(str).str.zfill(5)  # Ensure county FIPS is a 5-character string
        
        # Convert 'date' to datetime format and extract only the date
        era5_data['date'] = pd.to_datetime(era5_data['date']).dt.date
        
        # Shift necessary ERA5 columns
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_sum', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_sum_percentile_modeled', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_1d', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_1d_percentile_modeled', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_3d', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_3d_percentile_modeled', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_5d', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_5d_percentile_modeled', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_7d', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_7d_percentile_modeled', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_14d', 29)
        era5_data = shift_precipitation_data(era5_data, 'ERA5_precipitation_30d_max_14d_percentile_modeled', 29)

        # Merge ERA5 data with disaster data based on 'incidentBeginDate'
        filtered_disasters_county = pd.merge(filtered_disasters_county, era5_data, left_on=['incidentBeginDate', 'fullFIPS'], right_on=['date', 'county'], how='left')

        # Drop unwanted columns from the merged DataFrame
        filtered_disasters_county = filtered_disasters_county.drop(columns=['county', 'date', 'year'], errors='ignore')
    
    return filtered_disasters_county

In [4]:
# Function to process a single county
def process_county(county_fips, county_group, start_time):
    # Ensure fullFIPS is a string with leading zeros
    county_group['fullFIPS'] = county_group['fullFIPS'].astype(str).str.zfill(5)
    
    # Merge precipitation data for this county
    merged_county_data = merge_precip_data(county_group, county_fips)
    
    # Calculate elapsed time for this county
    county_elapsed_time = time.time() - start_time
    print(f"Processed county {county_fips} in {county_elapsed_time:.2f} seconds")
    
    return merged_county_data

In [5]:
def download_gcs_folder(gcs_folder_path, local_destination):
    """
    Downloads a folder from Google Cloud Storage to a local directory using the Google Cloud Storage client library.
    
    Parameters:
    - gcs_folder_path (str): The GCS path of the folder (e.g., 'bucket-name/path/to/folder').
    - local_destination (str): The local directory where the folder will be downloaded.
    
    Returns:
    - None
    """
    # Extract bucket name and prefix from the GCS path
    if not gcs_folder_path.startswith("gs://"):
        raise ValueError("The GCS folder path must start with 'gs://'")
    
    gcs_folder_path = gcs_folder_path[5:]  # Remove 'gs://'
    bucket_name, folder_prefix = gcs_folder_path.split("/", 1)
    
    # Initialize the Google Cloud Storage client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    
    # List all blobs in the folder
    blobs = bucket.list_blobs(prefix=folder_prefix)
    
    # Ensure the local destination exists
    os.makedirs(local_destination, exist_ok=True)
    
    for blob in blobs:
        # Define the local file path
        relative_path = os.path.relpath(blob.name, folder_prefix)
        local_file_path = os.path.join(local_destination, relative_path)
        
        # Ensure the directory exists
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
        
        # Download the blob
        print(f"Downloading {blob.name} to {local_file_path}")
        blob.download_to_filename(local_file_path)
    
    print(f"Successfully downloaded folder {gcs_folder_path} to {local_destination}")

In [6]:
def download_gcs_file(gcs_file_path, local_destination):
    """
    Downloads a single file from Google Cloud Storage to a local directory using the Google Cloud Storage client library.
    
    Parameters:
    - gcs_file_path (str): The GCS path of the file (e.g., 'gs://bucket-name/path/to/file.csv').
    - local_destination (str): The local file path where the file will be downloaded.
    
    Returns:
    - None
    """
    from google.cloud import storage
    import os
    
    # Ensure the GCS file path starts with 'gs://'
    if not gcs_file_path.startswith("gs://"):
        raise ValueError("The GCS file path must start with 'gs://'")
    
    # Extract bucket name and file path
    gcs_file_path = gcs_file_path[5:]  # Remove 'gs://'
    bucket_name, file_path = gcs_file_path.split("/", 1)
    
    # Initialize the Google Cloud Storage client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(file_path)
    
    # Ensure the local directory exists
    os.makedirs(os.path.dirname(local_destination), exist_ok=True)
    
    # Download the blob to the local file
    print(f"Downloading {file_path} to {local_destination}")
    blob.download_to_filename(local_destination)
    
    print(f"Successfully downloaded file {gcs_file_path} to {local_destination}")

# Filter Selection

In [39]:
pull = False # if you need to pull data from cloud
push = False # if you need to delete the data after
precipPresent = True
monthlyMask = True
percentileMask = True

mon_thres = 0 # Filter out below mon_thres monthly precipitation values as erroneous
perc_thres = 0 # Filter out below perc_thres th percentile values as erroneous for weekly precipitation

In [8]:
if pull:
    os.makedirs('MSWEP_Daily_Precip_Processed_County', exist_ok=True)
    os.makedirs('ERA5_Daily_Precip_Processed_County', exist_ok=True)
    
    download_gcs_folder('gs://leap-persistent/adamnayak/flood-insurance/MSWEP/MSWEP_Daily_Precip_Processed_County', 'MSWEP_Daily_Precip_Processed_County')
    download_gcs_folder('gs://leap-persistent/adamnayak/flood-insurance/ERA5/ERA5_Daily_Precip_Processed_County', 'ERA5_Daily_Precip_Processed_County')

    # Get the current directory
    current_directory = os.getcwd()
    
    # Construct the local destination path
    local_destination = os.path.join(current_directory, 'PRISM_Monthly_Precip_Processed_County.csv')
    
    # Call the function to download
    download_gcs_file('gs://leap-persistent/adamnayak/flood-insurance/PRISM/PRISM_Monthly_Precip_Processed_County.csv', local_destination)

# Load Disasters Data

In [9]:
disasters = pd.read_csv('../Local_Data/FEMA_Disaster_Asst_Data/DisasterDeclarationsSummaries.csv')

# List of incident types to filter
incident_types = [
    'Coastal Storm',
    'Dam/Levee Break',
    'Flood',
    'Hurricane',
    'Severe Storm',
    'Tropical Storm',
    'Typhoon',
] #'Winter Storm', 'Snowstorm', 'Severe Ice Storm'

# Print number of unfiltered records
print(f"{len(disasters)} unfiltered disaster total records found")

# Filtering the dataframe
filtered_disasters = disasters[disasters['incidentType'].isin(incident_types)]

# Print number of unfiltered records
print(f"{len(filtered_disasters)} flood disaster total records found")

67209 unfiltered disaster total records found
44916 flood disaster total records found


  disasters = pd.read_csv('../Game/FEMA_Disaster_Asst_Data/DisasterDeclarationsSummaries.csv')


In [10]:
# Combine "Coastal Storm" and "Severe Storm" into a new category "Storm"
filtered_disasters['incidentType'] = filtered_disasters['incidentType'].replace({
    'Coastal Storm': 'Storm',
    'Severe Storm': 'Storm'
})

# Combine "Hurricane" and "Tropical Storm" into "Hurricane/Tropical Cyclone" in `final_disasters_df`
filtered_disasters['incidentType'] = filtered_disasters['incidentType'].replace(
    {'Hurricane': 'Hurricane/Tropical Cyclone', 'Tropical Storm': 'Hurricane/Tropical Cyclone'}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_disasters['incidentType'] = filtered_disasters['incidentType'].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_disasters['incidentType'] = filtered_disasters['incidentType'].replace(


In [11]:
# Map of two-letter state codes to FIPS codes
state_fips_mapping = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 'CO': '08', 'CT': '09', 'DE': '10', 
    'DC': '11', 'FL': '12', 'GA': '13', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19',
    'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 
    'MS': '28', 'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35',
    'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44',
    'SC': '45', 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 'VA': '51', 'WA': '53', 
    'WV': '54', 'WI': '55', 'WY': '56', 'PR': '72'  # Add more as needed
}

# Map the state column to FIPS codes
filtered_disasters['stateFIPS'] = filtered_disasters['state'].map(state_fips_mapping)

# Ensure the county code is 3 characters, padded with leading zeros if necessary
filtered_disasters['fipsCountyCode'] = filtered_disasters['fipsCountyCode'].astype(str).str.zfill(3)

# Combine stateFIPS and fipsCountyCode to create the full 5-digit FIPS code
filtered_disasters['fullFIPS'] = filtered_disasters['stateFIPS'] + filtered_disasters['fipsCountyCode']

# Now filter based on the full FIPS code
contiguous_us_states_fips = [f"{i:02d}" for i in range(1, 57) if i not in [2, 15, 60]]
filtered_disasters_contiguous = filtered_disasters[filtered_disasters['fullFIPS'].str[:2].isin(contiguous_us_states_fips)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_disasters['stateFIPS'] = filtered_disasters['state'].map(state_fips_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_disasters['fipsCountyCode'] = filtered_disasters['fipsCountyCode'].astype(str).str.zfill(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_disasters

In [12]:
# Ensure the date fields are in datetime format
df = filtered_disasters_contiguous.copy()
df['declarationDate'] = pd.to_datetime(df['declarationDate'])
df['disasterCloseoutDate'] = pd.to_datetime(df['disasterCloseoutDate'])
df['incidentBeginDate'] = pd.to_datetime(df['incidentBeginDate'])
df['incidentEndDate'] = pd.to_datetime(df['incidentEndDate'])

# Calculate the number of days from declarationDate
df['days_to_disasterCloseout'] = (df['disasterCloseoutDate'] - df['declarationDate']).dt.days
df['days_to_incidentBegin'] = (df['incidentBeginDate'] - df['declarationDate']).dt.days
df['days_to_incidentEnd'] = (df['incidentEndDate'] - df['declarationDate']).dt.days

# Pull Processed Precipitation Data from the Cloud

In [13]:
# Path to MSWEP and ERA5 directories
mswep_dir = 'MSWEP_Daily_Precip_Processed_County'
era5_dir = 'ERA5_Daily_Precip_Processed_County'

# Start time for performance tracking
start_time = time.time()

# Use ThreadPoolExecutor to process counties in parallel
merged_disasters = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Create a dictionary to map each future to its county_fips and county_group
    futures = {
        executor.submit(process_county, county_fips, county_group, start_time): county_fips
        for county_fips, county_group in filtered_disasters_contiguous.groupby('fullFIPS')
    }
    
    # As each future completes, gather the result and print progress
    for future in concurrent.futures.as_completed(futures):
        county_fips = futures[future]
        try:
            merged_disasters.append(future.result())
        except Exception as e:
            print(f"An error occurred while processing county {county_fips}: {e}")

# Concatenate all county dataframes back into a single dataframe
final_disasters_df = pd.concat(merged_disasters, ignore_index=True)

# Print the total elapsed time
total_elapsed_time = time.time() - start_time
print(f"All counties processed in {total_elapsed_time:.2f} seconds")

Processed county 01000 in 0.02 seconds
Processed county 01003 in 1.63 seconds
Processed county 01025 in 1.64 seconds
Processed county 01005 in 1.77 seconds
Processed county 01001 in 1.78 seconds
Processed county 01007 in 1.82 seconds
Processed county 01017 in 1.82 seconds
Processed county 01011 in 1.83 seconds
Processed county 01023 in 1.85 seconds
Processed county 01013 in 1.86 seconds
Processed county 01021 in 1.87 seconds
Processed county 01009 in 1.88 seconds
Processed county 01035 in 1.89 seconds
Processed county 01037 in 1.91 seconds
Processed county 01031 in 1.92 seconds
Processed county 01027 in 1.93 seconds
Processed county 01029 in 1.94 seconds
Processed county 01015 in 1.95 seconds
Processed county 01019 in 1.96 seconds
Processed county 01039 in 1.98 seconds
Processed county 01033 in 1.99 seconds
Processed county 01041 in 2.84 seconds
Processed county 01043 in 3.12 seconds
Processed county 01053 in 3.15 seconds
Processed county 01049 in 3.16 seconds
Processed county 01047 in

In [14]:
PRISM_precip = pd.read_csv('PRISM_Monthly_Precip_Processed_County.csv')

In [16]:
# Step 1: Extract year and month from incidentBeginDate
final_disasters_df['year'] = pd.to_datetime(final_disasters_df['incidentBeginDate']).dt.year
final_disasters_df['month'] = pd.to_datetime(final_disasters_df['incidentBeginDate']).dt.month

# Ensure county is the correct type
PRISM_precip['county'] = PRISM_precip['county'].astype(str).str.zfill(5)
final_disasters_df['fullFIPS'] = final_disasters_df['fullFIPS'].astype(str).str.zfill(5)

In [17]:
# Step 2: Merge on year, month, and fullFIPS/county
final_disasters_df = final_disasters_df.merge(
    PRISM_precip[['year', 'month', 'county', 'PRISM_precipitation', 'PRISM_percentile']],
    left_on=['year', 'month', 'fullFIPS'],  # Columns in final_disasters_df
    right_on=['year', 'month', 'county'],  # Columns in PRISM_precip
    how='left'  # Retain all rows from final_disasters_df
)

# Step 3: rename columns
final_disasters_df.rename(columns={
    'PRISM_precipitation': 'PRISM_mon_precipitation',
    'PRISM_percentile': 'PRISM_mon_percentile'
}, inplace=True)

# Step 4: Drop the temporary 'year' and 'month' columns if not needed further
final_disasters_df.drop(['year', 'month', 'county'], axis=1, inplace=True)

In [18]:
# MSWEP Return Periods
# First, convert MSWEP percentiles from 0-100 to 0-1 by dividing by 100
final_disasters_df['returnPeriod_MSWEP_1d'] = final_disasters_df['MSWEP_precipitation_30d_max_1d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_MSWEP_3d'] = final_disasters_df['MSWEP_precipitation_30d_max_3d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_MSWEP_5d'] = final_disasters_df['MSWEP_precipitation_30d_max_5d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_MSWEP_7d'] = final_disasters_df['MSWEP_precipitation_30d_max_7d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_MSWEP_14d'] = final_disasters_df['MSWEP_precipitation_30d_max_14d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_MSWEP_30d'] = final_disasters_df['MSWEP_precipitation_30d_sum_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

In [19]:
# ERA5 Return Periods
# First, convert ERA5 percentiles from 0-100 to 0-1 by dividing by 100
final_disasters_df['returnPeriod_ERA5_1d'] = final_disasters_df['ERA5_precipitation_30d_max_1d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_ERA5_3d'] = final_disasters_df['ERA5_precipitation_30d_max_3d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_ERA5_5d'] = final_disasters_df['ERA5_precipitation_30d_max_5d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_ERA5_7d'] = final_disasters_df['ERA5_precipitation_30d_max_7d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_ERA5_14d'] = final_disasters_df['ERA5_precipitation_30d_max_14d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
final_disasters_df['returnPeriod_ERA5_30d'] = final_disasters_df['ERA5_precipitation_30d_sum_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

In [20]:
# PRISM Return Periods
final_disasters_df['returnPeriod_PRISM'] = final_disasters_df['PRISM_mon_percentile'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

In [21]:
final_disasters_df.to_csv('PRISM_MSWEP_ERA5_Processed_Disasters.csv')

In [40]:
final_disasters_df = pd.read_csv('PRISM_MSWEP_ERA5_Processed_Disasters.csv')

In [41]:
# Print number of unfiltered records
print(f"{len(final_disasters_df)} unfiltered total records found")

42682 unfiltered total records found


In [42]:
if precipPresent:
    # Group by fullFIPS and check if all values for precipitation_PRISM are 0
    counties_with_all_zero_precip = final_disasters_df.groupby('fullFIPS').filter(
        lambda x: (x['ERA5_precipitation_30d_sum'] == 0).all())
    
    # Get the count of rows per county where all values are zero
    county_counts = counties_with_all_zero_precip['fullFIPS'].value_counts()
    
    # Print the counties and their counts
    print("Counties where all values of precipitation are 0:")
    for county, count in county_counts.items():
        print(f"{county}: {count}")
    print(len(county_counts))

    # Get the unique county codes to mask out
    counties_to_mask_out = counties_with_all_zero_precip['fullFIPS'].unique()
    
    # Mask out these counties from final_disasters_df
    final_disasters_df = final_disasters_df[~final_disasters_df['fullFIPS'].isin(counties_to_mask_out)]
    
    # Group by fullFIPS and check if all values for precipitation_PRISM are 0
    counties_with_all_zero_precip = final_disasters_df.groupby('fullFIPS').filter(
        lambda x: (x['MSWEP_precipitation_30d_sum'] == 0).all())
    
    # Get the count of rows per county where all values are zero
    county_counts = counties_with_all_zero_precip['fullFIPS'].value_counts()
    
    # Print the counties and their counts
    print("Counties where all values of precipitation are 0:")
    for county, count in county_counts.items():
        print(f"{county}: {count}")
    print(len(county_counts))

    # Get the unique county codes to mask out
    counties_to_mask_out = counties_with_all_zero_precip['fullFIPS'].unique()
    
    # Mask out these counties from final_disasters_df
    final_disasters_df = final_disasters_df[~final_disasters_df['fullFIPS'].isin(counties_to_mask_out)]

Counties where all values of precipitation are 0:
0
Counties where all values of precipitation are 0:
0


In [43]:
if monthlyMask:
    total_records_before = len(final_disasters_df)
    final_disasters_df = final_disasters_df[final_disasters_df['MSWEP_precipitation_30d_sum'].round(1) > mon_thres]
    total_records_after = len(final_disasters_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total declarations filtered out under 30d MSWEP: {records_filtered_out}")
    final_disasters_df = final_disasters_df[final_disasters_df['ERA5_precipitation_30d_sum'].round(1) > mon_thres]
    total_records_after = len(final_disasters_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total declarations filtered out under 30d ERA5: {records_filtered_out}")

Total declarations filtered out under 30d MSWEP: 8949
Total declarations filtered out under 30d ERA5: 4062


In [44]:
if percentileMask:
    final_disasters_df = final_disasters_df[final_disasters_df['ERA5_precipitation_30d_max_7d'].round(1) > perc_thres]
    total_records_after = len(final_disasters_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total declarations filtered out under 7d ERA5: {records_filtered_out}")
    total_records_before = len(final_disasters_df)
    final_disasters_df = final_disasters_df[final_disasters_df['MSWEP_precipitation_30d_max_7d'].round(1) > perc_thres]
    total_records_after = len(final_disasters_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total declarations filtered out under 7d MSWEP: {records_filtered_out}")

Total declarations filtered out under 7d ERA5: 4176
Total declarations filtered out under 7d MSWEP: 0


In [45]:
# Print number of filtered records
print(f"{len(final_disasters_df)} filtered total records found")

25495 filtered total records found


In [46]:
final_disasters_df.to_csv('final_filtered_disasters.csv')

In [29]:
if push:
    !rm -r MSWEP_Daily_Precip_Processed_County
    !rm -r ERA5_Daily_Precip_Processed_County
    !rm 'PRISM_Monthly_Precip_Processed_County.csv'
    # Note that we want to keep ERA5_Merged_Claims folder until all datasets are merged to the claims