In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import geopandas as gpd
import matplotlib.colors as mcolors
import matplotlib as mpl
import matplotlib.patches as mpatches
from matplotlib.colors import LogNorm, Normalize
from matplotlib.gridspec import GridSpec
import seaborn as sns

# Load Claims File

In [2]:
# Load the combined Processed_Claims.csv file
processed_claims_file = "PRISM_MSWEP_ERA5_Processed_Claims.csv"
processed_claims_df = pd.read_csv(processed_claims_file)

# Ensure that 'countyCode' is properly formatted as a 5-character string
processed_claims_df['countyCode'] = processed_claims_df['countyCode'].astype(int).astype(str)
processed_claims_df['countyCode'] = processed_claims_df['countyCode'].apply(lambda x: str(x).zfill(5))

  processed_claims_df = pd.read_csv(processed_claims_file)


In [3]:
# Print number of unfiltered records
print(f"{len(processed_claims_df)} unfiltered total claims records found")

2536258 unfiltered total claims records found


# Filter Selection

In [4]:
numClaims = True
buildingDamage = True
precipPresent = True
monthlyMask = True
percentileMask = True

m = 1 # Filter out counties with less than m data points
mon_thres = 0 # Filter out below mon_thres monthly precipitation values as erroneous
perc_thres = 0 # Filter out below perc_thres th percentile values as erroneous for weekly precipitation

# Filter

In [5]:
if numClaims:
    # Calculate the number of unique counties before filtering
    total_counties_before = processed_claims_df['countyCode'].nunique()
    county_counts = processed_claims_df['countyCode'].value_counts()
    counties_with_enough_data = county_counts[county_counts >= m].index
    filtered_claims_df = processed_claims_df[processed_claims_df['countyCode'].isin(counties_with_enough_data)]
    
    # Calculate the number of unique counties after filtering
    total_counties_after = filtered_claims_df['countyCode'].nunique()
    
    # Calculate the number of counties that were filtered out
    counties_filtered_out = total_counties_before - total_counties_after
    
    # Print the number of counties filtered out
    print(f"Total counties filtered out: {counties_filtered_out}")

Total counties filtered out: 0


In [6]:
if precipPresent:
    # Group by countyCode and check if all values for precipitation_PRISM are 0
    counties_with_all_zero_precip = processed_claims_df.groupby('countyCode').filter(
        lambda x: (x['PRISM_mon_precipitation'] == 0).all())
    
    # Get the count of rows per county where all values are zero
    county_counts = counties_with_all_zero_precip['countyCode'].value_counts()
    
    # Print the counties and their counts
    print("Counties where all values of PRISM precipitation are 0:")
    for county, count in county_counts.items():
        print(f"{county}: {count}")
    print(len(county_counts))

    # Get the unique county codes to mask out
    counties_to_mask_out = counties_with_all_zero_precip['countyCode'].unique()
    
    # Mask out these counties from filtered_claims_df
    filtered_claims_df = filtered_claims_df[~filtered_claims_df['countyCode'].isin(counties_to_mask_out)]
    
    # Group by countyCode and check if all values for precipitation_PRISM are 0
    counties_with_all_zero_precip = processed_claims_df.groupby('countyCode').filter(
        lambda x: (x['ERA5_precipitation_30d_sum'] == 0).all())
    
    # Get the count of rows per county where all values are zero
    county_counts = counties_with_all_zero_precip['countyCode'].value_counts()
    
    # Print the counties and their counts
    print("Counties where all values of ERA5 precipitation are 0:")
    for county, count in county_counts.items():
        print(f"{county}: {count}")
    print(len(county_counts))

    # Get the unique county codes to mask out
    counties_to_mask_out = counties_with_all_zero_precip['countyCode'].unique()
    
    # Mask out these counties from filtered_claims_df
    filtered_claims_df = filtered_claims_df[~filtered_claims_df['countyCode'].isin(counties_to_mask_out)]

    # Group by countyCode and check if all values for precipitation_PRISM are 0
    counties_with_all_zero_precip = processed_claims_df.groupby('countyCode').filter(
        lambda x: (x['MSWEP_precipitation_30d_sum'] == 0).all())
    
    # Get the count of rows per county where all values are zero
    county_counts = counties_with_all_zero_precip['countyCode'].value_counts()
    
    # Print the counties and their counts
    print("Counties where all values of MSWEP precipitation are 0:")
    for county, count in county_counts.items():
        print(f"{county}: {count}")
    print(len(county_counts))

    # Get the unique county codes to mask out
    counties_to_mask_out = counties_with_all_zero_precip['countyCode'].unique()
    
    # Mask out these counties from filtered_claims_df
    filtered_claims_df = filtered_claims_df[~filtered_claims_df['countyCode'].isin(counties_to_mask_out)]

Counties where all values of PRISM precipitation are 0:
0
Counties where all values of ERA5 precipitation are 0:
0
Counties where all values of MSWEP precipitation are 0:
0


In [7]:
if buildingDamage:
    # Calculate the total number of records before filtering
    total_records_before = len(filtered_claims_df)
    
    # Filter the dataframe where buildingDamageAmount is more than $1000
    filtered_claims_df = filtered_claims_df[filtered_claims_df['buildingDamageAmount'] > 1000]
    
    # Calculate the total number of records after filtering
    total_records_after = len(filtered_claims_df)
    
    # Calculate the number of records that were filtered out
    records_filtered_out = total_records_before - total_records_after
    
    # Print the number of records filtered out
    print(f"Total claim records filtered out: {records_filtered_out}")

Total claim records filtered out: 715620


In [8]:
if monthlyMask:
    total_records_before = total_records_after
    filtered_claims_df = filtered_claims_df[filtered_claims_df['PRISM_mon_precipitation'].round(1) > mon_thres]
    total_records_after = len(filtered_claims_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total claim records filtered out under monthly PRISM: {records_filtered_out}")
    filtered_claims_df = filtered_claims_df[filtered_claims_df['MSWEP_precipitation_30d_sum'].round(1) > mon_thres]
    total_records_after = len(filtered_claims_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total claim records filtered out under monthly MSWEP: {records_filtered_out}")
    filtered_claims_df = filtered_claims_df[filtered_claims_df['MSWEP_precipitation_30d_sum'].round(1) > mon_thres]
    total_records_after = len(filtered_claims_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total claim records filtered out under monthly ERA5: {records_filtered_out}")

Total claim records filtered out under monthly PRISM: 31840
Total claim records filtered out under monthly MSWEP: 111910
Total claim records filtered out under monthly ERA5: 0


In [9]:
if percentileMask:
    total_records_before = total_records_after
    filtered_claims_df = filtered_claims_df[filtered_claims_df['ERA5_precipitation_30d_max_7d'].round(1) > perc_thres]
    total_records_after = len(filtered_claims_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total claim records filtered out under 7d ERA5: {records_filtered_out}")
    filtered_claims_df = filtered_claims_df[filtered_claims_df['MSWEP_precipitation_30d_max_7d'].round(1) > perc_thres]
    total_records_after = len(filtered_claims_df)
    records_filtered_out = total_records_before - total_records_after
    total_records_before = total_records_after
    print(f"Total claim records filtered out under 7d MSWEP: {records_filtered_out}")

Total claim records filtered out under 7d ERA5: 220041
Total claim records filtered out under 7d MSWEP: 0


In [10]:
print(f"final total records analyzed: {len(filtered_claims_df)}")

final total records analyzed: 1456847


# Add Return Periods

In [11]:
# Calculate PRISM return period
filtered_claims_df['returnPeriod_PRISM'] = filtered_claims_df['PRISM_mon_percentile'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

In [12]:
# MSWEP Return Periods
# First, convert MSWEP percentiles from 0-100 to 0-1 by dividing by 100
filtered_claims_df['returnPeriod_MSWEP_1d'] = filtered_claims_df['MSWEP_precipitation_30d_max_1d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_MSWEP_3d'] = filtered_claims_df['MSWEP_precipitation_30d_max_3d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_MSWEP_5d'] = filtered_claims_df['MSWEP_precipitation_30d_max_5d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_MSWEP_7d'] = filtered_claims_df['MSWEP_precipitation_30d_max_7d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_MSWEP_14d'] = filtered_claims_df['MSWEP_precipitation_30d_max_14d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_MSWEP_30d'] = filtered_claims_df['MSWEP_precipitation_30d_sum_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

In [13]:
# ERA5 Return Periods
# First, convert ERA5 percentiles from 0-100 to 0-1 by dividing by 100
filtered_claims_df['returnPeriod_ERA5_1d'] = filtered_claims_df['ERA5_precipitation_30d_max_1d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_ERA5_3d'] = filtered_claims_df['ERA5_precipitation_30d_max_3d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_ERA5_5d'] = filtered_claims_df['ERA5_precipitation_30d_max_5d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_ERA5_7d'] = filtered_claims_df['ERA5_precipitation_30d_max_7d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_ERA5_14d'] = filtered_claims_df['ERA5_precipitation_30d_max_14d_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)
filtered_claims_df['returnPeriod_ERA5_30d'] = filtered_claims_df['ERA5_precipitation_30d_sum_percentile_modeled'].apply(
    lambda x: 1000 if x >= 99.9 else 1 / (1 - (x / 100))
)

# Conversions

In [14]:
# Adjust units to mm
filtered_claims_df['ERA5_precipitation'] = filtered_claims_df['ERA5_precipitation']*1000
filtered_claims_df['ERA5_precipitation_30d_max_1d'] = filtered_claims_df['ERA5_precipitation_30d_max_1d']*1000
filtered_claims_df['ERA5_precipitation_30d_max_3d'] = filtered_claims_df['ERA5_precipitation_30d_max_3d']*1000
filtered_claims_df['ERA5_precipitation_30d_max_5d'] = filtered_claims_df['ERA5_precipitation_30d_max_5d']*1000
filtered_claims_df['ERA5_precipitation_30d_max_7d'] = filtered_claims_df['ERA5_precipitation_30d_max_7d']*1000
filtered_claims_df['ERA5_precipitation_30d_max_14d'] = filtered_claims_df['ERA5_precipitation_30d_max_14d']*1000
filtered_claims_df['ERA5_precipitation_30d_sum'] = filtered_claims_df['ERA5_precipitation_30d_sum']*1000

# Save

In [15]:
filtered_claims_df.to_csv('final_filtered_claims.csv')