In [None]:
import pandas as pd
import numpy as np
!pip install st_dbscan
from st_dbscan import ST_DBSCAN
from datetime import datetime
import time
import matplotlib.pyplot as plt
import os
import geopandas as gpd
import pandas as pd
from sklearn.cluster import DBSCAN
import seaborn as sns
from itertools import product
from sklearn.preprocessing import MinMaxScaler
from ST_Cluster import *

# Clustering Params

In [None]:
# --- Run switches ----------------------------------------------------------
# damage_only: when True, the preprocessing cell keeps only claims whose
#              'buildingDamageAmount' is at least 1.
# save:        when True, the full sensitivity table and the full clustered
#              claims table are written to disk.
# save_small:  when True, a compact per-claim export is written to disk.
damage_only = True
save = False
save_small = True

# --- Parameter grid handed to sensitivity_analysis -------------------------
# NOTE(review): units are not visible in this notebook; the time values look
# like days (the claims get a 'daysSinceStart' column) and the space values
# presumably follow whatever distance unit ST_Cluster uses -- confirm there.
space_thres_list = [0.5, 1, 2, 3, 4]
time_thres_list = [3, 5, 7, 14, 30]
num_thres_list = [3, 5, 7, 10]

# --- Output location -------------------------------------------------------
# Every CSV produced by this notebook lands in this folder; it is created
# up front (no error if it already exists).
temp_folder = "Clusters/2025_all"
os.makedirs(temp_folder, exist_ok=True)

# Claims Data Load

In [None]:
# Read the raw claims table from CSV into a DataFrame.
# low_memory=False makes pandas parse the file as a whole instead of in
# chunks, which avoids mixed-dtype inference on individual columns.
claims = pd.read_csv(
    "../3_Failure_Modes/FimaNfipClaims_Aug2025.csv",
    low_memory=False,
)

## Preprocess

In [None]:
# Preprocess the raw claims table for clustering.
#
# Reads:  claims (raw DataFrame), damage_only (bool switch).
# Writes: claims (filtered, with added columns 'daysSinceStart', 'date',
#         'index' and a normalised 'countyCode'), origin_date, bounds.

# --- Required fields -------------------------------------------------------
# A claim needs a loss date and a coordinate pair. Coordinates are coerced to
# numeric first so that blank or otherwise non-numeric entries become NaN and
# are removed by the same dropna. (Comparing against '' does nothing on
# numeric columns, and string coordinates would break the numeric bounds
# test further down.)
claims = claims.assign(
    latitude=pd.to_numeric(claims['latitude'], errors='coerce'),
    longitude=pd.to_numeric(claims['longitude'], errors='coerce'),
).dropna(subset=['dateOfLoss', 'latitude', 'longitude'])

# --- Optional damage filter ------------------------------------------------
# Keep only claims with a building damage amount of at least 1.
if damage_only:
    claims = claims[claims['buildingDamageAmount'] >= 1]

# Work on an independent frame from here on: the column assignments below
# would otherwise target a filtered slice and raise SettingWithCopyWarning.
claims = claims.copy()

# --- Time axis -------------------------------------------------------------
# Parse the loss date, drop any timezone information, and express each claim
# as a whole-day offset from the earliest loss date still in the table.
# The origin is deliberately taken here, before the county and bounding-box
# filters, so 'daysSinceStart' keeps the same reference as before.
claims['dateOfLoss'] = pd.to_datetime(claims['dateOfLoss']).dt.tz_localize(None)
origin_date = claims['dateOfLoss'].min()
claims['daysSinceStart'] = (claims['dateOfLoss'] - origin_date).dt.days
# 'date' is rebuilt from the whole-day offset, i.e. the loss date at day
# resolution.
claims['date'] = pd.to_datetime(claims['daysSinceStart'], unit='D', origin=origin_date)

# --- Row identifier --------------------------------------------------------
# Preserve the current row label as an ordinary column.
claims['index'] = claims.index

# --- County code -----------------------------------------------------------
# Drop claims without a county code, then normalise the code to a
# zero-padded 5-character string (e.g. 1001.0 -> '01001').
claims = claims.dropna(subset=['countyCode']).copy()
claims['countyCode'] = claims['countyCode'].astype(int).astype(str).str.zfill(5)

# --- Spatial extent --------------------------------------------------------
# Define the bounds for the contiguous US
bounds = {
    "min_lon": -130,
    "max_lon": -65,
    "min_lat": 24,
    "max_lat": 50
}

# Keep only claims inside the bounding box (edges inclusive). Missing
# coordinates were already removed above, so no further dropna is needed.
in_bounds = (
    (claims['longitude'] >= bounds["min_lon"]) &
    (claims['longitude'] <= bounds["max_lon"]) &
    (claims['latitude'] >= bounds["min_lat"]) &
    (claims['latitude'] <= bounds["max_lat"])
)
# Copy so that later cells can add columns without touching a slice.
claims = claims.loc[in_bounds].copy()

In [None]:
# Run the clustering sweep over the space / time / count threshold lists.
# sensitivity_analysis is not defined in this notebook (presumably it comes
# from the ST_Cluster star import -- confirm). It returns two objects: the
# claims table, rebound to `claims`, and a `sensitivities` table.
claims, sensitivities = sensitivity_analysis(
    claims,
    space_thres_list,
    time_thres_list,
    num_thres_list,
)

In [None]:
if save:
    # Full outputs, written without the DataFrame index.

    # Sensitivity-analysis table.
    sensitivities_path = f'{temp_folder}/cluster_sensitivities_cl.csv'
    sensitivities.to_csv(sensitivities_path, index=False)

    # Claims table as returned by the sensitivity analysis.
    clustered_claims_path = f"{temp_folder}/clustered_claims_sensitivity.csv"
    claims.to_csv(clustered_claims_path, index=False)

In [None]:
if save_small:
    # Compact export: claim id, loss date, coordinates and one cluster-label
    # column, written without the DataFrame index.
    # NOTE(review): 'st_cluster_3_5_7' is presumably the label column for one
    # threshold combination from the sweep -- confirm the parameter order
    # against ST_Cluster.
    export_columns = ['id', 'dateOfLoss', 'longitude', 'latitude', 'st_cluster_3_5_7']
    claims[export_columns].to_csv(f"{temp_folder}/clustered_claims_export.csv", index=False)