In [7]:
import numpy as np
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Polygon
import pandas as pd
import Levenshtein as lev

# Load the raw records. `on_bad_lines="skip"` makes pandas drop malformed rows
# instead of aborting the read.
CSV_PATH = "cel_med.csv"
df = pd.read_csv(CSV_PATH, on_bad_lines="skip")


def compute_levenshtein_matrix(strings):
    """Return the pairwise Levenshtein distance matrix for ``strings``.

    Parameters
    ----------
    strings : sequence of str
        Input strings; duplicates are allowed.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(strings), len(strings))`` where entry
        ``[i, j]`` is the edit distance between ``strings[i]`` and
        ``strings[j]``.

    Notes
    -----
    The result is the same as calling ``lev.distance`` on every ordered pair,
    but the work is reduced in two ways: each distinct string is compared only
    once (repeated values reuse the result), and only the upper triangle is
    evaluated because the distance is symmetric with a zero diagonal.
    """
    strings = list(strings)

    # Map every input position to the index of its distinct value.
    slot_of = {}
    distinct = []
    inverse = np.empty(len(strings), dtype=np.intp)
    for position, value in enumerate(strings):
        slot = slot_of.get(value)
        if slot is None:
            slot = slot_of[value] = len(distinct)
            distinct.append(value)
        inverse[position] = slot

    # Distances between distinct values; the diagonal stays at zero.
    m = len(distinct)
    distinct_distances = np.zeros((m, m))
    for i in range(m):
        for j in range(i + 1, m):
            d = lev.distance(distinct[i], distinct[j])
            distinct_distances[i, j] = d
            distinct_distances[j, i] = d

    # Expand back to one row/column per original input string.
    return distinct_distances[np.ix_(inverse, inverse)]

# Day-level limits of the range a nanosecond-resolution pandas Timestamp can hold.
min_date = '1677-09-21'
max_date = '2262-04-11'


def _parse_bound(text, fallback):
    """Parse a bound string once, using ``fallback`` if it cannot be represented."""
    parsed = pd.to_datetime(text, errors='coerce')
    return parsed if isinstance(parsed, pd.Timestamp) else fallback


# Midnight of `min_date` is earlier than pd.Timestamp.min (1677-09-21 00:12:43...),
# so on pandas versions that parse to nanosecond resolution the string itself is
# out of range. Falling back to the exact Timestamp limits keeps the bound usable
# instead of raising on every comparison.
_MIN_TIMESTAMP = _parse_bound(min_date, pd.Timestamp.min)
_MAX_TIMESTAMP = _parse_bound(max_date, pd.Timestamp.max)


def filter_dates(date_str):
    """Parse one date value, returning ``pd.NaT`` when it is unusable.

    Parameters
    ----------
    date_str : object
        Typically a date string from the CSV; missing values (NaN/None) and
        already-parsed timestamps are accepted as well.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        A timezone-naive timestamp inside the supported range, or ``pd.NaT``
        if the value is missing, cannot be parsed, or falls outside the range.
    """
    try:
        # errors='coerce' maps unparseable and out-of-range values to NaT.
        date = pd.to_datetime(date_str, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        return pd.NaT

    # Covers NaT as well as the None that to_datetime returns for a None input.
    if not isinstance(date, pd.Timestamp):
        return pd.NaT

    # Aware and naive timestamps cannot be compared; normalise to naive UTC.
    if date.tzinfo is not None:
        date = date.tz_convert(None)

    if date < _MIN_TIMESTAMP or date > _MAX_TIMESTAMP:
        return pd.NaT
    return date


# Reference instant for expressing timestamps as seconds since the Unix epoch.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")

for date_col in ('startdate', 'enddate'):
    # Clean the raw values; anything unusable becomes NaT.
    df[date_col] = df[date_col].apply(filter_dates)

    # Convert to float seconds since the epoch. Going through a timedelta keeps
    # missing dates as NaN; reinterpreting the datetime storage as int64 would
    # turn NaT into the minimum int64 value, which a later dropna() cannot see.
    as_utc = pd.to_datetime(df[date_col], errors='coerce', utc=True)
    df[f'{date_col}_num'] = (as_utc - unix_epoch).dt.total_seconds()


In [8]:

locality_names = df['localityname'].fillna("").tolist()

# Work on distinct names only. The distance matrix and the MDS fit both grow
# with the square of the number of inputs, so embedding each distinct name once
# and broadcasting the result keeps the cost tied to the vocabulary size rather
# than the row count. Rows sharing a name also get exactly the same coordinates.
distinct_names = list(dict.fromkeys(locality_names))  # first-appearance order
position_of = {name: k for k, name in enumerate(distinct_names)}
name_codes = np.fromiter(
    (position_of[name] for name in locality_names),
    dtype=np.intp,
    count=len(locality_names),
)

# NOTE: this matrix is indexed by `distinct_names`, not by DataFrame row.
lev_distance_matrix = compute_levenshtein_matrix(distinct_names)

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=2024)
distinct_embedding = mds.fit_transform(lev_distance_matrix)

# One embedding row per DataFrame row, looked up through the name codes.
lev_features = distinct_embedding[name_codes]

df['lev_feature_1'] = lev_features[:, 0]
df['lev_feature_2'] = lev_features[:, 1]

# Keep only rows where every clustering input is present.
features = df[
    ['latitude1', 'longitude1', 'lev_feature_1', 'lev_feature_2', 'startdate_num', 'enddate_num']
].dropna()

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


KeyboardInterrupt



In [None]:
# Cluster the standardised feature rows; DBSCAN marks noise points with -1.
dbscan = DBSCAN(min_samples=3, eps=0.1)
clusters = dbscan.fit(features_scaled).labels_

# Rows that were dropped before scaling never received a label, so start every
# row at -1 and then fill in the labels for the rows that were clustered.
df['cluster'] = -1
df.loc[features.index, 'cluster'] = clusters

def create_convex_hulls(df):
    """Build one convex-hull polygon per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cluster``, ``longitude1`` and ``latitude1``.
        Rows with ``cluster == -1`` are ignored.

    Returns
    -------
    list of dict
        One ``{'cluster': label, 'geometry': Polygon}`` entry per cluster whose
        hull could be built, in order of first appearance of the label. Clusters
        with fewer than three distinct points, or whose points are degenerate
        (for example all on one line), are skipped.
    """
    polygons = []

    # Group once instead of re-filtering the whole frame for every label.
    clustered = df[df['cluster'] != -1]
    for cluster, members in clustered.groupby('cluster', sort=False):
        coords = members[['longitude1', 'latitude1']].to_numpy(dtype=float)

        # Repeated coordinates add nothing to a hull; count distinct points only.
        points = np.unique(coords, axis=0)
        if len(points) < 3:
            continue

        try:
            hull = ConvexHull(points)
        except (RuntimeError, ValueError):
            # Qhull reports degenerate input through a RuntimeError subclass and
            # invalid coordinates (e.g. NaN) through ValueError. Only those are
            # treated as "no hull for this cluster"; other errors propagate.
            continue

        polygons.append({'cluster': cluster, 'geometry': Polygon(points[hull.vertices])})

    return polygons

In [None]:
# Bounding box used to restrict both the data and the map view.
us_long_bounds = [-125, -66.93457]
us_lat_bounds = [24.396308, 49.384358]

# Keep only records whose coordinates fall inside the box (bounds inclusive).
inside_box = (
    df['longitude1'].between(us_long_bounds[0], us_long_bounds[1])
    & df['latitude1'].between(us_lat_bounds[0], us_lat_bounds[1])
)
data_us = df[inside_box]

polygons = create_convex_hulls(data_us)

gdf = gpd.GeoDataFrame(polygons)

# NOTE(review): `gpd.datasets` was deprecated in geopandas 0.14 and removed in
# 1.0. On newer versions this line needs a local copy of the Natural Earth
# "admin 0 countries" file passed to `gpd.read_file` instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
us = world[(world.name == "United States of America")]

# Spatially slice the country geometry to the same bounding box.
us = us.cx[us_long_bounds[0]:us_long_bounds[1], us_lat_bounds[0]:us_lat_bounds[1]]

fig, ax = plt.subplots(figsize=(15, 10))
us.plot(ax=ax, color='white', edgecolor='black')

# With no polygons the GeoDataFrame has no geometry to draw, so plotting it
# would fail; show the base map on its own in that case.
if polygons:
    gdf.plot(ax=ax, column='cluster', cmap='tab20', legend=True, alpha=0.5)

ax.set_xlim(us_long_bounds)
ax.set_ylim(us_lat_bounds)
ax.set_aspect('equal')

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('DBSCAN Clusters as Regions (Polygons) Overlaid on Continental US Map')
plt.show()
