In [1]:
import pandas as pd
import geopandas as gpd
import h3, shapely, us, os
import matplotlib.pyplot as plt
import urllib.parse
from sqlalchemy import create_engine  
from shapely.ops import unary_union
from shapely.geometry import mapping, Polygon
import xgboost as xgb

from pathlib import Path



### Prepare US state geo data

In [2]:
def generate_state_abbreviations():
    return {state.name: state.abbr for state in us.STATES_AND_TERRITORIES}

state_abbreviations = generate_state_abbreviations()
print(state_abbreviations)

{'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY', 'American Samoa': 'AS', 'Guam': 'GU', 'Northern Mariana Islands': 'MP', 'Puer

In [3]:
# Load US states GeoDataFrame
us_states = gpd.read_file("https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json")
us_states['area'] = us_states.geometry.area
us_states = us_states.sort_values(by='area', ascending=True)
us_states['st'] = us_states['NAME'].map(state_abbreviations)
us_states.head(5)


  us_states['area'] = us_states.geometry.area


Unnamed: 0,GEO_ID,STATE,NAME,LSAD,CENSUSAREA,geometry,area,st
8,0400000US11,11,District of Columbia,,61.048,"POLYGON ((-77.03860 38.79151, -77.03890 38.800...",0.01838,
39,0400000US44,44,Rhode Island,,1033.814,"MULTIPOLYGON (((-71.38359 41.46478, -71.38928 ...",0.309236,RI
7,0400000US10,10,Delaware,,1948.543,"MULTIPOLYGON (((-75.56493 39.58325, -75.57627 ...",0.545358,DE
51,0400000US72,72,Puerto Rico,,3423.775,"MULTIPOLYGON (((-65.32770 18.29584, -65.33745 ...",0.76508,PR
6,0400000US09,9,Connecticut,,4842.355,"POLYGON ((-71.79924 42.00807, -71.79792 41.935...",1.395161,CT


### For each US state, spatial join and aggregate Tornado data into H3 hexagons

In [4]:
def calculate_years_diff(dates):
    if len(dates) > 1:
        sorted_dates = sorted(dates)
        return [(sorted_dates[i + 1] - sorted_dates[i]).days / 365 for i in range(len(sorted_dates) - 1)]
    else:
        return []

In [5]:
def calculate_days_diff(dates):
    if len(dates) > 1:
        sorted_dates = sorted(dates)
        return [(sorted_dates[i + 1] - sorted_dates[i]).days for i in range(len(sorted_dates) - 1)]
    else:
        return []


In [6]:
import os
import glob
import geopandas as gpd
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Set directory path
directory = " tornado/data/h3_by_state/"

# Change working directory
os.chdir(directory)

output_directory = "h3_hexagons_geopackage"


# Loop through each state name
for state_name in us_states['st']:
    try:
        print("________________________________________________________________")
        
        # Set directory path
        directory = " tornado/data/h3_by_state/"

        # Change working directory
        os.chdir(directory)

        # Load H3 by individual state: i.e. AK for Alaska
        print(f"Read in geoDataframe for : {state_name}")

        state = f"{state_name}"

        # Filter the state records to a single state
        st_boundary = us_states[us_states['st'] == f"{state}"]
        st_boundary = st_boundary.to_crs(4326)

        print(f"Loading H3 id's and polygons for: {state}")

        # Build variable to call the full_name of the state
        full_name = st_boundary['NAME'].iloc[0]

        print("We've created the two state objects: full_name and abbreviated name")

        hexagons = gpd.read_file(f"{directory}h3_{full_name}.gpkg")
        hexagons['state'] = f"{state}"
        lookup = hexagons['state'][0]

        # Load event data {tornadoes} by individual state
        event = gpd.read_file(" tornado/data/input/tornado_tracks.shp")
        event = event.to_crs(4326)
        event_df = event[event['st'] == f"{lookup}"]
        print("Created event geoDataframe")


        # Clip the data using GeoPandas clip
        event_df = gpd.clip(event_df, st_boundary)
        event_df['buffer_geom'] = event_df.buffer(0.008)
        event_df = event_df.drop('geometry', axis=1)
        event_df = event_df.rename(columns={'buffer_geom': 'geometry'})
        print(f"Loading Tornado event layer by the state of: {lookup}")

        # Convert column A from float to string and remove ".0"
        event_df['yr'] = event_df['yr'].astype(int).astype(str)
        event_df['mo'] = event_df['mo'].astype(int).astype(str)
        event_df['dy'] = event_df['dy'].astype(int).astype(str)
        #event_df['date'] = pd.to_datetime(event_df['date'], errors='coerce').dt.date
        event_df = event_df[['geometry', 'yr', 'mo', 'dy', 'date', 'time', 'tz', 'st', 'mag', 'inj',
        'fat', 'loss', 'closs', 'len', 'wid']]
        print("Start Join of geoDataframe")

        # Use geopandas to spatial join *intersect* the two tables
        join_df = gpd.sjoin(hexagons, event_df, how='inner', predicate='intersects')
        print(f"Resulted join has a record count of: {len(join_df.index)}")

        # Sort and group the data to produce aggregate layer
        join_df = join_df.sort_values(by=['h3_hexagon', 'date'], ascending=True)
        group_df = join_df.groupby('h3_hexagon').agg({
            'yr': list, 
            'mo': list, 
            'date': list, 
            'mag': list, 
            'loss': list, 
            'closs': list, 
            'inj': list, 
            'fat': list, 
            'len': list, 
            'wid': list
            })
        print("Aggregating event geoDataframe")

        # Calculate days difference between dates in the 'date' column
        #group_df['days_diff'] = group_df['date'].apply(calculate_days_diff)
        
        #group_df['years_diff'] = group_df['date'].apply(calculate_years_diff)


        print("Dissolving event geoDataframe")
        # Run a Group By and Sum operation to produce aggregate layer
        dissolve_df = join_df.dissolve(
                by="h3_hexagon",
                aggfunc={'state': 'count',
                        'mag':'mean', 'inj': 'sum', 
                        'fat': 'sum', 'loss': 'sum', 
                        'closs': 'sum', 'len': 'mean', 'wid': 'mean'})


        dissolve_df = dissolve_df.rename(columns={'h3_hexagon':'h3_id', 'name':'tornado_count'})

        # Merge the two dataframes to produce final aggregate layer
        final = dissolve_df.join(group_df, lsuffix='_h3_history', rsuffix='_stats')
        final = final.rename(columns={'state': 'tornado_count'})

        df_melt = final.assign(names=final.date.str.split(","))
        
        print("Prepare Melt event geoDataframe")
        final = df_melt.date.apply(pd.Series) \
        .merge(df_melt, right_index=True, left_index=True)

        #final_stats = final.drop([0, 1, 'geometry'], axis=1)
        final_geom = final.drop([0], axis=1)

        #final.to_csv(f"data/{lookup}_final.csv")

        print("Successfully dissolved...")

        # Save to CSV file
        final_geom.to_csv(f"data/{lookup}_final_geom.csv")
        #final_stats.to_csv(f"data/{lookup}_final_stats.csv")
        print("Wrote to csv...")
    except Exception as e:
        print(f"Error processing {state_name}: {e}")


________________________________________________________________
Read in geoDataframe for : nan
Loading H3 id's and polygons for: nan
Error processing nan: single positional indexer is out-of-bounds
________________________________________________________________
Read in geoDataframe for : RI
Loading H3 id's and polygons for: RI
We've created the two state objects: full_name and abbreviated name
Created event geoDataframe
Loading Tornado event layer by the state of: RI
Start Join of geoDataframe
Resulted join has a record count of: 944
Aggregating event geoDataframe
Dissolving event geoDataframe
Prepare Melt event geoDataframe
Successfully dissolved...
Wrote to csv...
________________________________________________________________
Read in geoDataframe for : DE
Loading H3 id's and polygons for: DE
We've created the two state objects: full_name and abbreviated name
Created event geoDataframe
Loading Tornado event layer by the state of: DE
Start Join of geoDataframe
Resulted join has a r

Exception ignored in: <function BaseGeometry.__del__ at 0x126fc0700>
Traceback (most recent call last):
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/shapely/geometry/base.py", line 209, in __del__
    self._empty(val=None)
  File "/Users/ /miniforge3/envs/awesome/lib/python3.9/site-packages/shapely/geometry/base.py", line 194, in _empty
    self._lgeos.GEOSGeom_destroy(self.__geom__)
KeyboardInterrupt: 


Created event geoDataframe
Loading Tornado event layer by the state of: NH
Start Join of geoDataframe
Resulted join has a record count of: 9619
Aggregating event geoDataframe
Dissolving event geoDataframe
Prepare Melt event geoDataframe
Successfully dissolved...
Wrote to csv...
________________________________________________________________
Read in geoDataframe for : VT
Loading H3 id's and polygons for: VT
We've created the two state objects: full_name and abbreviated name
Created event geoDataframe
Loading Tornado event layer by the state of: VT
Start Join of geoDataframe
Resulted join has a record count of: 3453
Aggregating event geoDataframe
Dissolving event geoDataframe
Prepare Melt event geoDataframe


In [1]:
import os
import glob
import geopandas as gpd
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Set directory path
directory = " tornado/data/h3_by_state/"

# Change working directory
os.chdir(directory)

output_directory = "h3_hexagons_geopackage"

# Loop through each state name
for state_name in us_states['st']:
    try:
        print("________________________________________________________________")

        # Set directory path
        directory = " tornado/data/h3_by_state/"

        # Change working directory
        os.chdir(directory)

        # Load H3 by individual state: i.e. AK for Alaska
        print(f"Read in geoDataframe for : {state_name}")

        state = f"{state_name}"

        # Filter the state records to a single state
        st_boundary = us_states[us_states['st'] == f"{state}"]
        st_boundary = st_boundary.to_crs(4326)

        print(f"Loading H3 id's and polygons for: {state}")

        # Build variable to call the full_name of the state
        full_name = st_boundary['NAME'].iloc[0]

        print("We've created the two state objects: full_name and abbreviated name")

        hexagons = gpd.read_file(f"{directory}h3_{full_name}.gpkg")
        hexagons['state'] = f"{state}"
        lookup = hexagons['state'][0]

        # Load event data {tornadoes} by individual state
        event = gpd.read_file(" tornado/data/input/tornado_tracks.shp")
        event = event.to_crs(4326)
        event_df = event[event['st'] == f"{lookup}"]
        print("Created event geoDataframe")

        # Clip the data using GeoPandas clip
        event_df = gpd.clip(event_df, st_boundary)
        event_df['buffer_geom'] = event_df.buffer(0.008)
        event_df = event_df.drop('geometry', axis=1)
        event_df = event_df.rename(columns={'buffer_geom': 'geometry'})
        print(f"Loading Tornado event layer by the state of: {lookup}")

        # Convert column A from float to string and remove ".0"
        event_df['yr'] = event_df['yr'].astype(int).astype(str)
        event_df['mo'] = event_df['mo'].astype(int).astype(str)
        event_df['dy'] = event_df['dy'].astype(int).astype(str)
        event_df['date'] = pd.to_datetime(event_df['date'], errors='coerce')
        event_df = event_df[['geometry', 'yr', 'mo', 'dy', 'date', 'time', 'tz', 'st', 'mag', 'inj',
                             'fat', 'loss', 'closs', 'len', 'wid']]
        print("Start Join of geoDataframe")

        # Use geopandas to spatial join *intersect* the two tables
        join_df = gpd.sjoin(hexagons, event_df, how='inner', predicate='intersects')
        print(f"Resulted join has a record count of: {len(join_df.index)}")

        # Sort and group the data to produce aggregate layer
        join_df = join_df.sort_values(by=['h3_hexagon', 'date'], ascending=True)
        
        group_df = join_df.groupby('h3_hexagon').agg({
            'yr': list,
            'mo': list,
            'date': list,
            'mag': list,
            'loss': list,
            'closs': list,
            'inj': list,
            'fat': list,
            'len': list,
            'wid': list
        })
        print("Aggregating event geoDataframe")

        # Calculate days difference between dates in the 'date' column
        group_df['days_diff'] = group_df['date'].apply(calculate_days_diff)

        # Calculate years difference between dates in the 'date' column
        group_df['years_diff'] = group_df['date'].apply(calculate_years_diff)

        final['years_diff'] = group_df['years_diff']


# ... (rest of the code)

        # Merge the two dataframes to produce final aggregate layer
        final = dissolve_df.join(group_df, lsuffix='_h3_history', rsuffix='_stats')
        final = final.rename(columns={'state': 'tornado_count'})

        df_melt = final.assign(names=final.date.str.split(","))

        print("Prepare Melt event geoDataframe")
        final = df_melt.date.apply(pd.Series) \
            .merge(df_melt, right_index=True, left_index=True)

        # final_stats = final.drop([0, 1, 'geometry'], axis=1)
        final_geom = final.drop([0], axis=1)

        print("Successfully dissolved...")

        # Save to CSV file
        final_geom.to_csv(f"data/{lookup}_final_geom.csv")

        print("Wrote to csv...")
    except Exception as e:
        print(f"Error processing {state_name}: {e}")

NameError: name 'us_states' is not defined