In [None]:
# Import Packages 

In [1]:
import os
import re
from copy import copy

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point
from sklearn.neighbors import BallTree

In [209]:
def average_str(s):
    """Return the mean of a "-"-separated numeric range written as text.

    Only the text before the first " (" is parsed, so a trailing
    parenthetical note is ignored.  That leading part is split on "-" and
    the pieces are averaged; a string holding a single number therefore
    returns that number.

    Parameters
    ----------
    s : str
        Text such as "0.72-0.76 (at 60 F)".

    Returns
    -------
    float
        Mean of the numbers in the leading part of ``s``.

    Raises
    ------
    ValueError
        If any "-"-separated piece cannot be parsed by ``float``.
    """
    bounds = s.split(" (")[0].split("-")
    # The value used to be computed but never returned, so callers received None.
    return float(np.average([float(bound) for bound in bounds]))
    
def add_sg_data_to_tank_data(tank_data, chemical_data, cas_column="CAS#", sg_column="Specific gravity"):
    """Attach specific-gravity values to each row of ``tank_data``.

    For every entry of ``tank_data["cas_number"]`` the rows of
    ``chemical_data`` whose ``cas_column`` value is among that entry's CAS
    numbers are looked up, and their ``sg_column`` values are stored as a
    list in a new ``"sg"`` column.

    Parameters
    ----------
    tank_data : pandas.DataFrame
        Must contain a ``"cas_number"`` column.  Each entry is a list-like of
        CAS numbers, a single scalar CAS number, or a missing value
        (None / NaN).
    chemical_data : pandas.DataFrame
        Lookup table containing ``cas_column`` and ``sg_column``.
    cas_column : str, default "CAS#"
        Column of ``chemical_data`` holding CAS numbers.
    sg_column : str, default "Specific gravity"
        Column of ``chemical_data`` holding specific gravity.

    Returns
    -------
    pandas.DataFrame
        The same ``tank_data`` object, modified in place, with an ``"sg"``
        column holding a list of matched values, or None when the entry is
        missing or nothing matched.

    Notes
    -----
    NOTE(review): the earlier version collected the whole matching subset of
    ``chemical_data`` per row (and then failed on an undefined name before
    storing anything).  Storing just the ``sg_column`` values is assumed to
    be the intent — confirm.
    """
    sg_data = []
    for cas_number in tank_data["cas_number"]:
        if pd.api.types.is_scalar(cas_number):
            # Covers None, float NaN and pd.NA without relying on identity
            # checks against a particular NaN object.
            if pd.isna(cas_number):
                sg_data.append(None)
                continue
            # Series.isin only accepts list-likes, so wrap a lone CAS number.
            cas_number = [cas_number]
        matches = chemical_data.loc[chemical_data[cas_column].isin(cas_number), sg_column]
        sg_data.append(matches.tolist() if len(matches) > 0 else None)
    # An object Series keeps each per-row list intact as a single cell value.
    tank_data["sg"] = pd.Series(sg_data, index=tank_data.index, dtype=object)
    return tank_data

In [2]:
# Load the tile-level annotation GeoJSON.
tile_level_annotations = gpd.read_file(
    "/hpc/group/borsuklab/ast/tile_level_annotation_multiple_capture_date_neighbor_tile_removed/tile_level_annotation_multiple_capture_date_neighbor_tile_removed.geojson"
)

In [3]:
# Read in tri data #/work/csr33/spatial_matching/tri/
# The path is relative, so the CSV is resolved against the notebook's working directory.
tri_2022_us_path = "2022_us.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed value types.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# mixed-dtype warning — confirm.
tri_2022_us = pd.read_csv(tri_2022_us_path, low_memory=False)

  tri_2022_us = pd.read_csv(tri_2022_us_path)


In [4]:
# Match every tile annotation to its nearest TRI facility and attach the
# chemicals reported at that facility.

# Largest great-circle distance at which a facility still counts as a match.
# NOTE(review): the previous cutoff was `dist > 0.1` measured on raw lon/lat
# degrees (about 11 km of latitude) under a comment that said "1 km".  The
# stated 1 km is used here; raise this constant if the wider radius was intended.
MAX_MATCH_DISTANCE_KM = 1.0
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius; converts haversine radians to km

#add geometries
Geometry = [Point(xy) for xy in zip(tri_2022_us['13. LONGITUDE'], tri_2022_us['12. LATITUDE'])]
tri_2022_us = gpd.GeoDataFrame(tri_2022_us, crs="EPSG:4326", geometry=Geometry)

#remove rows without locations
tri_2022_us = tri_2022_us.dropna(subset=["12. LATITUDE", "13. LONGITUDE"])

#subset tri data based on naics codes
naics_industry_codes = pd.read_csv('/hpc/home/csr33/spatial-match-ast-chemicals/naics_industry_keys.csv')
tri_2022_us = tri_2022_us[tri_2022_us["19. INDUSTRY SECTOR CODE"].isin(naics_industry_codes["2022 NAICS US Code"].tolist())]

# Get unique tri locations (one row per distinct latitude/longitude pair)
unique_tri_2022_us_values, unique_tri_2022_us_indices = np.unique(
    tri_2022_us[["12. LATITUDE", "13. LONGITUDE"]].values, return_index=True, axis=0
)
tri_2022_us_unique_locations = tri_2022_us.iloc[unique_tri_2022_us_indices]

# Create a BallTree for quick nearest neighbor lookup.
# The haversine metric expects [latitude, longitude] in radians and returns
# distances in radians on the unit sphere.
facility_coords_rad = np.radians(
    tri_2022_us_unique_locations[["12. LATITUDE", "13. LONGITUDE"]].to_numpy(dtype=float)
)
btree = BallTree(facility_coords_rad, metric="haversine")

# Find closest point for each polygon (one vectorised query for all tiles).
# Assumes tile_level_annotations is in lon/lat degrees like the TRI points — TODO confirm its CRS.
if len(tile_level_annotations) > 0:
    tile_points = tile_level_annotations.geometry.representative_point()
    tile_coords_rad = np.radians(np.column_stack([tile_points.y, tile_points.x]))
    dist_rad, idx = btree.query(tile_coords_rad, k=1)
    dist_km = dist_rad[:, 0] * EARTH_RADIUS_KM
    facility_points = list(tri_2022_us_unique_locations.geometry)
    closest_points = [
        facility_points[i] if d <= MAX_MATCH_DISTANCE_KM else None  # None: nothing within range
        for i, d in zip(idx[:, 0], dist_km)
    ]
else:
    closest_points = []

# Add closest points back to polygons GeoDataFrame
tile_level_annotations['closest_point'] = closest_points

# Group by 'geometry' and collect the chemical names and CAS numbers reported at each location into lists
tri_2022_us_chemical = tri_2022_us.groupby('geometry').agg({"34. CHEMICAL": list, "37. CAS#": list}).reset_index()

#add chemical data to tile level annotations
merged_df = pd.merge(tile_level_annotations, tri_2022_us_chemical, left_on='closest_point', right_on='geometry', how='left')
# The right-hand keys are unique, so a left merge keeps one row per tile in the original order.
assert len(merged_df) == len(tile_level_annotations), "merge changed the number of tile rows"
# Assign by position: merged_df carries a fresh 0..n-1 index, so index-aligned
# assignment would only be correct if tile_level_annotations had that same index.
tile_level_annotations["chemical_name"] = merged_df["34. CHEMICAL"].to_numpy()
tile_level_annotations["cas_number"] = merged_df["37. CAS#"].to_numpy()

In [215]:
# Scratch check: floor-dividing the length of a 7-element list by 2 gives 3.
# NOTE(review): the result is not assigned to anything; this looks like a leftover experiment.
len([1,2,2,3,4,5,6])//2

3

# Read in and format chemical data

In [183]:
# Load the NIOSH Pocket Guide table, then drop, in order:
#   1. rows with no 'Specific gravity' entry,
#   2. rows whose entry mentions 'metal' (any letter case),
#   3. rows whose entry is only a question mark.
chemical_data = (
    pd.read_csv("/hpc/group/borsuklab/csr33/chemical_data/niosh_pocket_guide/NIOSH Pocket Guide.csv")
    .dropna(subset=['Specific gravity'])
    .loc[lambda frame: ~frame['Specific gravity'].str.contains('metal', case=False)]
    .loc[lambda frame: ~frame['Specific gravity'].isin(["?", "? "])]
)

# Entries containing "-" are passed through average_str and turned back into text;
# every other entry is kept unchanged.
chemical_data['Specific gravity'] = chemical_data['Specific gravity'].map(
    lambda entry: str(average_str(entry)) if "-" in entry else entry
)

# Convert each entry to a float using the first decimal number found in its text.
chemical_data['Specific gravity'] = chemical_data['Specific gravity'].map(
    lambda entry: float(re.search(r'(\d+(\.\d+)?)', entry).group(0))
)

In [203]:
# Run the specific-gravity lookup for the tile annotations against the cleaned
# chemical table and keep the returned frame.
tile_level_annotations = add_sg_data_to_tank_data(
    tank_data=tile_level_annotations,
    chemical_data=chemical_data,
)

In [211]:
# Load buckling_scaled_simulated_data.parquet with the pyarrow engine.
# `os` is imported together with the other packages in the import cell at the
# top of the notebook rather than here.
ast_scaled_data = pd.read_parquet(
    os.path.join("/work/csr33/fragility", "model_data", "buckling", "buckling_scaled_simulated_data.parquet"),
    engine='pyarrow',
)

In [212]:
# Display the loaded frame; the rendered output below is truncated to the first and last rows.
ast_scaled_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,diameter,external_tank_height,tank_shell_thickness,tank_contents_specific_gravity,inundation,velocity,flotation_Y,sliding_Y,flood_shell_buckling_Y,diameter_scaled,inundation_scaled,external_tank_height_scaled,tank_contents_specific_gravity_scaled,velocity_scaled
id,category,__null_dask_index__,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
61768,Category5,38284000,4.8,6.367624,0.005,0.800623,8.5,4.189224,1.0,0.0,1.0,-1.145026,-0.220864,-0.731698,-0.831360,-0.304363
61768,Category5,38284001,4.8,7.224267,0.005,0.798770,8.5,5.727290,1.0,0.0,1.0,-1.145026,-0.220864,-0.520394,-0.881066,0.263193
61768,Category5,38284002,4.8,8.554616,0.005,0.825083,8.5,5.415298,1.0,0.0,1.0,-1.145026,-0.220864,-0.192244,-0.175384,0.148066
61768,Category5,38284003,4.8,8.631876,0.005,0.794375,8.5,4.833314,1.0,0.0,1.0,-1.145026,-0.220864,-0.173187,-0.998929,-0.066690
61768,Category5,38284004,4.8,5.835329,0.005,0.807424,8.5,4.681660,1.0,0.0,1.0,-1.145026,-0.220864,-0.862996,-0.648976,-0.122651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127438,Category4,34455595,7.8,16.724643,0.005,0.917315,13.5,5.925903,1.0,0.0,1.0,-0.942586,0.610214,1.823015,2.298137,0.336483
127438,Category4,34455596,7.8,17.993693,0.005,0.908443,13.5,7.843119,1.0,0.0,1.0,-0.942586,0.610214,2.136044,2.060221,1.043948
127438,Category4,34455597,7.8,7.833560,0.005,0.818688,13.5,7.123234,1.0,0.0,1.0,-0.942586,0.610214,-0.370103,-0.346896,0.778306
127438,Category4,34455598,7.8,14.493391,0.005,0.814214,13.5,5.108177,1.0,0.0,1.0,-0.942586,0.610214,1.272643,-0.466871,0.034737
