In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
from pathlib import Path
from os.path import join
os.environ["USE_PYGEOS"] = "0"
import geopandas as gpd
import pandas as pd
import numpy as np

from util.files import *
from util.const import *
from util.ddfs import *

In [3]:
# County under study. Hard-coded for now; the plan is to pass FIPS in
# as an argument eventually.
FIPS = '34007'
# State abbreviation and nation are hard-coded too; eventually both
# should be derived from FIPS.
STATEABBR, NATION = 'NJ', 'US'

# Prepare data for ensemble generation

In [4]:
# Generate structure ensemble
# Merge hazard data in
# Sample from the depth grids
# Add our vulnerability uncertainty
# (it's conditioned on the depth value in 
# a particular state of the world)

## Load and subset exposure data

In [5]:
# All exposure inputs for this county live in the same directory
fips_exp_dir = join(EXP_DIR_I, FIPS)

# Single family home structures
nsi_struct = gpd.read_file(join(fips_exp_dir, 'nsi_sf.gpkg'))
# fd_id -> reference ids
nsi_ref = pd.read_parquet(join(fips_exp_dir, 'nsi_ref.pqt'))
# fd_id -> depths
nsi_depths = pd.read_parquet(join(fips_exp_dir, 'nsi_depths.pqt'))
# fd_id -> flood zone
nsi_fz = pd.read_parquet(join(fips_exp_dir, 'nsi_fz.pqt'))

In [6]:
# Need foundation type, number stories, structure value
# for our ensemble. Structure value will be the center of 
# the distribution and will be passed to the loss estimation
# function. Foundation type will be drawn from the implicit
# distribution in the NSI data. For each census block, 
# we are going to get the multinomial probabilities of 
# a building having a certain foundation type & number of stories
# Ideally, we would do this conditioned on prefirm but the
# building year column is based on median year built from ACS
# data
# From the foundation type that is drawn from the multinomial in 
# the ensemble, we will get the FFE from the distribution 
# defined in the code for the Wing et al. 2022 paper
# The point estimate version will just use default values

# Only these structure attributes are needed downstream
keep_cols = ['fd_id', 'occtype', 'found_type', 'val_struct']

# Trim nsi_struct to those columns, then attach the reference ids
# and the flood zone of every structure (joined on fd_id)
nsi_res = (
    nsi_struct[keep_cols]
    .merge(nsi_ref, on='fd_id')
    .merge(nsi_fz[['fd_id', 'fld_zone']], on='fd_id')
)

# The token after the '-' in occtype holds the structure code:
# its first two characters are the stories code (e.g. '1S') and
# the remainder is the basement code.
# Only stories is stored on the dataframe - it is what we estimate
# the stories distribution from. Basement will be drawn from the
# foundation type distribution, which also gives us first floor
# elevation.
structs = nsi_res['occtype'].str.split('-').str.get(1)
stories = structs.str.slice(stop=2)
basements = structs.str.slice(start=2)

nsi_res = nsi_res.assign(stories=stories)

# Structures exposed to flood depths are the ones whose fd_id
# shows up in nsi_depths
nsi_res_f = nsi_res.loc[nsi_res['fd_id'].isin(nsi_depths['fd_id'])]

# Attach the depths to the exposed structures.
# The reference ids merged in above are used instead of the cbfips
# column from NSI: the two are inconsistent, and the block data
# downloaded from the census webpage retains more structures.
full_df = pd.merge(nsi_res_f, nsi_depths, on='fd_id')

# full_df can be used directly to estimate the benchmark losses of
# using NSI as-is (Hazus DDFs with no uncertainty)

# This dataset can be directly used for estimating the 
# benchmark losses of using NSI as-is
# Use the Hazus DDFs with no uncertainty

In [7]:
# Recode fld_zone into the two classes the hazus ddfs expect.
# A structure is 'V' only when its flood zone starts with 'V';
# everything else (A zones and anything outside a flood zone)
# is treated as 'A'.
ve_zone = np.where(full_df['fld_zone'].str.get(0).eq('V'), 'V', 'A')
full_df = full_df.assign(**{'fz_ddf': ve_zone})


## Get parameters for structures

In [8]:
# We are also going to use nsi_struct merged with refs
# to determine the multinomial probabilities of basement
# and number stories (binomial) from block level which matches
# up with NSI tech reference on where data is randomly assigned
# from. While there are maps from parcel data, where available, 
# it's not clear which entries have this non-random assignment. 
# In addition, it is known that parcel aggregation datasets like
# ZTRAX may have data errors. The sources the NSI used
# have unknown validation/accuracy so we can treat these as
# part of estimating the distribution to draw from

# The method for estimating number of stories is based on assignment
# from parcel data. Where missing, square footage is divided by the 
# structure's footprint (when sq. ft. is missing, they take 86% of
# the structure's footprint as sq. ft). If > 1.25,
# a second floor is assumed
# If no footprint is available, 
# stories is randomly assigned from a distribution that varies by
# year built and census region. So, we can use census block again
# here

# The methodology for the structure valuation is obscure
# and there is no reporting on how accurate it is to some
# observed data on market values
# In a conservative thrust, we can take the reported
# coefficient of dispersion (COD) from the Philadelphia Assessor's
# methodology for estimating market values. This COD can be
# multiplied by the estimated value from NSI for a presumably
# conservative estimate of the standard deviation surrounding
# structure value estimates to observed market values
# We can also show in a representative example what would
# happen to the loss estimate distribution
# if the NSI COD is a factor of 2 larger. We still don't know
# if this is a reasonable representation since we assume
# there is no bias in the NSI structure valuation by
# centering the noise distribution at their estimated value. 
# In reality, the Philly assessor office reports their estimates
# are slightly biased which allows us to use a bias correction
# factor if we used that data. Down the line, comparing
# what the structure inventory distributions are using different
# data sources could be very interesting, including accounting
# for different # of RES1 buildings based on more detailed
# and survey-based methods in the city assessor data
# From the Nolte et al. (2023) large-scale parcel data good
# practices data, we know that there are many issues in using parcel
# data to fill in important data fields (even occupancy type)
# It is not the panacea it appears framed as in the NSI technical
# documentation

# There are not nearly enough observations at the block level
# to reliably estimate the parameter for binomial # stories
# or multinomial foundation type. Sometimes just one observation
# in general. Tract appears to have enough
# This check is based on the subset of tracts (or other ref)
# in nsi_res that are also in full_df (these are the ones we need
# the probabilities for)
# Reference column whose units the structure parameters are
# estimated over
STRUCT_REF = 'tract_id'
# Every structure in nsi_res whose reference unit also contains at
# least one structure in full_df
struct_tot = nsi_res.loc[nsi_res[STRUCT_REF].isin(full_df[STRUCT_REF])]


### Number of stories

In [9]:
# Count structures for every (STRUCT_REF unit, stories) pair
stories_sum = struct_tot.groupby([STRUCT_REF, 'stories']).size()
# Divide by the number of structures in the unit to get proportions
stories_prop = stories_sum/struct_tot.groupby([STRUCT_REF]).size()
# Spread to one row per STRUCT_REF unit and one column per stories
# code (missing combinations become 0). The distribution is
# binomial, so a single parameter is enough: arbitrarily keep the
# '1S' share, rounded to the hundredth place.
stories_param = (
    stories_prop
    .reset_index()
    .pivot(index=STRUCT_REF, columns='stories', values=0)
    .fillna(0)
    ['1S']
    .round(2)
)
# Lookup from the STRUCT_REF id of a structure to its '1S' probability
STRY_DICT = dict(stories_param)



### Foundation types

In [10]:
# Same procedure as for number of stories, but for foundation type:
# count structures for every (STRUCT_REF unit, found_type) pair,
# divide by the number of structures in the unit, then spread to one
# row per unit and one column per foundation type (missing -> 0)
found_sum = struct_tot.groupby([STRUCT_REF, 'found_type']).size()
found_prop = found_sum/struct_tot.groupby([STRUCT_REF]).size()
found_param = found_prop.reset_index().pivot(index=STRUCT_REF,
                                             columns='found_type',
                                             values=0).fillna(0)

# We want a dictionary from the STRUCT_REF id to an array of
# foundation type probabilities for direct use in our multinomial
# draw. The array order follows found_param.columns
# (presumably B, C, S - confirm against the data).
params = found_param.values.round(2)
# Rounding each entry independently (and any rows with a missing
# found_type, which count in the denominator above but in no column)
# can leave a row that does not sum to 1, which is not a valid
# multinomial parameter vector. Renormalise every row; a row that
# rounds to all zeros is left as zeros instead of dividing by 0.
row_tot = params.sum(axis=1, keepdims=True)
params = np.divide(params, row_tot,
                   out=np.zeros_like(params, dtype=float),
                   where=row_tot > 0)
# Then create our dictionary (one probability array per unit)
FND_DICT = dict(zip(found_param.index, params))


## Load depth damage functions

In [11]:
def _load_physical_json(fname):
    """Return the parsed content of a JSON file in the 'physical' vulnerability directory."""
    with open(join(VULN_DIR_I, 'physical', fname), 'r') as fp:
        return json.load(fp)


# Depth-damage function tables
naccs = pd.read_csv(join(VULN_DIR_I, 'physical', 'naccs_ddfs.csv'))
hazus = pd.read_csv(join(VULN_DIR_I, 'physical', 'hazus_ddfs.csv'))

# Helper dictionaries that accompany the DDF tables
HAZUS_MAX_DICT = _load_physical_json('hazus.json')
HAZUS_MAX_NOUNC_DICT = _load_physical_json('hazus_nounc.json')
NACCS_MAX_DICT = _load_physical_json('naccs.json')

# Generate ensemble

In [12]:
# Reminder of the dataframes/dictionaries we have to help generate
# our ensemble members efficiently
# STRY_DICT
# FND_DICT
# FFE_DICT
# hazus
# naccs
# HAZUS_MAX_DICT
# NACCS_MAX_DICT
# HAZUS_MAX_NOUNC_DICT

# And some constants
# COEF_VARIATION
# N_SOW
# RET_PERS

# We need a random number generator.
# Seed it explicitly so that every draw in the ensemble is
# reproducible after a kernel restart; change RNG_SEED to get a
# different (but still repeatable) realisation.
RNG_SEED = 42
rng = np.random.default_rng(RNG_SEED)

In [13]:
# Build the ensemble frame: N_SOW copies of every row of full_df.
# Repeating the index and selecting with .loc keeps the copies of a
# structure adjacent, so all later draws can be done in vectorized
# form by passing array_like data of size N_SOW*len(full_df) to the
# rng calls.
# Columns that are not needed for the draws are dropped first.
drop_cols = [
    'occtype',
    'found_type',
    'block_id',
    'fld_zone',
    'tract_id',
    'zcta_id',
    'stories',
]

ens_df = (
    full_df
    .drop(columns=drop_cols)
    .pipe(lambda df: df.loc[df.index.repeat(N_SOW)])
    .reset_index(drop=True)
)
print('Created Index for Ensemble')

Created Index for Ensemble


In [15]:
# First, let's draw the depth values for each return period
# We can create a list of the depths that we draw from 
# each return period
# This should be pretty fast
# For each rp in RET_PERS
# we get the rp_Lower, rp_Mid, rp_Upper columns
# We should define subsets where 
# 1) lower == upper
# 2) all else
# When 1 - take the value from high
# When 2 - do triangular

# We should end up with a numpy array of depths
# which are indexed to the fd_id_SOW_index 
# We can concatenate on index into a depths dataframe
# Then, the rest of the ensemble generation should work

In [25]:
# Scratch check: a single triangular draw whose mode equals its
# upper bound
rng.triangular(left=0, mode=.1, right=.1)

0.08310191930420069

In [47]:
# Draw one 500-yr depth per ensemble row from its
# (Lower, Mid, Upper) depth columns.
# Don't need to round - do that in process_haz
temp = ens_df[['500_Lower', '500_Mid', '500_Upper']].round()

# Rows where the bounds coincide have no spread to sample from
no_tri_mask = temp['500_Lower'] == temp['500_Upper']
# Remaining rows get a triangular draw. Take this subset before the
# '500' column exists so it only carries the three input columns.
temp_tri = temp.loc[~no_tri_mask]

# Degenerate rows: the depth is just the (shared) bound
temp.loc[no_tri_mask, '500'] = temp.loc[no_tri_mask, '500_Lower']

# All other rows: one triangular(lower, mid, upper) draw per row
temp.loc[~no_tri_mask, '500'] = rng.triangular(temp_tri['500_Lower'],
                                               temp_tri['500_Mid'],
                                               temp_tri['500_Upper'])


In [49]:
# Preview the first rows of the 500-yr bounds next to the drawn depth
temp.head()

Unnamed: 0,500_Lower,500_Mid,500_Upper,500
0,0.0,0.0,1.0,0.653215
1,0.0,0.0,1.0,0.003464
2,0.0,0.0,1.0,0.332406
3,0.0,0.0,1.0,0.543746
4,0.0,0.0,1.0,0.40736


In [46]:
# NOTE(review): temp3 is not assigned in any cell visible here, so this
# presumably raises NameError after Restart & Run All - confirm where
# it comes from or drop this scratch cell
temp3

array([0.46943493, 0.42553854, 0.31828201, ..., 1.75887132, 4.92594985,
       1.38506456])

# Generate losses without uncertainty