# Configure

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
from os.path import join

os.environ["USE_PYGEOS"] = "0"
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Project directory layout

# The notebook is expected to run from notebooks/, so the project
# root is taken to be the parent of the current working directory
ABS_DIR = os.path.abspath(Path.cwd().parents[0])

# Top-level data directories: raw inputs (FR), interim products (FI)
# and final results (FP)
FR, FI, FP = (
    join(ABS_DIR, 'data', sub) for sub in ('raw', 'interim', 'results')
)

# Interim subdirectories for exposure, vulnerability (vuln),
# hazard (haz) and reference (ref) data
EXP_DIR_I, VULN_DIR_I, HAZ_DIR_I, REF_DIR_I = (
    join(FI, sub) for sub in ('exposure', 'vuln', 'haz', 'ref')
)

# Create the interim subdirectories first and the results directory
# last; directories that already exist are left as they are
for _out_dir in (EXP_DIR_I, VULN_DIR_I, HAZ_DIR_I, REF_DIR_I, FP):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)

# Reference fips
FIPS = '42101'

# Prepare structure data for loss ensemble

## Load and subset exposure data

In [5]:
# Read the three interim exposure inputs from EXP_DIR_I:
# nsi_res.gpkg (read with geopandas), plus nsi_ref.pqt and
# nsi_depths.pqt (read with pandas)
nsi_struct = gpd.read_file(Path(EXP_DIR_I) / 'nsi_res.gpkg')
nsi_ref = pd.read_parquet(Path(EXP_DIR_I) / 'nsi_ref.pqt')
nsi_depths = pd.read_parquet(Path(EXP_DIR_I) / 'nsi_depths.pqt')

In [34]:
# The loss ensemble needs three structure attributes: foundation
# type, number of stories, and structure value.
#  - Structure value is the center of its distribution and is
#    passed to the loss estimation function.
#  - Foundation type and number of stories are drawn from the
#    distribution implicit in the NSI data: for each census unit we
#    get the multinomial probabilities of a building having a given
#    foundation type & number of stories. Ideally this would be
#    conditioned on prefirm, but the building year column is based
#    on median year built from ACS data.
#  - Given the foundation type drawn in the ensemble, the FFE comes
#    from the distribution defined in the code for the
#    Wing et al. 2022 paper.
# The point estimate version will just use default values.

# Keep only the relevant nsi_struct columns and attach the reference
# ids from nsi_ref. The refs are merged in because the cbfips column
# that comes directly with the NSI is inconsistent with the block
# data downloaded from the census webpage; more structures are
# retained when the block data is used
keep_cols = ['fd_id', 'occtype', 'found_type', 'val_struct']
nsi_res = nsi_struct[keep_cols].merge(nsi_ref, on='fd_id', how='inner')

# The part of occtype after the '-' encodes stories and basement:
# its first two characters are the stories code and the remainder is
# the basement code. Only stories is kept as a column, to estimate
# the distribution stories is drawn from. Basement will instead be
# drawn from the foundation type distribution, which also gives us
# first floor elevation
structs = nsi_res['occtype'].str.split('-').str.get(1)
stories = structs.str.slice(stop=2)
basements = structs.str.slice(start=2)

nsi_res = nsi_res.assign(stories=stories)

# Keep only the structures that are exposed to flood depths,
# i.e. those whose fd_id appears in nsi_depths
nsi_res_f = nsi_res.loc[nsi_res['fd_id'].isin(nsi_depths['fd_id'])]

# Attach the depths to the exposed structures
full_df = nsi_res_f.merge(nsi_depths, on='fd_id', how='inner')

# full_df can be used directly to estimate the benchmark losses
# of using NSI as-is: use the Hazus DDFs with no uncertainty

## Get parameters for structure uncertainty distributions

In [None]:
# We are also going to use nsi_struct merged with refs
# to determine the multinomial probabilities of basement
# and number stories (binomial) from block level which matches
# up with NSI tech reference on where data is randomly assigned
# from. While there are maps from parcel data, where available, 
# it's not clear which entries have this non-random assignment. 
# In addition, it is known that parcel aggregation datasets like
# ZTRAX may have data errors. The sources the NSI used
# have unknown validation/accuracy so we can treat these as
# part of estimating the distribution to draw from

# The method for estimating number of stories is based on assignment
# from parcel data. Where missing, square footage is divided by the 
# structure's footprint (when sq. ft. is missing, they take 86% of
# the structure's footprint as sq. ft). If > 1.25,
# a second floor is assumed
# If no footprint is available, 
# stories is randomly assigned from a distribution that varies by
# year built and census region. So, we can use census block again
# here

# The methodology for the structure valuation is obscure
# and there is no reporting on how accurate it is to some
# observed data on market values
# To be conservative, we can take the reported
# coefficient of dispersion (COD) from the Philadelphia Assessor's
# methodology for estimating market values. This COD can be
# multiplied by the estimated value from NSI for a presumably
# conservative estimate of the standard deviation of
# structure value estimates around observed market values
# We can also show in a representative example what would
# happen to the loss estimate distribution
# if the NSI COD is a factor of 2 larger. We still don't know
# if this is a reasonable representation since we assume
# there is no bias in the NSI structure valuation by
# centering the noise distribution at their estimated value. 
# In reality, the Philly assessor office reports their estimates
# are slightly biased which allows us to use a bias correction
# factor if we used that data. Down the line, comparing
# what the structure inventory distributions are using different
# data sources could be very interesting, including accounting
# for different # of RES1 buildings based on more detailed
# and survey-based methods in the city assessor data
# From the Nolte et al. (2023) large-scale parcel data good
# practices data, we know that there are many issues in using parcel
# data to fill in important data fields (even occupancy type)
# It is not the panacea it appears framed as in the NSI technical
# documentation

# Block-level counts are far too small to reliably estimate the
# binomial parameter for # stories or the multinomial parameters for
# foundation type (sometimes a block has just one observation).
# Block groups appear to have enough observations.
# So take every structure in nsi_res whose block group (bg_id) also
# appears in full_df - these are the block groups we need the
# probabilities for
struct_tot = nsi_res.loc[nsi_res['bg_id'].isin(full_df['bg_id'])]

### Number of stories

In [69]:
# Count structures for each (block group, number of stories) pair,
# then divide by the total structure count of the block group
# to turn the counts into proportions
stories_sum = struct_tot.groupby(['bg_id', 'stories']).size()
stories_prop = stories_sum / struct_tot.groupby('bg_id').size()

# Reshape to one row per bg_id and one column per stories category;
# a category that never occurs in a block group gets proportion 0.
# Stories is binomial, so a single parameter is enough: arbitrarily
# keep the '1S' proportion, rounded to the hundredth place.
# The parameter for a structure we are estimating losses for can be
# looked up with .loc and its bg_id
stories_param = (
    stories_prop
    .unstack(level='stories')
    .fillna(0)
    .loc[:, '1S']
    .round(2)
)

### Foundation type

# Same procedure as for number of stories: per block group, the
# proportion of structures with each foundation type
found_sum = struct_tot.groupby(['bg_id', 'found_type']).size()
found_prop = found_sum / struct_tot.groupby('bg_id').size()

# One row per bg_id, one column per foundation type; a type that
# never occurs in a block group gets proportion 0
found_param = found_prop.unstack(level='found_type').fillna(0)

# Each row of this dataframe can be handed straight to
# np.random.multinomial, since its pvals argument takes an np array.
# Pass the bg_id to .loc and call .values, e.g.
# np.random.multinomial(1, found_param.loc['421010054001'].values)

### Constants

In [None]:
# Coefficient of dispersion (COD) for structure value

# Triangular distributions for first-floor elevation conditioned
# on foundation type

# Load and prepare DDFs

In [None]:
# Need to load in DDFs and make sure they are in the correct form
# for drawing pct_dam from a probabilistic distribution
# Separate these versions from the point estimate based one

# Run through SOWs and estimate losses

# Get losses from our NSI & Hazus DDF benchmark