<img width="50" src="https://carbonplan-assets.s3.amazonaws.com/monogram/dark-small.png" style="margin-left:0px;margin-top:20px"/>

# FIA to Parquet

_by Joe Hamman (CarbonPlan), June 30, 2020_

This notebook converts FIA CSV files to Parquet format and stages them in a
Google Cloud Storage bucket.

**Inputs:**

- The `ENTIRE` directory, or a cloud bucket holding the contents of the `ENTIRE`
  directory, in CSV format.

**Outputs:**

- One Parquet dataset per CSV:
  `gs://carbonplan-data/raw/fia/<name.lower()>.parquet`

**Notes:**

- No reprojection or processing of the data is done in this notebook.


In [None]:
import fsspec
import dask.dataframe as dd
import pathlib

In [None]:
# Handle on Google Cloud Storage, created with default options.
fs = fsspec.filesystem("gs")

# Every raw FIA CSV staged in the bucket; these are the inputs to convert.
csv_pattern = "carbonplan-data/raw/fia/*.csv"
csvs = fs.glob(csv_pattern)

In [None]:
# Dask's distributed scheduler client, used to run the CSV -> Parquet
# conversions below.
from dask.distributed import Client

# NOTE(review): Client() is called with no scheduler address, which presumably
# starts a local cluster on this machine; confirm against the dask.distributed
# docs if a remote cluster is intended.
client = Client()
# Bare last expression so the notebook displays the client summary.
client

In [None]:
# Explicit column dtypes passed to `dd.read_csv`, keyed by the lower-cased CSV
# stem (table name) and then by column name. Tables that are not listed here
# rely entirely on dask's sample-based dtype inference.
#
# NOTE(review): these overrides were presumably collected from dtype-inference
# errors (e.g. integer-looking columns that contain missing values, or columns
# with mixed content) -- confirm before removing any entry.
#
# Each column must appear only once per table: duplicate keys in a dict literal
# silently override earlier ones.
dtypes = {
    "cond": {
        "DWM_FUELBED_TYPCD": "object",
        "HABTYPCD1": "object",
        "LAND_USE_SRS": "float64",
        "MIXEDCONFCD": "object",
        "STUMP_CD_PNWRS": "object",
        "SUBCYCLE": "float64",
        "HABTYPCD1_DESCR_PUB_CD": "object",
        "HABTYPCD1_PUB_CD": "object",
        "HABTYPCD2": "object",
        "HABTYPCD2_DESCR_PUB_CD": "object",
        "HABTYPCD2_PUB_CD": "object",
    },
    "cond_dwm_calc": {
        "PILE_TL_UNADJ": "float64",
        "CWD_TL_UNADJ": "float64",
        "PHASE": "object",
        "PILE_TL_ADJ": "float64",
        "PILE_TL_COND": "float64",
    },
    "dwm_coarse_woody_debris": {
        "CONDID": "float64",
        "LENGTH": "float64",
        "MEASYEAR": "float64",
        "ORNTCD_PNWRS": "object",
        "HOLLOWCD": "object",
    },
    "dwm_duff_litter_fuel": {
        "CONDID": "float64",
        "DUFF_METHOD": "float64",
        "FUELBED_METHOD": "float64",
        "LITTER_METHOD": "float64",
        "MEASYEAR": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
    },
    "dwm_fine_woody_debris": {
        "LARGECT": "float64",
        "MEASYEAR": "float64",
        "MEDIUMCT": "float64",
        "PILESCD": "float64",
        "RSNCTCD": "float64",
        "SMALLCT": "float64",
    },
    "dwm_microplot_fuel": {
        "DHRBCD": "float64",
        "DSHRBCD": "float64",
        "LITTERCD": "float64",
        "LVHRBCD": "float64",
        "LVSHRBCD": "float64",
    },
    "dwm_transect_segment": {
        "MEASYEAR": "float64",
        "SLOPE": "float64",
        "CONDID": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
    },
    "dwm_visit": {
        "QASTATCD": "float64",
        "QA_STATUS": "float64",
        "SMPKNDCD": "float64",
    },
    "lichen_lab": {"MODIFIED_DATE": "object"},
    "p2veg_subplot_spp": {"MODIFIED_DATE": "object"},
    "p2veg_subp_structure": {"MODIFIED_DATE": "object"},
    "plot": {
        "CONGCD": "float64",
        "ECO_UNIT_PNW": "object",
        "ELEV": "float64",
        "INTENSITY": "float64",
        "MANUAL_DB": "float64",
        "MEASDAY": "float64",
        "MEASMON": "float64",
        "MEASYEAR": "float64",
        "P2PANEL": "float64",
        "PLOT_STATUS_CD": "float64",
        "PREV_MICROPLOT_LOC_RMRS": "object",
        "QA_STATUS": "float64",
        "SUBCYCLE": "float64",
        "SUBPANEL": "float64",
        "PAC_ISLAND_PNWRS": "object",
    },
    "plotsnap": {
        "ECO_UNIT_PNW": "object",
        "ELEV": "float64",
        "CONGCD": "float64",
        "SUBCYCLE": "float64",
    },
    "pop_eval_attribute": {"MODIFIED_DATE": "object"},
    "pop_plot_stratum_assgn": {"CN": "object"},
    "pop_stratum": {
        "ADJ_FACTOR_MACR": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
    },
    "seedling": {
        "MODIFIED_IN_INSTANCE": "float64",
        "SPGRPCD": "float64",
        "TREECOUNT": "float64",
        "SUBCYCLE": "float64",
    },
    "sitetree": {
        "AGEDIA": "float64",
        "HT": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
        "SIBASE": "float64",
        "SIBASE_FVS": "float64",
        "SITE_AGE_TREE_STATUS_PNWRS": "object",
        "SITE_TREE_METHOD_PNWRS": "object",
        "SITREE": "float64",
        "SITREE_FVS": "float64",
        "SPGRPCD": "float64",
        "SUBCYCLE": "float64",
        "VALIDCD": "float64",
    },
    "soils_erosion": {
        "COMPCPCT": "float64",
        "MODIFIED_DATE": "object",
        "SOILSPCT": "float64",
    },
    "soils_lab": {"QASTATCD": "float64", "VSTNBR": "float64"},
    "subplot": {"SUBCYCLE": "float64"},
    "subp_cond": {
        "MODIFIED_IN_INSTANCE": "float64",
        "SUBCYCLE": "float64",
        "CYCLE": "float64",
    },
    "tree": {
        "AGENTCD": "float64",
        "CULL": "float64",
        "P2A_GRM_FLG": "object",
        "TREECLCD": "float64",
        "TREEHISTCD": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
        "GST_PNWRS": "object",
        "SPGRPCD": "float64",
        "DIAHTCD": "float64",
        "SUBCYCLE": "float64",
        "CAVITY_USE_PNWRS": "object",
    },
    "tree_grm_begin": {"TREECLCD": "float64"},
    "tree_grm_component": {
        "HT_END": "float64",
        "MODIFIED_IN_INSTANCE": "float64",
        "MICR_SUBPTYP_GRM_AL_FOREST": "float64",
        "MICR_SUBPTYP_GRM_AL_TIMBER": "float64",
        "SUBPTYP_BEGIN": "float64",
        "SUBPTYP_END": "float64",
        "SUBPTYP_MIDPT": "float64",
        "SWLG_DIA_THRESHOLD": "float64",
    },
    "tree_grm_estn": {
        "DIA_BEGIN_RECALC": "object",
        "EST_BEGIN_RECALC": "object",
        "MODIFIED_DATE": "object",
    },
    "tree_grm_threshold": {"TREECLCD": "float64"},
    "tree_regional_biomass": {"MODIFIED_IN_INSTANCE": "float64"},
    "veg_plot_species": {"SPECIMEN_COLLECTED": "float64"},
    "veg_quadrat": {"QUADRAT_STATUS": "float64"},
    "veg_subplot": {
        "VEG_SUBP_STATUS_CD": "float64",
        "VEG_SUBP_STATUS_CD_PRE2004": "float64",
    },
    "veg_subplot_spp": {
        "MODIFIED_DATE": "object",
        "QUAD_1_PRESENCE": "float64",
        "QUAD_2_PRESENCE": "float64",
        "QUAD_3_PRESENCE": "float64",
    },
}

In [None]:
# Value forwarded to `dd.read_csv(sample=...)`.
# NOTE(review): per the dask docs this is the number of bytes read from each
# file for dtype inference -- confirm 256000 is still adequate for the widest
# FIA tables.
sample = 256000

# Convert each staged CSV into its own Parquet dataset next to the source file.
for csv in csvs:
    # `csv` is an object-store key, not a local file, so only string-level
    # path parsing is needed. PurePosixPath provides that on every OS, whereas
    # the concrete PosixPath class cannot be instantiated on Windows.
    name = pathlib.PurePosixPath(csv).stem.lower()
    print(name)

    target = f"gs://carbonplan-data/raw/fia/{name}.parquet"

    # Tables with no entry in `dtypes` yield None, leaving dtype inference
    # entirely to dask.
    dtype = dtypes.get(name)
    # The glob results are prefixed with the gs:// protocol before reading.
    df = dd.read_csv(f"gs://{csv}", dtype=dtype, sample=sample)
    df.to_parquet(target)