<img width="100" src="https://carbonplan-assets.s3.amazonaws.com/monogram/dark-small.png" style="margin-left:0px;margin-top:20px"/>

# FIA to Parquet

_by Joe Hamman (CarbonPlan), June 30, 2020_

This notebook converts FIA csv files to Parquet format and stages them in a Google Cloud Storage bucket.

**Inputs:**
- `ENTIRE` directory

**Outputs:**
- One Parquet dataset per CSV: `gs://carbonplan-data/raw/fia/<name>.parquet`

**Notes:**
- No reprojection or processing of the data is done in this notebook.

In [1]:
import io
import os.path
import pathlib
import pandas as pd
import gcsfs

# run `gcloud auth login` on the command line, or try switching token to `browser`
fs = gcsfs.GCSFileSystem(project='carbonplan', token='/Users/jhamman/.config/gcloud/legacy_credentials/joe@carbonplan.org/adc.json')

In [2]:
workdir = pathlib.Path('/Users/jhamman/workdir/carbonplan_data_downloads/fia/')

In [3]:
csvs = (workdir / 'ENTIRE').glob('*csv')

In [23]:
import numpy as np

def force_float32(fname):
    
    memmap = (fname.stat().st_size > 1e8)
    
    df = pd.read_csv(fname, engine='c', low_memory=False, memory_map=memmap)
    for c in df:
        if 'f8' in df[c].dtype.str:
            df[c] = df[c].astype(np.float32)

    return df

In [24]:
failed = []
for fname in csvs:
    blob = f'carbonplan-data/raw/fia/{fname.stem}.parquet'
    print(fname)

    df = force_float32(fname)
    
    try:
        df.to_parquet(blob, compression='gzip', open_with=fs.open, row_group_offsets=1000)
        # consider using dask dataframe here to write to chunked dataframes here.
        print('  --> ', blob)
    except:
        failed.append(fname)

/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/POP_STRATUM.csv
  -->  carbonplan-data/raw/fia/POP_STRATUM.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/COND.csv
  -->  carbonplan-data/raw/fia/COND.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/COUNTY.csv
  -->  carbonplan-data/raw/fia/COUNTY.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/OZONE_SPECIES_SUMMARY.csv
  -->  carbonplan-data/raw/fia/OZONE_SPECIES_SUMMARY.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/LICHEN_VISIT.csv
  -->  carbonplan-data/raw/fia/LICHEN_VISIT.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/SOILS_VISIT.csv
  -->  carbonplan-data/raw/fia/SOILS_VISIT.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/BOUNDARY.csv
  -->  carbonplan-data/raw/fia/BOUNDARY.parquet
/Users/jhamman/workdir/carbonplan_data_downloads/fia/ENTIRE/POP_EVAL_TYP.csv
  -->  carbonplan-data/raw/fia/POP_EVAL_TY

In [14]:
csvs = (workdir / 'ENTIRE').glob('*csv')
template = f'''
  raw_{csv.stem.lower()}:
    description: {csv.stem} from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.
    metadata:
      url: https://apps.fs.usda.gov/fia/datamart/datamart.html
      tags: [forest, parquet]
    driver: parquet
    args:
      urlpath: "gs://carbonplan-data/raw/fia/{csv.stem}.parquet"
'''
for csv in csvs:
    template = f'''
  raw_{csv.stem.lower()}:
    description: {csv.stem} from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.
    metadata:
      url: https://apps.fs.usda.gov/fia/datamart/datamart.html
      tags: [forest, parquet]
    driver: parquet
    args:
      urlpath: "gs://carbonplan-data/raw/fia/{csv.stem}.parquet"'''
    print(template)


  raw_soils_erosion:
    description: SOILS_EROSION from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.
    metadata:
      url: https://apps.fs.usda.gov/fia/datamart/datamart.html
      tags: [forest, parquet]
    driver: parquet
    args:
      urlpath: "gs://carbonplan-data/raw/fia/SOILS_EROSION.parquet"

  raw_pop_plot_stratum_assgn:
    description: POP_PLOT_STRATUM_ASSGN from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.
    metadata:
      url: https://apps.fs.usda.gov/fia/datamart/datamart.html
      tags: [forest, parquet]
    driver: parquet
    args:
      urlpath: "gs://carbonplan-data/raw/fia/POP_PLOT_STRATUM_ASSGN.parquet"

  raw_dwm_visit:
    description: DWM_VISIT from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.
    metadata:
      url: https://apps.fs.usda.gov/fia/datamart/datamart.html

In [15]:
import intake

In [16]:
cat = intake.open_catalog('../../intake-catalogs/master.yaml')



In [30]:
failed = []
for name, source in cat.forest.fia.items():
    try:
        print(source.describe())
        print(name, source().to_dask())
    except:
        failed.append(name)

{'name': 'raw_soils_erosion', 'container': 'dataframe', 'plugin': ['parquet'], 'description': 'SOILS_EROSION from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.', 'direct_access': 'forbid', 'user_parameters': [], 'metadata': {'url': 'https://apps.fs.usda.gov/fia/datamart/datamart.html', 'tags': ['forest', 'parquet']}, 'args': {'urlpath': 'gs://carbonplan-data/raw/fia/SOILS_EROSION.parquet'}}
{'name': 'raw_pop_plot_stratum_assgn', 'container': 'dataframe', 'plugin': ['parquet'], 'description': 'POP_PLOT_STRATUM_ASSGN from the Forest Inventory and Analysis Database (FIADB) comma-delimited files in FIADB version 1.8.0.02 format.', 'direct_access': 'forbid', 'user_parameters': [], 'metadata': {'url': 'https://apps.fs.usda.gov/fia/datamart/datamart.html', 'tags': ['forest', 'parquet']}, 'args': {'urlpath': 'gs://carbonplan-data/raw/fia/POP_PLOT_STRATUM_ASSGN.parquet'}}
{'name': 'raw_dwm_visit', 'container': 'dataframe', 'plugin': [

In [31]:
failed

['raw_soils_erosion',
 'raw_pop_plot_stratum_assgn',
 'raw_dwm_visit',
 'raw_lichen_lab',
 'raw_plot',
 'raw_subp_cond_chng_mtrx',
 'raw_subplot_regen',
 'raw_dwm_fine_woody_debris',
 'raw_ozone_plot_summary',
 'raw_sitetree',
 'raw_pop_eval_grp',
 'raw_plotsnap',
 'raw_soils_sample_loc',
 'raw_veg_subplot',
 'raw_ozone_validation',
 'raw_ozone_biosite_summary',
 'raw_tree_grm_threshold',
 'raw_cond_dwm_calc',
 'raw_dwm_duff_litter_fuel',
 'raw_dwm_transect_segment',
 'raw_dwm_microplot_fuel',
 'raw_veg_subplot_spp',
 'raw_pop_eval_attribute',
 'raw_ozone_plot',
 'raw_veg_quadrat',
 'raw_veg_visit',
 'raw_ozone_visit',
 'raw_p2veg_subplot_spp',
 'raw_tree',
 'raw_grnd_cvr',
 'raw_subp_cond']