# Forest: Sensors

:::{note}
Under construction

Source: Pyviz Topics - [Carbon Monitoring Project](https://github.com/pyviz-topics/examples/blob/c26bf42101b53a6c356fcd5b3a784b07d6178940/carbon_flux/carbon_flux.ipynb)
:::

In [None]:
!pip -q install s3fs

In [None]:
!pip -q install geoviews

In [None]:
%%writefile catalog.yaml
# Intake catalog describing the two CSV sources used in this notebook.
sources:
  # A single FluxNet daily CSV file, read from S3 with anonymous access.
  fluxnet_daily:
    driver: csv
    parameters:
      # User parameter substituted into urlpath below; the default points
      # at one site file.
      s3_path:
        description: Filename to load
        type: str
        default: earth-data/carbon_flux/nee_data_fusion/FLX_AR-SLu_FLUXNET2015_FULLSET_DD_2009-2011_1-3.csv
    # File cache keyed on the urlpath argument.
    cache:
      - argkey: urlpath
        regex: 'earth-data'
        type: file
    args:
      urlpath: "s3://{{ s3_path }}"
      # Filename pattern with a named site field; presumably this is what
      # provides the site column the notebook code reads - confirm against
      # the Intake csv driver documentation.
      path_as_pattern: 'FLX_{site}_FLUXNET2015_FULLSET_DD_{}.csv'
      # Options for the CSV reader: -9999 is listed as a missing-value
      # marker and TIMESTAMP is parsed as a date.
      csv_kwargs:
        assume_missing: true
        na_values: [-9999]
        parse_dates: ['TIMESTAMP']
      storage_options: {'anon': True}

  # Site metadata table, read from S3 with anonymous access.
  fluxnet_metadata:
    driver: csv
    cache:
      - argkey: urlpath
        regex: 'earth-data'
        type: file
    args:
      urlpath: "s3://earth-data/carbon_flux/nee_data_fusion/allflux_metadata.txt"
      # The file has no header row, so column names are supplied here and
      # only four of the five named columns are kept (network is dropped).
      csv_kwargs:
        header: null
        names: ['site', 'lat', 'lon', 'igbp', 'network']
        usecols: ['site', 'lat', 'lon', 'igbp']
      storage_options: {'anon': True}

In [None]:
import sys
import dask
import numpy as np
import pandas as pd

import holoviews as hv

import hvplot.pandas
import geoviews.tile_sources as gts

# Show at most 10 columns when a DataFrame is displayed.
pd.options.display.max_columns = 10
# Load the bokeh plotting backend for HoloViews.
hv.extension('bokeh', width=120)

In [None]:
import intake

# Open the catalog file written by the %%writefile cell and list its entries.
cat = intake.open_catalog('catalog.yaml')
list(cat)

In [None]:
# Read the site metadata source and preview five randomly sampled rows.
metadata = cat.fluxnet_metadata().read()
metadata.sample(5)

In [None]:
# Map each IGBP code found in the metadata to a display label. The numeric
# prefix makes a plain string sort of the labels follow the class order.
igbp_vegetation = {
    'WAT': '00 - Water',
    'ENF': '01 - Evergreen Needleleaf Forest',
    'EBF': '02 - Evergreen Broadleaf Forest',
    'DNF': '03 - Deciduous Needleleaf Forest',
    'DBF': '04 - Deciduous Broadleaf Forest',
    'MF': '05 - Mixed Forest',
    'CSH': '06 - Closed Shrublands',
    'OSH': '07 - Open shrublands',
    'WSA': '08 - Woody Savannas',
    'SAV': '09 - Savannas',
    'GRA': '10 - Grasslands',
    'WET': '11 - Permanent Wetlands',
    'CRO': '12 - Croplands',
    'URB': '13 - Urban and Built-up',
    'CNV': '14 - Cropland/Natural Vegetation Mosaics',
    'SNO': '15 - Snow and Ice',
    'BSV': '16 - Barren or Sparsely Vegetated',
}

from pandas.api.types import CategoricalDtype

# Ordered categorical whose category order is the sorted order of the labels.
dtype = CategoricalDtype(categories=sorted(igbp_vegetation.values()), ordered=True)

# Translate each site's IGBP code to its label (an unknown code raises
# KeyError), then store the labels as the ordered categorical defined above.
vegetation_labels = metadata['igbp'].apply(lambda code: igbp_vegetation[code])
metadata['vegetation'] = vegetation_labels.astype(dtype)
metadata.sample(5)

In [None]:
# Plot the sites as geographic points coloured by vegetation label and
# overlay them on OpenStreetMap tiles.
metadata.hvplot.points('lon', 'lat', geo=True, color='vegetation',
                       height=420, width=800, cmap='Category20') * gts.OSM

### Loading FluxNet data

In [None]:
# Column holding the quantity that the plots below aggregate.
y_variable = 'NEE_CUT_USTAR50'

# Columns a site file must contain to be used; files lacking any of them
# are skipped when the data is loaded.
data_columns = [
    'P_ERA',
    'TA_ERA',
    'PA_ERA',
    'SW_IN_ERA',
    'LW_IN_ERA',
    'WS_ERA',
    'VPD_ERA',
    'TIMESTAMP',
    'site',
    'NEE_CUT_USTAR50',
]

# Optional soil columns; clean_data fills them with NaN when a file lacks them.
soil_data_columns = [
    'SWC_F_MDS_1',
    'SWC_F_MDS_2',
    'SWC_F_MDS_3',
    'TS_F_MDS_1',
    'TS_F_MDS_2',
    'TS_F_MDS_3',
]

# Every column retained from a CSV file, required columns first.
keep_from_csv = [*data_columns, *soil_data_columns]

def season(df, metadata):
    """Return ``df`` with an added ``season`` column based on lat and month.

    The site name is the single category of ``df['site']``; its latitude is
    looked up in ``metadata``. Sites with a positive latitude use the
    calendar where March-May is spring; every other site uses the calendar
    shifted by half a year (March-May is fall).
    """
    site_name = df['site'].cat.categories.item()
    site_lat = metadata.loc[metadata['site'] == site_name, 'lat'].item()

    # Three-month groups, listed starting from March.
    quarters = ((3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 1, 2))
    if site_lat > 0:
        labels = ('spring', 'summer', 'fall', 'winter')
    else:
        labels = ('fall', 'winter', 'spring', 'summer')

    month_to_season = {month: label
                       for group, label in zip(quarters, labels)
                       for month in group}

    row_months = df.TIMESTAMP.dt.month
    return df.assign(season=row_months.map(month_to_season))


def clean_data(df):
    """
    Reduce a site's frame to the analysis columns and add derived ones:

    * add a NaN column for every column in ``keep_from_csv`` the frame lacks
    * keep only the columns listed in ``keep_from_csv``, in that order
    * add ``DOY`` (day of year), ``year`` and ``season`` columns
    """
    absent = {name: np.nan for name in keep_from_csv if name not in df.columns}
    trimmed = df.assign(**absent)[keep_from_csv]

    stamps = trimmed.TIMESTAMP
    dated = trimmed.assign(DOY=stamps.dt.dayofyear, year=stamps.dt.year)

    # The season lookup needs the site latitude from the module-level metadata.
    return season(dated, metadata)

### Read and clean data

In [None]:
from s3fs import S3FileSystem

# List every FLX* object under the data prefix using anonymous S3 access.
s3 = S3FileSystem(anon=True)
s3_paths = s3.glob('earth-data/carbon_flux/nee_data_fusion/FLX*')

datasets = []  # cleaned dask frames, one per usable site file
skipped = []   # site names whose file lacks a required column
used = []      # site names whose file was loaded

# Built once: the columns every file must provide to be used.
required_columns = set(data_columns)

for s3_path in s3_paths:
    dd = cat.fluxnet_daily(s3_path=s3_path).to_dask()
    # Each file holds one site, so the site column has a single category.
    site = dd['site'].cat.categories.item()

    if not required_columns.issubset(dd.columns):
        skipped.append(site)
        continue

    datasets.append(clean_data(dd))
    used.append(site)

In [None]:
# Concatenate the per-site dask frames and compute the combined result.
data = dask.dataframe.concat(datasets).compute()
# NOTE(review): this bare expression is not the last statement of the cell,
# so a notebook will not display it.
data.columns

# Store the site name as a categorical column.
data['site'] = data['site'].astype('category')


### Visualizing Data Available at Sites

In [None]:
def mapper(x):
    """Classify site name ``x`` by how its file fared during loading.

    Returns 'valid' when ``x`` is in ``used``, 'skipped' when it is in
    ``skipped``, and 'no data' otherwise. ``used`` takes precedence.
    """
    if x in used:
        return 'valid'
    return 'skipped' if x in skipped else 'no data'

# Colour assigned to each quality label on the map.
cmap = {'valid': 'green', 'skipped': 'red', 'no data': 'darkgray'}

# Work on a copy of the metadata and label every site through mapper().
QA = metadata.copy()
QA['quality'] = QA['site'].map(mapper)

# Geographic point plot of all sites coloured by quality label, with the
# site name and vegetation label added to the hover information.
all_points = QA.hvplot.points('lon', 'lat', geo=True, color='quality',
                              cmap=cmap, hover_cols=['site', 'vegetation'],
                              height=420, width=600).options(tools=['hover', 'tap'],
                                                             legend_position='top')

def veg_count(data):
    """Horizontal bar chart of the number of rows in ``data`` per vegetation label.

    Counts are sorted by label in descending index order before plotting.
    """
    counts = data['vegetation'].value_counts()
    ordered = counts.sort_index(ascending=False)
    return ordered.hvplot.barh(height=420, width=500)

# Vegetation bar chart restricted to the sites labelled 'valid'.
hist = veg_count(QA[QA.quality=='valid']).relabel('Vegetation counts for valid sites')

# Overlay the site points on OpenStreetMap tiles and combine them with the
# bar chart in one layout.
all_points * gts.OSM + hist

We'll make a couple of functions that generate plots on the full set of data or a subset of the data. We will use these in a dashboard below.

In [None]:
def site_timeseries(data):
    """Timeseries plot showing the mean carbon flux at each DOY as well as the min and max.

    The shaded band spans, for each day of year, the smallest and largest of
    the per-(DOY, year) means of ``y_variable``; the line is the mean of
    ``y_variable`` over all rows sharing that day of year.

    Parameters
    ----------
    data : DataFrame with ``DOY``, ``year`` and ``y_variable`` columns.

    Returns
    -------
    An 800x400 HoloViews overlay of the band and the mean line.
    """
    per_year_mean = data.groupby(['DOY', 'year'])[y_variable].mean()

    # Aggregate by string name and rename explicitly. Passing np.min/np.max
    # labels the columns after the callables' __name__, which is 'amin'/'amax'
    # on older NumPy releases but 'min'/'max' on newer ones, so the column
    # lookups below would depend on the installed NumPy version.
    envelope = (per_year_mean.groupby('DOY')
                .agg(['min', 'max'])
                .rename(columns={'min': 'amin', 'max': 'amax'}))

    band = envelope.hvplot.area('DOY', 'amin', 'amax', alpha=0.2,
                                fields={'amin': y_variable})
    mean_line = data.groupby('DOY')[y_variable].mean().hvplot()

    tseries = hv.Overlay([band, mean_line])
    return tseries.options(width=800, height=400)

def site_count_plot(data):
    """Bar chart of the non-null value count of each soil column and of ``site``.

    The soil columns are the non-mandatory variables; the ``site`` bar is
    counted alongside them.
    """
    counted_columns = [*soil_data_columns, 'site']
    non_null_counts = data[counted_columns].count()
    return non_null_counts.hvplot.bar(rot=90, width=300, height=400)

# Build both plots from the full dataset and show them together in one layout.
timeseries = site_timeseries(data)
count_plot = site_count_plot(data)
timeseries + count_plot