# CAMS50 VRA2016: Validated Observations
CAMS50 runs a reanalysis with validated obrvations 2 years after the fact.

In [1]:
from glob import glob
from os.path import isfile, basename, dirname
from os import remove

import pandas as pd
import xarray as xr

# only 3 decimal points on df.head() and df.describe()
pd.options.display.float_format = '{:,.3f}'.format

## Observation datasets
- `eeaVRA`: validated surface obs for data assimilation
- `eeaVAL`: validated surface obs for model evaluataion

In [2]:
lustre = "/lustre/storeA/users/alvarov/CAMS50/%s"
files = dict(
    eeaVRA=glob(lustre%'obs/VRA_2016/assimilation_*.nc'),
    eeaVAL=glob(lustre%'obs/VRA_2016/validation_*.nc'),
)
for k,v in files.items():
    print("%s: %3d files"%(k,len(v)))

eeaVRA:   6 files
eeaVAL:   6 files


In [3]:
# save collocated datasets
def save2nc(ds=None, f=lustre%'vra2016colloc.nc'):
    if isfile(f):
        data = xr.open_dataset(f, autoclose=True).load()
        if ds:
            data = data.combine_first(ds)
            for param in ds.data_vars: 
                if 'units' not in data[param].attrs:
                    data[param].attrs.update(ds[param].attrs)
            data.to_netcdf(f, mode='w')
            del(ds)
        return data
    elif ds:
        ds.to_netcdf(f, mode='w')
        return ds
    else:
        return xr.Dataset()

# Validated Observations
Observations for *O3*, *NO2*, *SO2*, *CO*, *PM25* and *PM10* in *ug/m3*, are divided on 2 datasets,
assimilation and validation.
- The dataset split is not consistent across species.
- The classification is not consistent across species.

The observations were stored in NetCDF files as part of the pre-processing for data assimilation. Station classification is not included on the NetCDF files, they need to be read from station location (text) files.

In [4]:
dset = lambda dname, poll: '%s/stations.%s.background_assimilation_set7'%(dname, poll)

def surfLoc(fname, poll):
    if fname.endswith('.nc'):
        fname = dset(dirname(fname), poll)
    ds = pd.read_csv(
        fname, sep=' ',
        names = 'station lon lat alt c0 c1'.split(),
        index_col = 'station'
    )
    ds['cls'] = ds.apply(lambda row: '{0.c0}/{0.c1}'.format(row), 'columns')
    return ds.drop(['c0','c1'], 'columns')

%time df = surfLoc(files['eeaVRA'][0], 'CO')
df.head()

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 44.4 ms


Unnamed: 0_level_0,lon,lat,alt,cls
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LU0102A,5.977,49.505,287.0,background/urban
IT0706A,9.328,45.483,123.0,background/urban
DEHB012,8.735,53.125,8.0,background/urban
PL0509A,19.697,51.404,180.0,background/urban
RO0111A,24.495,47.127,365.0,background/urban


In [5]:
def surfObs(fname, dataset):
    ds = xr.open_dataset(fname)

    # byte to sting
    ds['station'] = ds['stationID'].astype(str) # station names
    
    # clasification info, species dependant
    for param in ds.data_vars: 
        if ds[param].attrs.get('units',None) == 'ug/m3':
            ds['cls'] = surfLoc(fname, param).cls
            ds['cls'] = ds.cls.assign_coords(poll=param).expand_dims('poll')

    # add dataset coordinate
    return ds.drop(['stationID']).assign_coords(dataset=dataset).expand_dims('dataset')

%time ds = surfObs(files['eeaVRA'][0], 'eeaVRA')
ds

CPU times: user 440 ms, sys: 120 ms, total: 560 ms
Wall time: 1.27 s


<xarray.Dataset>
Dimensions:  (dataset: 1, poll: 1, station: 269, time: 8783)
Coordinates:
  * station  (station) object 'AD0942A' 'AL0203A' 'AL0206A' 'AT0ILL1' ...
  * time     (time) datetime64[ns] 2016-01-01 2016-01-01T01:00:00 ...
  * poll     (poll) <U2 'CO'
  * dataset  (dataset) <U6 'eeaVRA'
Data variables:
    lat      (dataset, station) float32 42.509693 40.62593 42.3139 47.77028 ...
    lon      (dataset, station) float32 1.539138 20.78018 19.52342 16.76639 ...
    alt      (dataset, station) float32 1080.0 848.0 13.0 117.0 3106.0 215.0 ...
    CO       (dataset, time, station) float32 600.0 2026.52 293.48 428.934 ...
    cls      (dataset, poll, station) object 'background/urban' ...
Attributes:
    source:   /home/alvarov/obs4cwf/2016_AirBase/data.background.assimilation...

## Read all observations files

In [6]:
for k,v in files.items():
    if k.startswith('eea'):
        for fname in v:
            save2nc(surfObs(fname, k))

data = save2nc()
data

  del sys.path[0]
  """


<xarray.Dataset>
Dimensions:  (dataset: 2, poll: 6, station: 2331, time: 8783)
Coordinates:
  * dataset  (dataset) object 'eeaVAL' 'eeaVRA'
  * poll     (poll) object 'CO' 'NO2' 'O3' 'PM10' 'PM25' 'SO2'
  * station  (station) object 'AD0942A' 'AD0944A' 'AD0945A' 'AL0203A' ...
  * time     (time) datetime64[ns] 2016-01-01 2016-01-01T01:00:00 ...
Data variables:
    lat      (dataset, station) float32 nan nan 42.53488 nan 40.40309 nan ...
    lon      (dataset, station) float32 nan nan 1.716986 nan 19.4862 nan nan ...
    alt      (dataset, station) float32 nan nan 2515.0 nan 25.0 nan nan nan ...
    CO       (dataset, time, station) float32 nan nan nan nan 930.32 nan nan ...
    cls      (dataset, poll, station) object '' '' '' '' '' '' '' '' '' '' ...
    NO2      (dataset, time, station) float32 nan nan nan nan 13.8073 nan ...
    PM10     (dataset, time, station) float32 nan nan nan nan nan nan nan ...
    PM25     (dataset, time, station) float32 nan nan nan nan nan nan nan ...
    

## Observations per dataset

In [7]:
data.sel(dataset='eeaVRA').drop(['lon','lat','alt']).to_dataframe().describe(percentiles=[.25,.5,.75,.9,.95])

Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,12569154.0,61165710.0,37065210.0,17951304.0,32825730.0,60641682.0
mean,327.849,13.544,16.138,12.333,4.575,36.069
std,320.329,16.902,16.704,13.224,14.099,30.915
min,6.0,0.0,0.0,0.0,0.0,0.0
25%,156.6,5.88,8.8,5.0,1.0,27.877
50%,242.984,12.0,14.75,8.958,2.398,51.37
75%,399.0,23.93,23.73,16.0,5.0,72.07
90%,608.32,40.0,36.0,26.644,10.0,91.0
95%,838.894,51.5,47.0,35.7,16.0,103.7
max,9710.0,414.0,604.72,518.474,2978.0,434.647


In [8]:
data.sel(dataset='eeaVAL').drop(['lon','lat','alt']).to_dataframe().describe(percentiles=[.25,.5,.75,.9,.95])

Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,5376648.0,26187546.0,15915144.0,7677066.0,13847418.0,25984494.0
mean,338.688,17.521,19.037,12.955,5.9,49.498
std,314.375,17.418,17.239,12.816,31.192,30.234
min,5.626,0.0,0.001,0.002,0.0,0.001
25%,170.0,6.5,9.14,5.0,1.15,27.1
50%,255.0,13.056,15.18,9.1,2.75,51.2
75%,400.0,25.5,24.09,16.571,5.1,72.83
90%,639.92,41.617,36.464,27.077,11.0,92.0
95%,876.255,53.0,47.0,36.0,18.0,104.47
max,7720.0,394.0,589.51,433.056,2595.0,298.55


## Unique stations

In [9]:
%time stat = data[['lon','lat','alt','cls']]
%time stat = stat.sel(dataset='eeaVRA').combine_first(stat.sel(dataset='eeaVAL'))
stat

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 202 µs
CPU times: user 2.24 s, sys: 2.22 s, total: 4.46 s
Wall time: 4.43 s


<xarray.Dataset>
Dimensions:  (poll: 6, station: 2331)
Coordinates:
  * poll     (poll) object 'CO' 'NO2' 'O3' 'PM10' 'PM25' 'SO2'
  * station  (station) object 'AD0942A' 'AD0944A' 'AD0945A' 'AL0203A' ...
Data variables:
    lon      (station) float32 1.539138 1.56525 1.716986 20.78018 19.4862 ...
    lat      (station) float32 42.509693 42.516945 42.53488 40.62593 ...
    alt      (station) float32 1080.0 1637.0 2515.0 848.0 25.0 13.0 525.0 ...
    cls      (poll, station) object 'background/urban' '' '' ...
Attributes:
    source:   /home/alvarov/obs4cwf/2016_AirBase/data.background.assimilation...

In [10]:
stat.to_dataframe().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lon,lat,alt,cls
poll,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CO,AD0942A,1.539,42.51,1080.0,background/urban
CO,AD0944A,1.565,42.517,1637.0,
CO,AD0945A,1.717,42.535,2515.0,
CO,AL0203A,20.78,40.626,848.0,background/suburban
CO,AL0204A,19.486,40.403,25.0,


## Station classification(s)
Make it a coordinate, as it should not change as we add more datasets

In [11]:
data['cls'] = stat.cls
data = data.set_coords('cls')
save2nc(data)

  """


<xarray.Dataset>
Dimensions:  (dataset: 2, poll: 6, station: 2331, time: 8783)
Coordinates:
    cls      (poll, station) object 'background/urban' '' '' ...
  * dataset  (dataset) object 'eeaVAL' 'eeaVRA'
  * poll     (poll) object 'CO' 'NO2' 'O3' 'PM10' 'PM25' 'SO2'
  * station  (station) object 'AD0942A' 'AD0944A' 'AD0945A' 'AL0203A' ...
  * time     (time) datetime64[ns] 2016-01-01 2016-01-01T01:00:00 ...
Data variables:
    lat      (dataset, station) float32 nan nan 42.53488 nan 40.40309 nan ...
    lon      (dataset, station) float32 nan nan 1.716986 nan 19.4862 nan nan ...
    alt      (dataset, station) float32 nan nan 2515.0 nan 25.0 nan nan nan ...
    CO       (dataset, time, station) float32 nan nan nan nan 930.32 nan nan ...
    NO2      (dataset, time, station) float32 nan nan nan nan 13.8073 nan ...
    PM10     (dataset, time, station) float32 nan nan nan nan nan nan nan ...
    PM25     (dataset, time, station) float32 nan nan nan nan nan nan nan ...
    SO2      (data