# CAMS50 VRA2016: Collocated model results
CAMS50 runs a reanalysis with validated obrvations 2 years after the fact.

In [1]:
from glob import glob
from os.path import isfile, basename, dirname
from os import remove, rename

import numpy as np
import pandas as pd
import xarray as xr
import xarray.ufuncs as xu
from dask.diagnostics import ProgressBar

# only 3 decimal points on df.head() and df.describe()
pd.options.display.float_format = '{:,.3f}'.format

for m in [np, pd, xr]:
    print("%s %s"%(m.__name__, m.__version__))

numpy 1.14.1
pandas 0.22.0
xarray 0.10.1


## Datasets
- `eeaVRA`: validated surface obs for data assimilation
- `eeaVAL`: validated surface obs for model evaluataion
- `cifsBC`: CIFS boundary conditions
- `emepHC`: hindcast run (no DA), operational version (CAMS50.201801)
- `emepSS`: hindcast run (no DA), operational version (CAMS50.201801)
- `emepEM`: hindcast run (no DA), operational version (CAMS50.201801), new TNO-CAMS 2015 emissions
- `emepAN`: (re)analysis run (DA: NO2,O3,SO2), operational version (CAMS50.201801; DA16)
- `emepCO`: (re)analysis run (DA: NO2,O3,SO2,CO), operational version (CAMS50.201801; DA16) low rejection threshold (350 ug/m3)
- `emepCOv2`: (re)analysis run (DA: NO2,O3,SO2,CO), operational version (CAMS50.201801; DA16) higher rejection threshold (700 ug/m3)
- `emepPM`: (re)analysis run (DA: NO2,O3,SO2,PM25,PM10), development version (CAMS50.201801; DA17 wo/PM feedback)

In [2]:
lustre = "/lustre/storeA/users/alvarov/CAMS50/%s"
files = dict(
    eeaVRA=glob(lustre%'obs/VRA_2016/assimilation_*.nc'),
    eeaVAL=glob(lustre%'obs/VRA_2016/validation_*.nc'),
    cifsBC=glob(lustre%'2016_VRA/VRA_2016????_EU_EVA.nc'),
    emepHC=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00-2016.nc'),
    emepSS=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00SS-2016.nc'),
    emepEM=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00EM-2016.nc'),
    emepAN=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00AN-2016Q?.nc'),
    emepCO=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00CO-2016Q?.nc'),
    emepCOv2=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00COv2-2016Q?.nc'),
    emepPM=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00PM-2016Q?.nc'),
)
for k,v in files.items():
    print("%s: %3d files"%(k,len(v)))

eeaVRA:   6 files
eeaVAL:   6 files
cifsBC: 366 files
emepHC:   1 files
emepSS:   1 files
emepEM:   1 files
emepAN:   4 files
emepCO:   4 files
emepCOv2:   4 files
emepPM:   4 files


In [3]:
# save collocated datasets
def save2nc(ds=None, f=lustre%'vra2016colloc.nc'):
    if isfile(f):
        data = xr.open_dataset(f, autoclose=True).load()
        if ds:
            data = data.combine_first(ds)
            for param in ds.data_vars: 
                if 'units' not in data[param].attrs:
                    data[param].attrs.update(ds[param].attrs)
            rename(f, f+'~')
            data.to_netcdf(f, mode='w')
            #del(ds)
        return data
    elif ds:
        ds.to_netcdf(f, mode='w')
        return ds
    else:
        return xr.Dataset()

# Validated Observations
The processing of the observatiuon datasets is dealt in a separate [notebook](stations.ipynb)

## Unique stations

In [4]:
%time stat = save2nc()[['lon','lat','alt','cls']]
%time stat = stat.sel(dataset='eeaVRA').combine_first(stat.sel(dataset='eeaVAL'))
stat

CPU times: user 272 ms, sys: 6.94 s, total: 7.21 s
Wall time: 8.42 s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.11 ms


<xarray.Dataset>
Dimensions:  (poll: 6, station: 2331)
Coordinates:
    cls      (poll, station) object 'background/urban' '' '' ...
  * station  (station) object 'AD0942A' 'AD0944A' 'AD0945A' 'AL0203A' ...
  * poll     (poll) object 'CO' 'NO2' 'O3' 'PM10' 'PM25' 'SO2'
Data variables:
    lon      (station) float64 1.539 1.565 2.25 20.78 19.49 19.52 13.67 ...
    lat      (station) float64 42.51 42.52 42.75 40.63 40.4 42.31 48.39 ...
    alt      (station) float32 1080.0 1637.0 2515.0 848.0 25.0 13.0 525.0 ...
Attributes:
    source:   /home/alvarov/obs4cwf/2016_AirBase/data.background.assimilation...

# Collocation
For point-wise collocation, the lon/lat indexers need to be xarray.DataArrays.

In [5]:
def collocate(ds, lon=stat.lon, lat=stat.lat, dlon=1/4, dlat=1/8):
    """
    collocate dataset to coordinates
      for point-wise selection lon/lat need to be DataArrays (and ds.load())
      .sel(.., tolerance=max(dlat,dlon)) raise a KeyError for points outside domain
    """
    col = ds.load().sel(lon=lon, lat=lat, method='nearest')
    return col.where(abs(col.lon-lon)<dlon*0.5)\
              .where(abs(col.lat-lat)<dlat*0.5)\
              .reset_coords()

# Boundary conditions
From CIFS reanalysis. Daily files with 3-hourly records. 
- 366 files ~333M each, total 119Gb.

In [6]:
surfBCs = lambda ds: ds.rename(dict(
    longitude='lon',
    latitude='lat',
    co='CO',
    no2='NO2',
    so2='SO2',
    go3='O3',
)).sel(level=60).drop('level')
""" PM*
    aermr01='SEASALT_F',
    aermr02='SEASALT_C',
   #aermr03='SEASALT_C',    # not used
    aermr04='DUST_SAH_F',
    aermr05='DUST_SAH_F',
    aermr06*.15='DUST_SAH_F',
    aermr06*.35='DUST_SAH_C',
   #aermr07*1.7='FFIRE_OM', # not used
   #aermr08*1.7='FFIRE_OM', # not used
    aermr09='FFIRE_BC',     # not used
    aermr10='FFIRE_BC',     # not used
    aermr11='SO4',
   #aermr12='SO2',          # not used
"""

dropBCs = "aermr01 aermr02 aermr03 aermr04 aermr05 aermr06 aermr07 aermr08 aermr09 aermr10 aermr11 aermr12 hno3 pan no hcho ch4 c5h8 oh n2o5 c2h6 c3h8 hyai hybi".split()

In [7]:
%%time
ds = xr.open_mfdataset(   
    files['cifsBC'], chunks={'time':10}, concat_dim='time', autoclose=True,
    preprocess=surfBCs, drop_variables=dropBCs,
).assign_coords(dataset='cifsBC').expand_dims('dataset')

CPU times: user 14.8 s, sys: 3.02 s, total: 17.8 s
Wall time: 5min 15s


In [8]:
%%time
with ProgressBar():
    cifs = collocate(ds, dlon=1.125, dlat=1.125)

[########################################] | 100% Completed | 13min 21.3s
CPU times: user 2min 39s, sys: 3min 25s, total: 6min 4s
Wall time: 13min 26s


## Unit conversion
CIFS concentrations come in `kg/kg`, observations are in `ug/m3`

In [9]:
def unitConv(ds):
    rho = xu.exp(ds.lnsp)/(287.05 * ds.t)    
    for param in ds.data_vars: 
        if ds[param].attrs.get('units',None) == 'kg kg**-1':
            ds[param] *= 1e9*rho
            ds[param].attrs['units'] = 'ug/m3'
    return ds.drop(['t','lnsp'])
    
%time cifs = unitConv(cifs)

CPU times: user 200 ms, sys: 40 ms, total: 240 ms
Wall time: 235 ms


## Save collocated dataset

In [10]:
%time data = save2nc(cifs)
data.sel(dataset='cifsBC').drop(['lon','lat','alt']).to_dataframe().describe()

  """


CPU times: user 17.3 s, sys: 48.7 s, total: 1min 6s
Wall time: 1min 24s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,40740192.0,40740192.0,0.0,0.0,40740192.0,40740192.0
mean,162.643,10.089,,,3.149,46.672
std,113.35,9.756,,,5.382,25.808
min,55.129,0.0,,,-0.0,-0.001
25%,143.29,3.096,,,0.867,29.73
50%,181.093,8.016,,,1.825,51.695
75%,241.426,17.132,,,4.032,71.055
max,9097.746,158.011,,,136.428,248.683


# Model runs
The EMEP domain has 3 times the records and ~8 times more grid points than the CIFS domain.
- `emepHC`,  `emepSS`, `emepEM`:
  Single hindcast run, producing one **29Gb** hourly output file.
- `emepAN`, `emepCO`, `emepCOv2`, `emepPM`: 
  4 overlaping analysis runs, each producing **~8G** hourly output files.

In [11]:
def readRun(run):   
    ds = xr.Dataset()
    for fname in files[run]:
        ds = ds.combine_first(xr.open_dataset(fname, chunks={'time':6}))
    return ds.assign_coords(dataset=run).expand_dims('dataset')

In [12]:
surfEMEP = dict(
    SURF_ug_O3='O3',
    SURF_ug_NO2='NO2',
    SURF_ug_SO2='SO2',
    SURF_ug_CO='CO',
    SURF_ug_PM25_rh50='PM25',
    SURF_ug_PM10_rh50='PM10',
)

dropEMEP = 'P0 lev ilev hyam hybm hyai hybi COLUMN_NO2_k20 COLUMN_O3_k20 AOD_550nm'.split()

In [13]:
def processEMEP(run, drop=dropEMEP, surf=surfEMEP):
    if not files.get(run, None):
        return
    ds = readRun(run)
    emep = collocate(ds.drop(drop).rename(surf), dlon=1/4, dlat=1/8)
    data = save2nc(emep)
    return data.sel(dataset=run).drop(['lon','lat','alt']).to_dataframe().describe()

## Hindcast run

In [14]:
%time processEMEP('emepHC')

  """


CPU times: user 12min 43s, sys: 4min 57s, total: 17min 40s
Wall time: 43min 24s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,79.007,5.22,8.168,6.103,2.612,19.185
std,137.516,9.184,13.551,10.583,7.997,35.07
min,0.431,0.0,0.401,0.401,0.0,0.0
25%,129.522,2.348,6.295,4.105,0.381,44.783
50%,160.201,5.146,11.486,7.958,1.151,59.918
75%,202.782,10.826,19.204,14.582,3.328,74.092
max,9503.524,139.917,536.673,456.186,431.704,253.486


## Sea salt corrected BCs
Hindcast run, same set-up as `emepHC`, but with SS correction factors

In [15]:
%time processEMEP('emepSS')

  """


CPU times: user 12min 43s, sys: 6min 4s, total: 18min 48s
Wall time: 30min 31s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,79.008,5.217,8.091,6.141,2.612,19.185
std,137.527,9.179,13.449,10.636,7.997,35.066
min,0.431,0.0,0.401,0.401,0.0,0.0
25%,129.556,2.346,6.256,4.25,0.381,44.722
50%,160.234,5.141,11.407,8.183,1.153,59.839
75%,202.811,10.819,19.083,14.77,3.332,74.03
max,9503.693,139.91,536.175,456.523,431.704,253.404


## New emissions
Hindcast run, same set-up as `emepEM`, but with the new TNO-CAMS 2015 emissions

In [16]:
%time processEMEP('emepEM')

  """


CPU times: user 13min 23s, sys: 11min 36s, total: 24min 59s
Wall time: 51min 15s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,76.456,4.861,8.001,6.036,2.144,19.114
std,120.237,8.486,13.623,10.654,8.18,34.924
min,0.145,0.0,0.401,0.401,0.0,0.0
25%,125.675,2.068,5.983,3.835,0.243,45.543
50%,155.549,4.508,10.97,7.467,0.749,59.92
75%,194.718,9.543,18.557,13.956,2.242,73.421
max,6591.203,124.227,536.72,456.536,509.225,252.417


## (Re)Analysis runs
Assimilate `O3`, `NO2` & `SO2` observations from surface stations and `NO2`  trop. columns from  OMI. Current operational setup (CAMS50.201801; DA16 modules).

In [17]:
%time processEMEP('emepAN')

  """


CPU times: user 14min 55s, sys: 11min 39s, total: 26min 34s
Wall time: 29min 44s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,78.946,7.571,10.2,7.839,1.724,19.716
std,137.125,11.849,16.334,12.635,4.311,33.833
min,0.432,0.0,0.401,0.401,0.0,0.0
25%,126.313,5.072,8.195,5.349,0.578,25.752
50%,158.428,9.777,14.831,10.353,1.391,47.18
75%,201.819,17.649,24.634,18.457,3.099,66.944
max,9501.21,194.609,537.954,461.395,331.148,263.503


## Assimilate CO observations
Analysis run, same set-up as `emepAN`, but with addtional `CO` surface observations.  Same source code as operational set-up (CAMS50.201801; DA16 modules),
with minor modification to enhable `CO` assimilation. `emepCO` has a low `CO` observation rejection threshold (350 ug/m3). `emepCOv2` has double `CO` observation rejection treshold  (700 ug/m3).

In [18]:
%time processEMEP('emepCO')

  """


CPU times: user 14min 50s, sys: 21min 20s, total: 36min 11s
Wall time: 44min 25s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,81.305,7.571,10.193,7.837,1.724,19.719
std,140.696,11.849,16.31,12.633,4.311,33.81
min,0.0,0.0,0.401,0.401,0.0,0.0
25%,110.289,5.072,8.202,5.354,0.578,25.749
50%,156.34,9.777,14.85,10.365,1.391,47.172
75%,216.484,17.649,24.652,18.461,3.099,66.927
max,9623.118,194.596,537.838,461.434,330.139,263.601


In [19]:
%time processEMEP('emepCOv2')

  """


CPU times: user 13min 5s, sys: 27min 32s, total: 40min 37s
Wall time: 35min 16s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,93.625,7.571,10.157,7.795,1.724,19.728
std,167.228,11.85,16.242,12.59,4.311,33.857
min,0.0,0.0,0.401,0.401,0.0,0.0
25%,135.734,5.072,8.181,5.343,0.578,25.754
50%,187.151,9.776,14.806,10.332,1.391,47.184
75%,264.822,17.649,24.564,18.393,3.099,66.956
max,9347.808,194.59,537.808,461.388,332.551,263.024


## Assimilate PM observations
Analysis run, same set-up as `emepAN`, but with addtional `PM2.5` and `PM10` surface observations.  Development version of the assimilation modules (CAMS50.201801; DA17 modules), configured for `PM` assimilation without feerback.

In [20]:
%%time

processEMEP(
    'emepPM',
     drop=dropEMEP+['SURF_ug_PM25_rh50','SURF_ug_PM10_rh50'],
     surf=dict(
        SURF_ug_O3='O3',
        SURF_ug_NO2='NO2',
        SURF_ug_SO2='SO2',
        SURF_ug_CO='CO',
        SURF_ug_PM25_OA='PM25', # no feedback output
        SURF_ug_PM10_OA='PM10', # no feedback output
    ),
)

  """


CPU times: user 13min 43s, sys: 26min 14s, total: 39min 57s
Wall time: 35min 41s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,78.974,9.658,10.748,8.372,2.445,20.263
std,137.249,14.59,18.596,14.035,8.074,35.545
min,0.432,0.0,-271.764,-89.807,0.0,0.0
25%,126.571,5.914,9.594,5.342,0.729,30.745
50%,158.705,11.306,15.994,10.057,1.72,51.294
75%,202.15,20.962,25.845,18.375,3.77,70.729
max,9501.987,437.713,536.15,460.36,1036.666,273.538
