# CAMS50 VRA2016: Collocated model results
CAMS50 runs a reanalysis with validated obrvations 2 years after the fact.

In [1]:
from glob import glob
from os.path import isfile, basename, dirname
from os import remove, rename

import numpy as np
import pandas as pd
import xarray as xr
import xarray.ufuncs as xu
from dask.diagnostics import ProgressBar

# only 3 decimal points on df.head() and df.describe()
pd.options.display.float_format = '{:,.3f}'.format

for m in [np, pd, xr]:
    print("%s %s"%(m.__name__, m.__version__))

numpy 1.14.2
pandas 0.23.4
xarray 0.10.8


## Datasets
- `eeaVRA`: validated surface obs for data assimilation
- `eeaVAL`: validated surface obs for model evaluataion
- `cifsBC`: CIFS boundary conditions
- `emepHC`: hindcast run (no DA), operational version (CAMS50.201801)
- `emepSS`: hindcast run (no DA), operational version (CAMS50.201801)
- `emepEM`: hindcast run (no DA), operational version (CAMS50.201801), new TNO-CAMS 2015 emissions
- `emepCM09`: hindcast run (no DA), development version (dev@59c7d07), EMEP 2009 chemical mechanism (EmChem09)
- `emepCM16`: hindcast run (no DA), development version (dev@59c7d07), EMEP 2016 chemical mechanism (EmChem16x)
- `emepAN`: (re)analysis run (DA: NO2,O3,SO2), operational version (CAMS50.201801; DA16)
- `emepCO`: (re)analysis run (DA: NO2,O3,SO2,CO), operational version (CAMS50.201801; DA16) low rejection threshold (350 ug/m3)
- `emepCOv2`: (re)analysis run (DA: NO2,O3,SO2,CO), operational version (CAMS50.201801; DA16) higher rejection threshold (700 ug/m3)
- `emepPM`: (re)analysis run (DA: NO2,O3,SO2,PM25,PM10), development version (CAMS50.201801; DA17 wo/PM feedback)
- `emepPMv2`: (re)analysis run (DA: NO2,O3,SO2,PM25,PM10), development version (CAMS50.201801; DA17 PM2.5 wo/feedback, PM10 w/feedback)
- `emepPMv3`: (re)analysis run (DA: NO2,O3,SO2,CO,PM10), development version (CAMS50.201801; DA17 PM10 w/feedback, no PM2.5)

In [2]:
lustre = "/lustre/storeA/users/alvarov/CAMS50/%s"
files = dict(
    eeaVRA=glob(lustre%'obs/VRA_2016/assimilation_*.nc'),
    eeaVAL=glob(lustre%'obs/VRA_2016/validation_*.nc'),
    cifsBC=glob(lustre%'2016_VRA/VRA_2016????_EU_EVA.nc'),
    emepHC=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00-2016.nc'),
    emepSS=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00SS-2016.nc'),
    emepEM=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00EM-2016.nc'),
    emepCM09=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00CM09-2016.nc'),
    emepCM16=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00CM16-2016.nc'),
    emepAN=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00AN-2016Q?.nc'),
    emepCO=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00CO-2016Q?.nc'),
    emepCOv2=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00COv2-2016Q?.nc'),
    emepPM=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00PM-2016Q?.nc'),
    emepPMv2=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00PMv2-2016Q?.nc'),
    emepPMv3=glob(lustre%'VRA-2016/BM_CAMS50.201801/VRA00PMv3-2016Q?.nc'),
)
for k,v in files.items():
    print("%s: %3d files"%(k,len(v)))

eeaVRA:   6 files
eeaVAL:   6 files
cifsBC: 366 files
emepHC:   1 files
emepSS:   1 files
emepEM:   1 files
emepCM09:   1 files
emepCM16:   1 files
emepAN:   4 files
emepCO:   4 files
emepCOv2:   4 files
emepPM:   4 files
emepPMv2:   4 files
emepPMv3:   4 files


In [3]:
# save collocated datasets
def save2nc(ds=None, f=lustre%'vra2016colloc.nc'):
    if isfile(f):
        data = xr.open_dataset(f, autoclose=True).load()
        if ds:
            data = data.combine_first(ds)
            for param in ds.data_vars: 
                if 'units' not in data[param].attrs:
                    data[param].attrs.update(ds[param].attrs)
            rename(f, f+'~')
            data.to_netcdf(f, mode='w')
            #del(ds)
        return data
    elif ds:
        ds.to_netcdf(f, mode='w')
        return ds
    else:
        return xr.Dataset()

# Validated Observations
The processing of the observatiuon datasets is dealt in a separate [notebook](stations.ipynb)

## Unique stations

In [4]:
%time stat = save2nc()[['lon','lat','alt','cls']]
%time stat = stat.sel(dataset='eeaVRA').combine_first(stat.sel(dataset='eeaVAL'))
stat

CPU times: user 312 ms, sys: 17.7 s, total: 18 s
Wall time: 28.7 s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 9.12 ms


<xarray.Dataset>
Dimensions:  (poll: 6, station: 2331)
Coordinates:
    cls      (poll, station) object 'background/urban' '' '' ...
  * poll     (poll) object 'CO' 'NO2' 'O3' 'PM10' 'PM25' 'SO2'
  * station  (station) object 'AD0942A' 'AD0944A' 'AD0945A' 'AL0203A' ...
Data variables:
    lon      (station) float64 1.539 1.565 2.25 20.78 19.49 19.52 13.67 ...
    lat      (station) float64 42.51 42.52 42.75 40.63 40.4 42.31 48.39 ...
    alt      (station) float32 1080.0 1637.0 2515.0 848.0 25.0 13.0 525.0 ...
Attributes:
    source:   /home/alvarov/obs4cwf/2016_AirBase/data.background.assimilation...

# Collocation
For point-wise collocation, the lon/lat indexers need to be xarray.DataArrays.

In [5]:
def collocate(ds, lon=stat.lon, lat=stat.lat, dlon=1/4, dlat=1/8):
    """
    collocate dataset to coordinates
      for point-wise selection lon/lat need to be DataArrays (and ds.load())
      .sel(.., tolerance=max(dlat,dlon)) raise a KeyError for points outside domain
    """
    col = ds.load().sel(lon=lon, lat=lat, method='nearest')
    return col.where(
        np.logical_and(abs(col.lon-lon)<dlon*0.5,
                       abs(col.lat-lat)<dlat*0.5)
    ).reset_coords()

# Boundary conditions
From CIFS reanalysis. Daily files with 3-hourly records. 
- 366 files ~333M each, total 119Gb.

In [6]:
surfBCs = lambda ds: ds.rename(dict(
    longitude='lon',
    latitude='lat',
    co='CO',
    no2='NO2',
    so2='SO2',
    go3='O3',
)).sel(level=60).drop('level')
""" PM*
    aermr01='SEASALT_F',
    aermr02='SEASALT_C',
   #aermr03='SEASALT_C',    # not used
    aermr04='DUST_SAH_F',
    aermr05='DUST_SAH_F',
    aermr06*.15='DUST_SAH_F',
    aermr06*.35='DUST_SAH_C',
   #aermr07*1.7='FFIRE_OM', # not used
   #aermr08*1.7='FFIRE_OM', # not used
    aermr09='FFIRE_BC',     # not used
    aermr10='FFIRE_BC',     # not used
    aermr11='SO4',
   #aermr12='SO2',          # not used
"""

dropBCs = "aermr01 aermr02 aermr03 aermr04 aermr05 aermr06 aermr07 aermr08 aermr09 aermr10 aermr11 aermr12 hno3 pan no hcho ch4 c5h8 oh n2o5 c2h6 c3h8 hyai hybi".split()

In [7]:
%%time
ds = xr.open_mfdataset(   
    files['cifsBC'], chunks={'time':10}, concat_dim='time', autoclose=True,
    preprocess=surfBCs, drop_variables=dropBCs,
).assign_coords(dataset='cifsBC').expand_dims('dataset')

CPU times: user 16.4 s, sys: 2.9 s, total: 19.3 s
Wall time: 3min 53s


In [8]:
%%time
with ProgressBar():
    cifs = collocate(ds, dlon=1.125, dlat=1.125)

[########################################] | 100% Completed |  9min 14.3s
CPU times: user 2min 44s, sys: 2min 41s, total: 5min 25s
Wall time: 9min 16s


## Unit conversion
CIFS concentrations come in `kg/kg`, observations are in `ug/m3`

In [9]:
def unitConv(ds):
    rho = xu.exp(ds.lnsp)/(287.05 * ds.t)    
    for param in ds.data_vars: 
        if ds[param].attrs.get('units',None) == 'kg kg**-1':
            ds[param] *= 1e9*rho
            ds[param].attrs['units'] = 'ug/m3'
    return ds.drop(['t','lnsp'])
    
%time cifs = unitConv(cifs)

CPU times: user 148 ms, sys: 32 ms, total: 180 ms
Wall time: 178 ms


## Save collocated dataset

In [10]:
%time data = save2nc(cifs)
data.sel(dataset='cifsBC').drop(['lon','lat','alt']).to_dataframe().describe()

  """


CPU times: user 5.63 s, sys: 4.11 s, total: 9.74 s
Wall time: 10.4 s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,40722624.0,40722624.0,0.0,0.0,40722624.0,40722624.0
mean,162.7,10.093,,,3.15,46.671
std,113.343,9.756,,,5.383,25.81
min,55.498,0.0,,,-0.0,-0.001
25%,143.327,3.101,,,0.868,29.717
50%,181.124,8.022,,,1.826,51.695
75%,241.463,17.137,,,4.034,71.053
max,9097.746,158.011,,,136.428,248.683


# Model runs
The EMEP domain has 3 times the records and ~8 times more grid points than the CIFS domain.
- `emepHC`,  `emepSS`, `emepEM`:
  Single hindcast run, producing one **29Gb** hourly output file.
- `emepAN`, `emepCO`, `emepCOv2`, `emepPM`, `emepPMv2`, , `emepPMv3`: 
  4 overlaping analysis runs, each producing **~8G** hourly output files.

In [6]:
def readRun(run):   
    ds = xr.Dataset()
    for fname in files[run]:
        ds = ds.combine_first(xr.open_dataset(fname, chunks={'time':6}))
    return ds.assign_coords(dataset=run).expand_dims('dataset')

In [7]:
surfEMEP = dict(
    SURF_ug_O3='O3',
    SURF_ug_NO2='NO2',
    SURF_ug_SO2='SO2',
    SURF_ug_CO='CO',
    SURF_ug_PM25_rh50='PM25',
    SURF_ug_PM10_rh50='PM10',
)

dropEMEP = 'P0 lev ilev hyam hybm hyai hybi COLUMN_NO2_k20 COLUMN_O3_k20 AOD_550nm'.split()

In [8]:
def processEMEP(run, drop=dropEMEP, surf=surfEMEP):
    if not files.get(run, None):
        return
    ds = readRun(run)
    emep = collocate(ds.drop(drop).rename(surf), dlon=1/4, dlat=1/8)
    data = save2nc(emep)
    return data.sel(dataset=run).drop(['lon','lat','alt']).to_dataframe().describe()

## Hindcast run

In [14]:
%time processEMEP('emepHC')

  """


CPU times: user 13min 54s, sys: 1min 19s, total: 15min 13s
Wall time: 1h 2min 54s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,76.621,5.231,8.092,5.943,2.65,17.988
std,144.101,9.478,13.68,10.781,8.363,34.15
min,0.482,0.0,0.401,0.401,0.0,0.0
25%,130.466,2.515,6.402,4.19,0.417,44.404
50%,161.577,5.465,11.631,8.088,1.224,59.66
75%,205.244,11.385,19.418,14.78,3.48,73.892
max,9503.524,139.917,536.673,456.186,431.704,253.486


## Sea salt corrected BCs
Hindcast run, same set-up as `emepHC`, but with SS correction factors

In [15]:
%time processEMEP('emepSS')

  """


CPU times: user 12min 44s, sys: 1min 32s, total: 14min 16s
Wall time: 28min 15s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,76.629,5.229,8.016,5.988,2.651,17.988
std,144.109,9.473,13.591,10.824,8.363,34.135
min,0.483,0.0,0.401,0.401,0.0,0.0
25%,130.5,2.513,6.364,4.333,0.418,44.343
50%,161.611,5.461,11.553,8.309,1.226,59.581
75%,205.273,11.377,19.296,14.963,3.484,73.83
max,9503.693,139.91,536.175,456.523,431.704,253.404


## New emissions
Hindcast run, same set-up as `emepEM`, but with the new TNO-CAMS 2015 emissions

In [16]:
%time processEMEP('emepEM')

  """


CPU times: user 11min 49s, sys: 1min 32s, total: 13min 22s
Wall time: 55min 51s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,73.602,4.81,7.916,5.891,2.252,17.906
std,126.695,8.809,13.793,10.875,8.587,33.959
min,0.145,0.0,0.401,0.401,0.0,0.0
25%,126.517,2.202,6.089,3.915,0.267,45.206
50%,156.731,4.783,11.111,7.588,0.801,59.691
75%,196.883,10.059,18.763,14.148,2.359,73.252
max,6591.203,124.227,536.72,456.536,509.225,252.417


## Development version
Hindcast run, same set-up as `emepHC`, but with from dev@59c7d07 and different chemical mechanism (`emepCM09`:EmChem09, `emepCM16`:EmChem16x).

In [9]:
%time processEMEP('emepCM09')

  """


CPU times: user 16min 19s, sys: 2min 48s, total: 19min 8s
Wall time: 32min 5s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,79.01,6.142,5.429,3.635,2.756,19.923
std,137.467,10.585,11.06,6.685,8.35,38.408
min,0.432,0.001,0.28,0.232,0.0,0.0
25%,128.019,2.967,3.939,2.48,0.657,48.405
50%,158.81,6.544,7.015,4.44,1.571,65.475
75%,202.075,13.494,11.75,7.869,3.837,83.006
max,9500.21,139.503,528.904,455.192,468.628,301.452


In [10]:
%time processEMEP('emepCM16')

  """


CPU times: user 17min 6s, sys: 3min 14s, total: 20min 21s
Wall time: 47min 32s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,79.173,5.088,8.296,5.918,2.589,19.691
std,138.296,9.08,13.836,10.063,8.109,37.124
min,0.432,0.0,0.28,0.232,0.0,0.0
25%,131.614,2.31,6.242,4.115,0.362,48.186
50%,161.773,4.842,11.417,7.969,1.102,64.288
75%,204.484,10.201,19.206,14.497,3.228,80.694
max,9504.599,139.616,532.961,460.211,467.572,286.646


## (Re)Analysis runs
Assimilate `O3`, `NO2` & `SO2` observations from surface stations and `NO2`  trop. columns from  OMI. Current operational setup (CAMS50.201801; DA16 modules).

In [17]:
%time processEMEP('emepAN')

  """


CPU times: user 13min 25s, sys: 5min 45s, total: 19min 10s
Wall time: 26min 39s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,76.374,7.718,9.741,7.788,1.724,18.601
std,143.803,12.123,16.781,12.74,4.42,33.921
min,0.479,0.0,0.401,0.401,0.0,0.0
25%,127.252,5.284,8.316,5.448,0.613,25.435
50%,159.806,10.119,15.007,10.5,1.451,46.913
75%,204.274,18.224,24.92,18.701,3.165,66.721
max,9501.21,194.609,537.954,461.395,331.148,263.503


## Assimilate CO observations
Analysis run, same set-up as `emepAN`, but with addtional `CO` surface observations.  Same source code as operational set-up (CAMS50.201801; DA16 modules),
with minor modification to enhable `CO` assimilation. `emepCO` has a low `CO` observation rejection threshold (350 ug/m3). `emepCOv2` has double `CO` observation rejection treshold  (700 ug/m3).

In [18]:
%time processEMEP('emepCO')

  """


CPU times: user 13min 23s, sys: 5min 24s, total: 18min 47s
Wall time: 26min 41s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,78.146,7.717,9.749,7.795,1.724,18.605
std,146.966,12.123,16.802,12.743,4.419,33.911
min,0.0,0.0,0.401,0.401,0.0,0.0
25%,110.061,5.283,8.33,5.457,0.613,25.433
50%,158.24,10.118,15.017,10.511,1.451,46.906
75%,220.412,18.223,24.914,18.702,3.165,66.706
max,9623.118,194.596,537.838,461.434,330.139,263.601


In [19]:
%time processEMEP('emepCOv2')

  """


CPU times: user 13min 8s, sys: 5min 14s, total: 18min 23s
Wall time: 22min 17s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,92.464,7.717,9.709,7.732,1.724,18.614
std,174.258,12.123,16.639,12.698,4.419,33.932
min,0.0,0.0,0.401,0.401,0.0,0.0
25%,137.196,5.283,8.296,5.439,0.613,25.438
50%,189.893,10.118,14.963,10.469,1.451,46.918
75%,269.02,18.222,24.808,18.614,3.165,66.73
max,9347.808,194.59,537.808,461.388,332.551,263.024


## Assimilate PM observations
Analysis run, same set-up as `emepAN`, but with addtional `PM2.5` and `PM10` surface observations.  Development version of the assimilation modules (CAMS50.201801; DA17 modules), configured for `PM` assimilation without feerback.
On `emepANv2` `PM10` observations are assimilated with feedback and `PM2.5` observations are assimilated without feerback.
`emepANv3` also assimlates `CO` observations; `PM10` observations are assimilated with feedback, but no `PM2.5` observations are assimilated.

In [20]:
%%time

processEMEP(
    'emepPM',
     drop=dropEMEP+['SURF_ug_PM25_rh50','SURF_ug_PM10_rh50'],
     surf=dict(
        SURF_ug_O3='O3',
        SURF_ug_NO2='NO2',
        SURF_ug_SO2='SO2',
        SURF_ug_CO='CO',
        SURF_ug_PM25_AN='PM25',   # no feedback
        SURF_ug_PM10_AN='PM10',   # no feedback
    ),
)

  """


CPU times: user 14min 19s, sys: 4min 21s, total: 18min 41s
Wall time: 26min 3s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,76.461,9.232,10.622,8.997,2.424,19.085
std,143.888,15.123,18.435,14.041,8.199,35.151
min,0.48,0.0,0.0,0.0,0.0,0.0
25%,127.562,6.161,9.112,6.048,0.768,30.411
50%,160.098,11.722,16.565,11.727,1.787,51.028
75%,204.551,21.655,27.864,20.939,3.885,70.509
max,9501.986,437.712,538.926,462.206,1036.666,273.538


In [21]:
%%time

processEMEP(
    'emepPMv2',
     drop=dropEMEP+['SURF_ug_PM25_rh50','SURF_ug_PM10_AN'],
     surf=dict(
        SURF_ug_O3='O3',
        SURF_ug_NO2='NO2',
        SURF_ug_SO2='SO2',
        SURF_ug_CO='CO',
        SURF_ug_PM25_AN='PM25',    # no feedback
        SURF_ug_PM10_rh50='PM10',  # with feedback
    ),
)

  """


CPU times: user 14min 8s, sys: 5min 37s, total: 19min 45s
Wall time: 29min 40s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0,120547770.0
mean,76.458,9.232,11.358,9.477,2.424,19.088
std,143.871,15.122,23.77,19.037,8.2,35.15
min,0.48,0.0,0.004,0.0,0.0,0.0
25%,127.588,6.162,9.681,6.645,0.769,30.406
50%,160.074,11.723,16.13,12.14,1.787,51.025
75%,204.517,21.653,26.309,21.035,3.884,70.506
max,9501.969,437.966,11068.503,7955.356,1036.682,273.776


In [9]:
%%time

processEMEP(
    'emepPMv3',
     drop=dropEMEP+['SURF_ug_PM25_AN','SURF_ug_PM10_AN'],
     surf=dict(
        SURF_ug_O3='O3',
        SURF_ug_NO2='NO2',
        SURF_ug_SO2='SO2',
        SURF_ug_CO='CO',
        SURF_ug_PM25_rh50='PM25',  # no DA
        SURF_ug_PM10_rh50='PM10',  # with feedback
    ),
)

  """


CPU times: user 13min 27s, sys: 9min 45s, total: 23min 12s
Wall time: 27min 25s


Unnamed: 0,CO,NO2,PM10,PM25,SO2,O3
count,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0,112852110.0
mean,110.525,9.658,11.516,9.835,2.445,20.278
std,186.328,14.589,23.892,18.956,8.075,35.564
min,0.0,0.0,0.004,0.004,0.0,0.0
25%,145.832,5.916,9.558,6.54,0.73,30.741
50%,207.94,11.305,15.947,11.975,1.72,51.299
75%,303.447,20.957,25.99,20.755,3.77,70.747
max,9301.333,437.981,11481.909,8275.021,1036.684,273.589
