<a href="https://colab.research.google.com/github/benmsanderson/energybalance/blob/main/read_cmip6_4xco2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4xCO2 read data


CMIP6

In [None]:
# Run switches for the expensive / side-effecting stages of the notebook.
# readdata:  open the remote stores, regrid them and write the NetCDF output.
# authdrive: mount Google Drive before any output is written.
readdata = True
authdrive = True

Desired interpolation grid

Desired experiments

In [None]:
# Monthly (Amon) variables requested for every experiment.
flds = ['tas', 'rsut', 'rlut', 'rsdt']

# One row per experiment: (experiment_id, calstrt flag, activity_id).
# calstrt=True means the time axis is later replaced by a synthetic monthly
# axis starting in 1850-01; activity_id is the catalog activity to query.
_experiment_table = (
    ('1pctCO2', True, 'CMIP'),
    ('abrupt-4xCO2', True, 'CMIP'),
    ('piControl', True, 'CMIP'),
    ('historical', False, 'CMIP'),
    ('ssp126', False, 'ScenarioMIP'),
    ('ssp585', False, 'ScenarioMIP'),
)

# The rest of the notebook indexes these three lists in parallel.
expts = [entry[0] for entry in _experiment_table]
calstrt = [entry[1] for entry in _experiment_table]
dbe = [entry[2] for entry in _experiment_table]

Install the required packages

In [None]:
# Uninstall albumentations before installing (pip output is discarded).
# NOTE(review): presumably this avoids a dependency conflict on Colab -- TODO confirm.
!pip uninstall -y albumentations >/dev/null
# Install/upgrade the packages used below. Only matplotlib is pinned (3.1.3);
# every other package floats to its latest release, so results may change over time.
!pip install --upgrade xarray zarr gcsfs cftime pydsm nc-time-axis imgaug matplotlib==3.1.3 progress eofs cartopy netcdf4 >/dev/null



Import libraries

In [None]:
from matplotlib import pyplot as plt
from netCDF4 import num2date
import numpy as np
import pydsm.relab as relab
import numpy.matlib
import pandas as pd
import xarray as xr
import zarr
import gcsfs
import pickle
import cftime
import cartopy.crs as ccrs
import dask as da
from eofs.xarray import Eof
from sys import getsizeof
from IPython.display import HTML, display
import time

xr.set_options(display_style='html')
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#code for pretty progress bars
def progress(value, max=100):
    """Build an HTML ``<progress>`` bar for display in the notebook.

    Parameters
    ----------
    value : number
        Current progress; also written inside the element as fallback text.
    max : number, default 100
        Value at which the bar is full.

    Returns
    -------
    IPython.display.HTML
        Markup for a full-width progress element, suitable for ``display``
        and for ``DisplayHandle.update``.
    """
    # HTML attributes are separated by whitespace only (no commas).
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))


runtime has 27.3 gigabytes of available RAM



In [None]:
# Target 2-degree grid of cell centres: longitudes 1..359 and latitudes
# -89..89. np.arange excludes its stop value, so each stop is placed just past
# the last centre; otherwise the final longitude column (359) and latitude
# row (89) would be missing from the grid.
lon_out = np.arange(1, 360, 2)
lat_out = np.arange(-89, 90, 2)
# 2-D (lat, lon) versions of the same grid.
lons_sub, lats_sub = np.meshgrid(lon_out, lat_out)

Activate Google Drive to store arrays

In [None]:
# Mount Google Drive at /content/drive only when requested. The import sits
# inside the branch so google.colab is not needed when authdrive is off.
if authdrive:
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Default size (width, height) for every matplotlib figure created after this cell.
plt.rcParams['figure.figsize'] = 12, 6

## Browse Catalog

The data catalog is stored as a CSV file. Here we read it with Pandas.

In [None]:
# Catalog of CMIP6 zarr stores, one row per store; the `zstore` column holds the store URL.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv', low_memory=False)

Variables and experiments in database

In [None]:
# Sorted arrays of every variable id and experiment id present in the catalog.
# NOTE(review): `vars` shadows the Python builtin of the same name.
vars = np.sort(df.variable_id.unique())
expts_full = np.sort(df.experiment_id.unique())

# Same content as `vars`, held as a separate array.
flds_full = np.sort(df.variable_id.unique())

In [None]:
# Example catalog lookup: rows for experiment 'ssp585-bgc' and variable 'fgco2'.
# The result is only displayed; it is not assigned or reused in the cells visible here.
df[(df["experiment_id"] == 'ssp585-bgc') & (df["variable_id"]=='fgco2')]

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
241987,C4MIP,CNRM-CERFACS,CNRM-ESM2-1,ssp585-bgc,r1i1p1f2,Omon,fgco2,gn,gs://cmip6/CMIP6/C4MIP/CNRM-CERFACS/CNRM-ESM2-...,,20190711
409547,C4MIP,MIROC,MIROC-ES2L,ssp585-bgc,r1i1p1f2,Omon,fgco2,gn,gs://cmip6/CMIP6/C4MIP/MIROC/MIROC-ES2L/ssp585...,,20200124
464151,C4MIP,MIROC,MIROC-ES2L,ssp585-bgc,r1i1p1f2,Omon,fgco2,gr1,gs://cmip6/CMIP6/C4MIP/MIROC/MIROC-ES2L/ssp585...,,20200731
497096,C4MIP,CSIRO,ACCESS-ESM1-5,ssp585-bgc,r1i1p1f1,Omon,fgco2,gn,gs://cmip6/CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/ssp...,,20201016


In [None]:
# Display the sorted experiment ids found in the catalog. A NumPy array has
# no `filter` attribute, so the array itself is shown.
expts_full

array(['1pctCO2', '1pctCO2-4xext', '1pctCO2-bgc', '1pctCO2-cdr',
       '1pctCO2-rad', '1pctCO2to4x-withism', 'abrupt-0p5xCO2',
       'abrupt-2xCO2', 'abrupt-4xCO2', 'abrupt-solm4p', 'abrupt-solp4p',
       'amip', 'amip-4xCO2', 'amip-future4K', 'amip-hist', 'amip-lwoff',
       'amip-m4K', 'amip-p4K', 'amip-p4K-lwoff', 'aqua-4xCO2',
       'aqua-control', 'aqua-control-lwoff', 'aqua-p4K', 'aqua-p4K-lwoff',
       'control-1950', 'dcppA-assim', 'dcppA-hindcast',
       'dcppC-amv-ExTrop-neg', 'dcppC-amv-ExTrop-pos',
       'dcppC-amv-Trop-neg', 'dcppC-amv-Trop-pos', 'dcppC-amv-neg',
       'dcppC-amv-pos', 'dcppC-atl-control', 'dcppC-atl-pacemaker',
       'dcppC-hindcast-noAgung', 'dcppC-hindcast-noElChichon',
       'dcppC-hindcast-noPinatubo', 'dcppC-ipv-NexTrop-neg',
       'dcppC-ipv-NexTrop-pos', 'dcppC-ipv-neg', 'dcppC-ipv-pos',
       'dcppC-pac-control', 'dcppC-pac-pacemaker', 'deforest-globe',
       'esm-hist', 'esm-pi-CO2pulse', 'esm-pi-cdr-pulse', 'esm-piControl',
       

Make dataframe for each experiment type and each field

In [None]:
# df_all1[i][j]: catalog rows (monthly 'Amon' table) for experiment expts[i]
# and variable flds[j], restricted to the activity listed in dbe[i].
df_all1 = [
    [
        df.query(
            f"activity_id=='{activity}' & table_id == 'Amon'"
            f" & variable_id == '{fld}' & experiment_id == '{expt}'"
        )
        for fld in flds
    ]
    for activity, expt in zip(dbe, expts)
]

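List the unique models that provide the first field (`tas`) for the first experiment (`1pctCO2`). These are the candidates; models missing any experiment or field are filtered out further below.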

In [None]:
# Sorted model names (source_id) that appear in the catalog for expts[0] / flds[0].
mdls1 = np.sort(df_all1[0][0].source_id.unique())
mdls1

array(['ACCESS-CM2', 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'BCC-CSM2-MR',
       'BCC-ESM1', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'CESM2', 'CESM2-FV2',
       'CESM2-WACCM', 'CESM2-WACCM-FV2', 'CIESM', 'CMCC-CM2-SR5',
       'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1',
       'CanESM5', 'CanESM5-CanOE', 'E3SM-1-0', 'EC-Earth3',
       'EC-Earth3-AerChem', 'EC-Earth3-Veg', 'FGOALS-f3-L', 'FGOALS-g3',
       'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4', 'GISS-E2-1-G',
       'GISS-E2-1-H', 'GISS-E2-2-G', 'HadGEM3-GC31-LL', 'HadGEM3-GC31-MM',
       'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'KACE-1-0-G',
       'KIOST-ESM', 'MCM-UA-1-0', 'MIROC-ES2L', 'MIROC6',
       'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0',
       'NESM3', 'NorCPM1', 'NorESM2-LM', 'NorESM2-MM', 'SAM0-UNICON',
       'TaiESM1', 'UKESM1-0-LL'], dtype=object)

Make some empty dataframes to store concise list

In [None]:
# df_all[i][j] starts as an empty table with the catalog's columns; it is
# filled later with one row per retained model (experiment i, field j).
cnames = df_all1[0][0].columns
df_all = [
    [pd.DataFrame(columns=cnames) for _ in flds]
    for _ in expts
]

Now get one ensemble member for each model, if it exists, for each experiment. A model is only added to `df_all` (and to the final list `mdls`) if it has the full set of experiments and fields.

In [None]:
# Select, for every candidate model, ONE ensemble member per experiment and
# copy its catalog rows into df_all[i][j] (experiment i, field j).
#
# All fields of an experiment are taken from the same member_id, so that the
# radiative fluxes and temperature describe the same simulation. A model is
# kept only if every experiment has at least one member providing all fields.
mdls = []
n = 0  # row label shared by all df_all tables for the model being added
for mdl in mdls1:
    selected = []  # selected[i][j]: catalog row for experiment i, field j
    for i, ext in enumerate(expts):
        # Catalog rows of this model, one sub-table per field.
        per_field = [tbl[tbl['source_id'] == mdl] for tbl in df_all1[i]]

        # Members that provide every field of this experiment.
        complete = set(per_field[0]['member_id'])
        for rows in per_field[1:]:
            complete &= set(rows['member_id'])
        if not complete:
            selected = None  # experiment incomplete -> drop the model
            break

        # First complete member, in the catalog order of the first field.
        member = next(m for m in per_field[0]['member_id'] if m in complete)
        # A member can be listed more than once (several grids/versions);
        # the first matching row is used.
        selected.append(
            [rows[rows['member_id'] == member].values[0] for rows in per_field]
        )

    if selected is None:
        continue

    for i, exp_rows in enumerate(selected):
        for j, row in enumerate(exp_rows):
            df_all[i][j].loc[n] = row
    # add model to final list
    mdls.append(mdl)
    n += 1
    
    

In [None]:
# Save the final model list to Drive. The context manager closes (and
# therefore flushes) the file as soon as the dump is done.
with open("/content/drive/MyDrive/colab_4xco2/mdls.pkl", "wb") as fh:
    pickle.dump(mdls, fh)


In [None]:
# Show the catalog entry stored under row label 1 for expts[1] / flds[2].
# `.loc` must be indexed with brackets; calling it returns the indexer object
# instead of the row.
df_all[1][2].loc[1]

<pandas.core.indexing._LocIndexer at 0x7fee92a7d770>

## Load Data

Load Google file system


In [None]:
# Google Cloud Storage filesystem handle, created with token='anon'
# (no credentials are supplied); it is used below to map the zstore URLs.
gcs = gcsfs.GCSFileSystem(token='anon')

Loop through zstore links, use zarr to open


In [None]:
# Loop sizes used for the progress bars: retained models, fields, experiments.
nm, nf, ne = len(mdls), len(flds), len(expts)

In [None]:

# Open every selected zarr store. The result is nested as
# dsall[experiment][field][model], following the layout of df_all.
if readdata:
  out = display(progress(0, 1), display_id=True)
  total = ne*nm*nf
  dsall=[]
  for i,df_ta in enumerate(df_all):
    dsm=[]
    # The loop variable is `df_fld`, not `df`, so that the catalog DataFrame
    # `df` read earlier is not overwritten by this cell.
    for j,df_fld in enumerate(df_ta):
      ds=[]
      print(expts[i]+','+flds[j])
      for index, item in enumerate(df_fld.zstore.values):
        mapper=gcs.get_mapper(item)
        # decode_times=False keeps the raw numeric time axis; it is converted
        # to dates in a later cell.
        ds.append(xr.open_zarr(mapper, decode_times=False))
        # +1 so the bar is full once the last store has been opened
        out.update(progress(index+1+j*nm+i*nm*nf, total))
      dsm.append(ds)
    dsall.append(dsm)

1pctCO2,tas
1pctCO2,rsut
1pctCO2,rlut
1pctCO2,rsdt
abrupt-4xCO2,tas
abrupt-4xCO2,rsut
abrupt-4xCO2,rlut
abrupt-4xCO2,rsdt
piControl,tas
piControl,rsut
piControl,rlut
piControl,rsdt
historical,tas
historical,rsut
historical,rlut
historical,rsdt
ssp126,tas
ssp126,rsut
ssp126,rlut
ssp126,rsdt
ssp585,tas
ssp585,rsut
ssp585,rlut
ssp585,rsdt


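Regrid each field to the common grid, take annual means, and concatenate the models into one DataArray per experiment and field (along a new `ens` dimension). Global means are computed later.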


In [None]:
 
# For every experiment and field: regrid each model onto (lat_out, lon_out),
# build a usable time axis, reduce to annual means and stack the models along
# a new 'ens' dimension. Result: dall[experiment][field].
if readdata:
  max_months = 4800  # use at most the first 4800 time steps of a run
  total = ne*nm*nf
  dall=[]
  out = display(progress(0, 1), display_id=True)
  for i,ds in enumerate(dsall):
    dexp=[]
    for j,dm in enumerate(ds):
      print(expts[i]+','+flds[j])
      for index, dd in enumerate(dm):
        # Harmonise coordinate names so that interp() can address lon/lat.
        if 'longitude' in dd.keys():
          dd=dd.rename({'longitude': 'lon','latitude': 'lat'})
        if 'latitude' in dd.coords:
          dd=dd.drop_vars('latitude')
          dd=dd.drop_dims('latitude')

        # Select by dimension name rather than by position, so the slice does
        # not depend on the order of the array's dimensions.
        tmp=dd[flds[j]].isel(time=slice(None, max_months)).interp(lon=lon_out,lat=lat_out, kwargs={"fill_value": "extrapolate"})
        if calstrt[i]:
          # Replace the time axis with a synthetic monthly axis starting in 1850.
          tmp.coords['time']=pd.date_range('1850-01-01', periods=tmp['time'].values.shape[0],freq='M')
        if tmp['time'].dtype=='float64' or  tmp['time'].dtype=='int64':
          # Decode the numeric time axis with the calendar recorded on it.
          # Without the calendar argument num2date assumes 'standard', which
          # shifts the dates (and hence the years) of axes that declare a
          # different calendar.
          calendar=tmp['time'].attrs.get('calendar','standard')
          tmp.coords['time']=num2date(tmp['time'].values,tmp['time'].units,calendar=calendar)
        # NOTE: splicing historical onto ssp585 and trimming by year were
        # sketched here previously but are not enabled.
        srm=tmp.groupby('time.year').mean('time')
        if index==0:
          dac=srm
        else:
          dac=xr.concat([dac,srm],'ens',coords='minimal',compat='override')
        # +1 so the bar is full once the last array has been processed
        out.update(progress(index+1+j*nm+i*nm*nf, total))
      dexp.append(dac)
    dall.append(dexp)

1pctCO2,tas
1pctCO2,rsut


  **blockwise_kwargs,


1pctCO2,rlut
1pctCO2,rsdt


  **blockwise_kwargs,


abrupt-4xCO2,tas
abrupt-4xCO2,rsut
abrupt-4xCO2,rlut
abrupt-4xCO2,rsdt
piControl,tas
piControl,rsut
piControl,rlut
piControl,rsdt
historical,tas


  **blockwise_kwargs,


historical,rsut


  **blockwise_kwargs,


historical,rlut


  **blockwise_kwargs,


historical,rsdt


  **blockwise_kwargs,


ssp126,tas


  **blockwise_kwargs,


ssp126,rsut
ssp126,rlut
ssp126,rsdt
ssp585,tas


  **blockwise_kwargs,


ssp585,rsut
ssp585,rlut


  **blockwise_kwargs,


ssp585,rsdt


In [None]:
# Inspect one processed array: experiment expts[5] ('ssp585'), field flds[0]
# ('tas'), then index 0 along the array's leading dimension.
dall[5][0][0]

Unnamed: 0,Array,Chunk
Bytes,5.54 MB,63.72 kB
Shape,"(87, 89, 179)","(1, 89, 179)"
Count,81536 Tasks,87 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 5.54 MB 63.72 kB Shape (87, 89, 179) (1, 89, 179) Count 81536 Tasks 87 Chunks Type float32 numpy.ndarray",179  89  87,

Unnamed: 0,Array,Chunk
Bytes,5.54 MB,63.72 kB
Shape,"(87, 89, 179)","(1, 89, 179)"
Count,81536 Tasks,87 Chunks
Type,float32,numpy.ndarray


Merge the fields of each experiment and save them to NetCDF




In [None]:

# Merge the per-field arrays of each experiment into one Dataset and write it
# to Drive as <experiment>.nc. Global means are computed from these files in
# a separate cell.
if readdata:
  out = display(progress(0, 1), display_id=True)
  for i,d in enumerate(dall):
    tmp=xr.merge(d)
    tmp.to_netcdf('/content/drive/MyDrive/colab_4xco2/'+expts[i]+'.nc')
    # i+1 so the bar is full after the last experiment has been written
    out.update(progress(i+1,ne))



  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,
  **blockwise_kwargs,


In [None]:

  
# Re-open each experiment file, reduce it to a cos(latitude)-weighted mean
# over lat and then lon, and write the result as <experiment>_gm.nc.
if readdata:
  out = display(progress(0, 1), display_id=True)
  for i,d in enumerate(dall):
    # The context manager closes the source file once the means are written.
    with xr.open_dataset('/content/drive/MyDrive/colab_4xco2/'+expts[i]+'.nc') as tmp:
      lat=tmp.tas.lat
      # Weights proportional to the cosine of latitude.
      weights = np.cos(np.deg2rad(lat))
      weights.name = "weights"
      tmp_gm=tmp.weighted(weights).mean(dim='lat').mean(dim='lon')
      tmp_gm.to_netcdf('/content/drive/MyDrive/colab_4xco2/'+expts[i]+'_gm.nc')
    # i+1 so the bar is full after the last experiment has been processed
    out.update(progress(i+1,ne))
