<a href="https://colab.research.google.com/github/benmsanderson/prpattern/blob/main/CMIP6_get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4xCO2 read data


CMIP6

In [1]:
# CMIP6 `source_id` values (model names) to look for in the catalog.
src = ["CESM2", "CESM3"]

In [2]:
# One row per experiment: (experiment_id, reset calendar?, activity_id).
# The three lists unpacked below are parallel and are indexed together later on:
#   expts   - experiment_id values to fetch
#   calstrt - True -> the time axis is replaced by a monthly calendar starting 1850-01
#   dbe     - activity_id under which the experiment is filed in the catalog
_experiment_table = [
    ("1pctCO2", True, "CMIP"),
    ("abrupt-4xCO2", True, "CMIP"),
    ("piControl", True, "CMIP"),
    ("historical", False, "CMIP"),
    ("ssp126", False, "ScenarioMIP"),
    ("ssp585", False, "ScenarioMIP"),
]
expts, calstrt, dbe = (list(column) for column in zip(*_experiment_table))

State resolution of emulator

In [5]:
import numpy as np

# Target grid onto which every model field is interpolated (2-degree spacing).
# NOTE(review): np.arange excludes its stop value, so the grid covers
# 1..357 degrees east (179 points) and -89..87 degrees north (89 points);
# the 359E column and the 89N row are not included. Confirm this is intended.
lon_out = np.arange(1, 359, 2)
lat_out = np.arange(-89, 89, 2)

# 2-D versions of the grid, shaped (lat, lon).
lons_sub, lats_sub = np.meshgrid(lon_out, lat_out, indexing="xy")

In [6]:
# Run switches (truthy = enabled).
# readdata  - gates the download, regrid and save cells further down.
# authdrive - NOTE(review): not referenced in the visible cells; presumably
#             it controlled mounting Google Drive. Confirm before removing.
readdata, authdrive = 1, 1

In [7]:
# Directory where the per-experiment NetCDF files are written and read back.
datadir = "/content/drive/MyDrive/colab_4xco2"

Desired fields (CMIP6 `variable_id` values)

In [8]:
# CMIP6 `variable_id` values to fetch for every experiment.
flds = ["tas"]

Import stuff

In [10]:
from matplotlib import pyplot as plt

import numpy.matlib
import pandas as pd
import xarray as xr
import zarr
import gcsfs
import pickle
import cftime
from sys import getsizeof
from IPython.display import HTML, display
import time




## Browse Catalog

The data catalog is stored as a CSV file. Here we read it with Pandas.

In [None]:
# Full catalog of the CMIP6 zarr stores, one row per store.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv',
    low_memory=False,
)

Variables and experiments in database

Make dataframe for each experiment type and each field

In [None]:
# Sorted listings of what the catalog contains, for reference when choosing
# the experiments, fields and models to fetch.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# so that any code relying on it keeps working.
vars = np.sort(df.variable_id.unique())

expts_full = np.sort(df.experiment_id.unique())

flds_full = np.sort(df.variable_id.unique())

# Model names live in the `source_id` column (this listing was previously
# built from `variable_id`, which just duplicated `flds_full`).
src_full = np.sort(df.source_id.unique())

In [None]:
# Catalog rows for every requested (experiment, field) pair, restricted to the
# 'Amon' table. Layout: df_all1[i][j] <-> expts[i], flds[j].
df_all1 = []
for activity, experiment in zip(dbe, expts):
    per_field = []
    for fld in flds:
        selection = (
            f"activity_id=='{activity}' & table_id == 'Amon'"
            f" & variable_id == '{fld}' & experiment_id == '{experiment}'"
        )
        per_field.append(df.query(selection))
    df_all1.append(per_field)

Make some empty dataframes to store concise list

In [None]:
# Same [experiment][field] layout as df_all1, but every entry starts as an
# empty frame with the catalog's columns; rows are filled in per model later.
cnames = df_all1[0][0].columns
df_all = [[pd.DataFrame(columns=cnames) for _ in flds] for _ in expts]

Now get 1 ensemble member for each model, if it exists, for each experiment. Only add a model to the dataframes in `df_all` if it has a full set of experiments.

In [None]:
# Keep only the models in `src` that have at least one catalog entry for every
# (experiment, field) pair, and copy one catalog row per pair into df_all.
mdls = []
n = 0  # row label in df_all[i][k] for the next accepted model
for mdl in src:
    # df_all1 is already restricted to table_id == 'Amon' when it is built,
    # so only the model needs to be filtered here.
    model_filter = "source_id=='" + mdl + "'"

    # Number of catalog entries for this model, per experiment and field.
    nruns = [
        df_all1[i][k].query(model_filter).shape[0]
        for i in range(len(expts))
        for k in range(len(flds))
    ]

    # Is there at least 1 run per experiment, with all fields?
    if min(nruns) >= 1:
        for i in range(len(expts)):
            for k in range(len(flds)):
                tt = df_all1[i][k].query(model_filter)
                # Take the first matching catalog row.
                # NOTE(review): nothing forces the same member_id to be picked
                # for every experiment/field; confirm that is acceptable.
                df_all[i][k].loc[n] = tt.values[0]
        # Add model to final list.
        mdls.append(mdl)
        n = n + 1
    else:
        print(mdl+' does not have all required simulations')
mdls
    

## Load Data

Load Google file system


In [None]:
# Google Cloud Storage file system handle, opened anonymously (token='anon').
gcs = gcsfs.GCSFileSystem(token="anon")

Loop through zstore links, use zarr to open


In [None]:
# Loop sizes (models, fields, experiments) used to scale the progress bars.
nm, nf, ne = len(mdls), len(flds), len(expts)

In [None]:

# The cells from here on call `progress(...)`, but no visible cell defines it,
# so a fresh "Restart & Run All" would stop with a NameError. Provide the
# helper only if it is not already defined.
if 'progress' not in globals():
    def progress(value, total=100):
        """Return an HTML <progress> bar showing `value` out of `total`."""
        return HTML(
            "<progress value='{value}' max='{total}' style='width: 100%'>{value}</progress>"
            .format(value=value, total=total)
        )

if readdata:
    out = display(progress(0, 1), display_id=True)
    # dsall[i][j][k]: opened dataset for expts[i], flds[j] and the k-th row of
    # df_all[i][j] (one row per accepted model).
    dsall = []
    for i, df_ta in enumerate(df_all):
        dsm = []
        # Loop variable is deliberately not called `df`: that name holds the
        # full catalog and must not be overwritten by this cell.
        for j, catalog_rows in enumerate(df_ta):
            ds = []
            print(expts[i]+','+flds[j])
            for index, item in enumerate(catalog_rows.zstore.values):
                mapper = gcs.get_mapper(item)
                # decode_times=False keeps the raw numeric time axis; it is
                # converted to dates in the processing cell.
                ds.append(xr.open_zarr(mapper, decode_times=False))
                out.update(progress(index+j*nm+i*nm*nf, ne*nm*nf))
            dsm.append(ds)
        dsall.append(dsm)

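Concatenated DataArrays for each field (here `tas`): regrid every model onto the emulator grid, take annual means, and stack the models along a new `ens` dimension. Global means are computed further below.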


In [None]:
 
if readdata:
    # dall[i][j]: annual means for expts[i], flds[j], interpolated onto
    # (lat_out, lon_out), with the models stacked along an 'ens' dimension.
    dall = []
    out = display(progress(0, 1), display_id=True)
    for i, ds in enumerate(dsall):
        dexp = []
        for j, dm in enumerate(ds):
            print(expts[i]+','+flds[j])
            for index, dd in enumerate(dm):
                # Harmonise coordinate names so every model exposes lon/lat.
                if 'longitude' in dd.keys():
                    dd = dd.rename({'longitude': 'lon', 'latitude': 'lat'})
                if 'latitude' in dd.coords:
                    # drop_vars replaces the deprecated Dataset.drop.
                    dd = dd.drop_vars('latitude')
                    dd = dd.drop_dims('latitude')

                # Keep at most the first 4800 time steps (400 years of monthly
                # data) and interpolate onto the target grid, extrapolating at
                # the edges.
                tmp = dd[flds[j]][:4800, :, :].interp(
                    lon=lon_out, lat=lat_out, kwargs={"fill_value": "extrapolate"}
                )

                if calstrt[i]:
                    # Replace the model's time axis with a common monthly
                    # calendar starting in January 1850.
                    # NOTE(review): pandas >= 2.2 deprecates the 'M' alias in
                    # favour of 'ME'; update once the pandas version is pinned.
                    tmp.coords['time'] = pd.date_range(
                        '1850-01-01', periods=tmp['time'].values.shape[0], freq='M'
                    )
                if tmp['time'].dtype == 'float64' or tmp['time'].dtype == 'int64':
                    # Time is still numeric (decode_times=False): decode it
                    # with the axis' own units AND calendar. Without the
                    # calendar, non-standard calendars (e.g. 'noleap') are
                    # decoded as 'standard' and the years come out shifted.
                    time_attrs = tmp['time'].attrs
                    tmp.coords['time'] = cftime.num2date(
                        tmp['time'].values,
                        time_attrs['units'],
                        calendar=time_attrs.get('calendar', 'standard'),
                    )

                # Annual means.
                srm = tmp.groupby('time.year').mean('time')
                if index == 0:
                    dac = srm
                else:
                    dac = xr.concat([dac, srm], 'ens', coords='minimal', compat='override')
                out.update(progress(index+j*nm+i*nm*nf, ne*nm*nf))
            dexp.append(dac)
        dall.append(dexp)

Merge the fields of each experiment and save them to NetCDF




In [None]:

if readdata:
    out = display(progress(0, 1), display_id=True)
    for i, d in enumerate(dall):
        # Combine all fields of this experiment into one Dataset.
        tmp = xr.merge(d)
        # Write directly under `datadir`. `datadir` already ends in
        # 'colab_4xco2', and the global-mean cell reads the files back from
        # datadir + '/' + <experiment> + '.nc', so no extra sub-directory
        # must be inserted here.
        tmp.to_netcdf(datadir+'/'+expts[i]+'.nc')
        out.update(progress(i, ne))



In [None]:

  
if readdata:
    out = display(progress(0, 1), display_id=True)
    # Iterate over the experiment names: everything needed is read back from
    # disk, so this cell does not depend on the in-memory `dall`.
    for i, expt in enumerate(expts):
        # The context manager closes the file once the means are written.
        with xr.open_dataset(datadir+'/'+expt+'.nc') as tmp:
            # cos(latitude) weights, taken from the dataset's own `lat`
            # coordinate so that no particular field name is assumed.
            weights = np.cos(np.deg2rad(tmp['lat']))
            weights.name = "weights"
            # Latitude-weighted mean, then a plain mean over longitude.
            tmp_gm = tmp.weighted(weights).mean(dim='lat').mean(dim='lon')
            tmp_gm.to_netcdf(datadir+'/'+expt+'_gm.nc')
        out.update(progress(i, ne))
