# Getting ZECMIP File Paths

- Getting all the file paths for ZECMIP takes a while to load with glob. The resolved paths are saved to a JSON file so that later notebooks can read them quickly instead of searching again.

In [1]:
from glob import glob
import pandas as pd
import sys, os
import numpy as np
sys.path.append(os.path.join(os.getcwd(), 'Documents/PhD'))
import constants
import json
from pprint import pprint
from typing import List, Dict

In [2]:
# Display the two root directories that the glob searches below start from.
constants.DECK_DIR, constants.ZECMIP_DIR

('/g/data/oi10/replicas/CMIP6/CMIP', '/g/data/oi10/replicas/CMIP6/C4MIP')

In [3]:
%%time
# Recursively collect every tas/Amon NetCDF file of the brch-1000PgC experiment
# found below the ZECMIP root directory.
# Sample file name: tas_Amon_IPSL-CM5A2-INCA_piControl_r1i1p1f1_gr_185001-209912.nc
pattern_1000PgC = constants.ZECMIP_DIR + '/**/*tas_*Amon*brch-1000PgC*.nc'
fpaths_1000PgC = glob(pattern_1000PgC, recursive=True)

CPU times: user 517 ms, sys: 2.93 s, total: 3.45 s
Wall time: 10.8 s


In [4]:
# Peek at the first few matches to sanity-check the glob pattern.
fpaths_1000PgC[:3]

['/g/data/oi10/replicas/CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pct-brch-1000PgC_r1i1p1f1_gn_207501-209412.nc',
 '/g/data/oi10/replicas/CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pct-brch-1000PgC_r1i1p1f1_gn_205501-207412.nc',
 '/g/data/oi10/replicas/CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pct-brch-1000PgC_r1i1p1f1_gn_199501-201412.nc']

In [5]:
# Recursively collect the tas/Amon piControl files below the DECK root.
# The leading underscore in '_piControl' is required: without it the pattern
# would also match the 'esm-piControl' experiment.
pattern_picontrol = constants.DECK_DIR + '/**/*tas_*Amon*_piControl*.nc'
fpaths_picontrol = glob(pattern_picontrol, recursive=True)

In [6]:
# Recursively collect the tas/Amon esm-piControl files below the DECK root.
# Uses constants.DECK_DIR (same value as the previously hard-coded path) so this
# search stays consistent with the piControl and 1pctCO2 searches.
fpaths_esm_picontrol = glob(constants.DECK_DIR + '/**/*tas_*Amon*esm-piControl*.nc', recursive=True)

In [7]:
# Peek at the first few piControl matches to sanity-check the glob pattern.
fpaths_picontrol[:3]

['/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/piControl/r1i1p1f1/Amon/tas/gr1/v20190619/tas_Amon_INM-CM5-0_piControl_r1i1p1f1_gr1_314801-319612.nc',
 '/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/piControl/r1i1p1f1/Amon/tas/gr1/v20190619/tas_Amon_INM-CM5-0_piControl_r1i1p1f1_gr1_246901-249512.nc',
 '/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/piControl/r1i1p1f1/Amon/tas/gr1/v20190619/tas_Amon_INM-CM5-0_piControl_r1i1p1f1_gr1_236901-246812.nc']

In [8]:
# Recursively collect the tas/Amon 1pctCO2 files below the DECK root.
pattern_1pct = constants.DECK_DIR + '/**/*tas_*Amon*1pctCO2*.nc'
fpaths_1pct = glob(pattern_1pct, recursive=True)

In [9]:
# Peek at the first few 1pctCO2 matches to sanity-check the glob pattern.
fpaths_1pct[:3]

['/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/1pctCO2/r1i1p1f1/Amon/tas/gr1/v20200226/tas_Amon_INM-CM5-0_1pctCO2_r1i1p1f1_gr1_188401-196212.nc',
 '/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/1pctCO2/r1i1p1f1/Amon/tas/gr1/v20200226/tas_Amon_INM-CM5-0_1pctCO2_r1i1p1f1_gr1_196301-199912.nc',
 '/g/data/oi10/replicas/CMIP6/CMIP/INM/INM-CM5-0/1pctCO2/r1i1p1f1/Amon/tas/gr1/v20200226/tas_Amon_INM-CM5-0_1pctCO2_r1i1p1f1_gr1_185001-188312.nc']

In [10]:
# Reduce every file list to the distinct directories that contain the files.
# Keys are the experiment labels used throughout the rest of the notebook.
fpaths_by_experiment = {
    'picontrol': fpaths_picontrol,
    'esm-piControl': fpaths_esm_picontrol,
    '1000pgc': fpaths_1000PgC,
    '1pct': fpaths_1pct,
}

# np.unique already returns its result sorted, so no extra sort is needed.
base_paths = {
    exp_name: np.unique([os.path.dirname(fpath) for fpath in fpaths])
    for exp_name, fpaths in fpaths_by_experiment.items()
}

In [11]:
# Inspect the first unique 1000PgC run directories.
base_paths['1000pgc'][:10]

array(['/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r1i1p2f1/Amon/tas/gn/v20190429',
       '/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r2i1p2f1/Amon/tas/gn/v20190429',
       '/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r3i1p2f1/Amon/tas/gn/v20190429',
       '/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r4i1p2f1/Amon/tas/gn/v20190429',
       '/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r5i1p2f1/Amon/tas/gn/v20190429',
       '/g/data/oi10/replicas/CMIP6/C4MIP/MIROC/MIROC-ES2L/esm-1pct-brch-1000PgC/r1i1p1f2/Amon/tas/gn/v20200622',
       '/g/data/oi10/replicas/CMIP6/C4MIP/MOHC/UKESM1-0-LL/esm-1pct-brch-1000PgC/r1i1p1f2/Amon/tas/gn/v20200210',
       '/g/data/oi10/replicas/CMIP6/C4MIP/MOHC/UKESM1-0-LL/esm-1pct-brch-1000PgC/r2i1p1f2/Amon/tas/gn/v20200106',
       '/g/data/oi10/replicas/CMIP6/C4MIP/MOHC/UKESM1-0-LL/esm-1pct-brch-1000PgC/r3i1p1f2/Amon/tas/gn/v

In [12]:
# Experiment labels (keys of base_paths) to look up for each 1000PgC run.
types_to_match = ['picontrol', '1pct', 'esm-piControl']

In [13]:
# For every 1000PgC run directory, gather the directories of the other
# experiments that belong to the same run.
#
# Components of a run directory after splitting on '/':
#   [8] model name, [10] ensemble member, [11] frequency table (e.g. 'Amon').
#
# A candidate matches when its model component is identical and the ensemble
# member and frequency strings occur anywhere in the candidate path.
path_obj = {}
for run_dir in base_paths['1000pgc']:
    parts = run_dir.split('/')
    model = parts[8]
    ensemble_member = parts[10]
    freq = parts[11]

    matches = {'1000pgc': run_dir}
    for mtype in types_to_match:
        matched_dirs = []
        for candidate in base_paths[mtype]:
            if candidate.split('/')[8] != model:
                continue
            if ensemble_member in candidate and freq in candidate:
                matched_dirs.append(candidate)
        matches[mtype] = matched_dirs

    path_obj[f'{model}_{ensemble_member}_{freq}'] = matches

In [14]:
# A run without any matching piControl directory is of no use, so keep only
# the entries whose 'picontrol' list is non-empty.
path_obj_2 = {
    run_params: dobj
    for run_params, dobj in path_obj.items()
    if len(dobj['picontrol']) > 0
}

In [15]:
# Drop the 'day' variant of a run when an 'Amon' variant is also present:
# keep every key containing 'Amon', and keep any other key only if the same
# key with 'day' replaced by 'Amon' is not itself a key of path_obj_2.
path_obj_3 = {
    run_params: dobj
    for run_params, dobj in path_obj_2.items()
    if 'Amon' in run_params
    or run_params.replace('day', 'Amon') not in path_obj_2
}

In [16]:
# Pretty-print the mapping so the matched directories can be checked by eye.
pprint(path_obj_3)

{'CESM2_r1i1p1f1_Amon': {'1000pgc': '/g/data/oi10/replicas/CMIP6/C4MIP/NCAR/CESM2/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20191119',
                         '1pct': ['/g/data/oi10/replicas/CMIP6/CMIP/NCAR/CESM2/1pctCO2/r1i1p1f1/Amon/tas/gn/v20190425'],
                         'esm-piControl': ['/g/data/oi10/replicas/CMIP6/CMIP/NCAR/CESM2/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190723'],
                         'picontrol': ['/g/data/oi10/replicas/CMIP6/CMIP/NCAR/CESM2/piControl/r1i1p1f1/Amon/tas/gn/v20190320']},
 'CanESM5_r1i1p2f1_Amon': {'1000pgc': '/g/data/oi10/replicas/CMIP6/C4MIP/CCCma/CanESM5/esm-1pct-brch-1000PgC/r1i1p2f1/Amon/tas/gn/v20190429',
                           '1pct': ['/g/data/oi10/replicas/CMIP6/CMIP/CCCma/CanESM5/1pctCO2/r1i1p2f1/Amon/tas/gn/v20190429'],
                           'esm-piControl': [],
                           'picontrol': ['/g/data/oi10/replicas/CMIP6/CMIP/CCCma/CanESM5/piControl/r1i1p2f1/Amon/tas/gn/v20190429']},
 'GFDL-ESM4_r1i1p1f1_Amon':

In [17]:
# Manual override for the GISS 1pctCO2 directory.
# NOTE(review): the automatic matching most likely fails here because it
# requires identical model directory names, whereas this run is keyed as
# 'GISS-E2-1-G-CC' and the 1pctCO2 directory below sits under 'GISS-E2-1-G'.
# Confirm, then hard-coding the path is the intended workaround.
giss_run_key = 'GISS-E2-1-G-CC_r1i1p1f1_Amon'
giss_1pct_dir = '/g/data/oi10/replicas/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/1pctCO2/r1i1p1f1/Amon/tas/gn/v20180905'
path_obj_3[giss_run_key]['1pct'] = giss_1pct_dir

In [18]:
import copy

In [19]:
# Turn lists into a single str and remove old versions.
#
# Each experiment entry is either already a single path (str, from a manual
# override) or a list of candidate directories. A non-empty list is collapsed
# to one path; an empty list is left untouched so missing experiments remain
# visible.
#
# Fix: previously the `else` belonged to the `len(vals) == 0` check, so for
# lists with several candidates the newest path was selected and then
# immediately overwritten by vals[0].
path_obj_4 = {}
for run_params, dobj_base in path_obj_3.items():
    dobj = copy.deepcopy(dobj_base)  # Don't want to modify path_obj_3 in place
    for exp_type, vals in dobj.items():
        if isinstance(vals, list) and len(vals) > 0:
            # The lexicographically largest path is taken as the newest one
            # (version directories are named vYYYYMMDD); for a single
            # candidate this is simply that candidate. Stored as a plain str.
            dobj[exp_type] = str(max(vals))
    path_obj_4[run_params] = dobj

In [20]:
# ACCESS-ESM1-5 is stored in its own directory tree rather than in the replica
# tree searched above, so its four experiment directories are added by hand.
access_esm15_paths = {
    '1000pgc': '/g/data/fs38/publications/CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20191206',
    '1pct': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115',
    'picontrol': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316',
    'esm-piControl': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/esm-piControl/r1i1p1f1/Amon/tas/gn/v20191115',
}
path_obj_4['ACCESS-ESM1-5'] = access_esm15_paths

In [27]:
# Manual override: point the GISS esm-piControl entry at a hard-coded directory
# outside the CMIP6 replica tree.
# NOTE(review): presumably the automatic search found no esm-piControl match for
# this run — confirm, and confirm what this directory contains.
path_obj_4['GISS-E2-1-G-CC_r1i1p1f1_Amon']['esm-piControl'] = '/g/data/w40/ab2313/PhD/zecmip/original/esm_picontrol/GISS'

In [28]:
# Manual override: the automatic search returned no esm-piControl directory for
# this CanESM5 run (empty list in the printed mapping), so point the entry at a
# hard-coded directory outside the CMIP6 replica tree.
# NOTE(review): confirm what this directory contains.
path_obj_4['CanESM5_r1i1p2f1_Amon']['esm-piControl'] = '/g/data/w40/ab2313/PhD/zecmip/original/esm_picontrol/CanESM5'

In [29]:
# Pretty-print the final mapping (one path per experiment) before saving it.
pprint(path_obj_4)

{'ACCESS-ESM1-5': {'1000pgc': '/g/data/fs38/publications/CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20191206',
                   '1pct': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115',
                   'esm-piControl': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/esm-piControl/r1i1p1f1/Amon/tas/gn/v20191115',
                   'picontrol': '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316'},
 'CESM2_r1i1p1f1_Amon': {'1000pgc': '/g/data/oi10/replicas/CMIP6/C4MIP/NCAR/CESM2/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20191119',
                         '1pct': '/g/data/oi10/replicas/CMIP6/CMIP/NCAR/CESM2/1pctCO2/r1i1p1f1/Amon/tas/gn/v20190425',
                         'esm-piControl': '/g/data/oi10/replicas/CMIP6/CMIP/NCAR/CESM2/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190723',
                         'picontrol': '/g/data/oi10/replicas/CMIP

In [30]:
# Write the final run -> experiment -> directory mapping to a JSON file located
# under Documents/PhD/data relative to the current working directory.
experiment_paths_file = os.path.join(
    os.getcwd(), 'Documents', 'PhD', 'data', 'zecmip_experiment_paths.json')
with open(experiment_paths_file, 'w') as json_file:
    json.dump(path_obj_4, json_file)

In [23]:
# base_fpaths_1000PgC  = np.unique([os.path.join(*f.split('/')[:9]) for f in fpaths_1000PgC])
# base_fpaths_1000PgC 

In [24]:
# path_obj = {}
# for ubp in base_paths['1000pgc']:
#     ubp_split = ubp.split('/')
#     model = ubp_split[8]; ensemble_member = ubp_split[10]; freq = ubp_split[11]
#     run_params = f'{model}_{ensemble_member}_{freq}'
#     # print(run_params)
#     # print(ubp)
#     dobj = {}
#     dobj['1000pgc']= ubp
#     for mtype in types_to_match:
#         # print(f' -- {mtype=}')
#         path_list = []
#         for bmp in base_paths[mtype]:
#             bmp_split = bmp.split('/')
#             model_to_match = bmp_split[0]
#             if model == model_to_match and ensemble_member in bmp:
#                 path_list.append(bmp)
#         # path_list = [f for f in base_paths[mtype] if model in f and ensemble_member in f]
#         dobj[mtype] = path_list 
#     path_obj[run_params] = dobj        