# Getting ZECMIP File Paths

- Getting all the file paths for ZECMIP takes a while to load with glob. Saving all these file paths to a JSON file to improve speed.

In [62]:
from glob import glob
import pandas as pd
import sys, os
import numpy as np
sys.path.append(os.path.join(os.getcwd(), 'Documents/PhD'))
import constants
import json
from typing import List, Dict

In [5]:
# Show the two root directories that the glob searches below start from.
constants.DECK_DIR, constants.ZECMIP_DIR

('/g/data/oi10/replicas/CMIP6/CMIP', '/g/data/oi10/replicas/CMIP6/C4MIP')

In [70]:
%%time
# Recursively collect the .nc files whose names contain 'Amon' followed by the experiment name.
# Example file name: tas_Amon_IPSL-CM5A2-INCA_piControl_r1i1p1f1_gr_185001-209912.nc
# NOTE(review): '*piControl*' has a wildcard on both sides, so it also matches any other
# experiment whose name merely contains 'piControl' -- confirm that is intended.
fpaths_picontrol = glob(f'{constants.DECK_DIR}/**/*Amon*piControl*.nc', recursive=True)
fpaths_1000PgC = glob(f'{constants.ZECMIP_DIR}/**/*Amon*brch-1000PgC*.nc', recursive=True)

CPU times: user 25.2 s, sys: 2min, total: 2min 25s
Wall time: 6min 46s


In [71]:
# Same recursive search for the 1pctCO2-rad and 1pctCO2-bgc experiments under ZECMIP_DIR.
# Example 1pctCO2 directory: /g/data/oi10/replicas/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2
# Example file name: tas_Amon_CanESM5_1pctCO2-rad_r1i1p1f1_gn_185001-200012
fpaths_1pct_rad = glob(f'{constants.ZECMIP_DIR}/**/*Amon*1pctCO2-rad*.nc', recursive=True)
fpaths_1pct_bgc = glob(f'{constants.ZECMIP_DIR}/**/*Amon*1pctCO2-bgc*.nc', recursive=True)

In [77]:
def sort_into_variable_groups(paths_list: List[str], variable_index: int = -4) -> Dict[str, List[str]]:
    """
    Group file paths by one of their '/'-separated components.

    Each path is split on '/' and the component at position ``variable_index``
    becomes the grouping key. Groups appear in the order their key is first
    seen, and paths keep their input order within a group.

    Args:
        paths_list (List[str]): File paths to group.
        variable_index (int, optional): Position of the grouping component in the
            split path. Defaults to -4, i.e. the fourth component from the end.

    Returns:
        Dict[str, List[str]]: Mapping from each extracted component to the list of
        paths that contain it at ``variable_index``.

    Raises:
        IndexError: If a path has too few components for ``variable_index``.
    """
    grouped: Dict[str, List[str]] = {}
    for fpath in paths_list:
        components = fpath.split('/')
        group_key = components[variable_index]
        # setdefault creates the empty list the first time a key is seen.
        grouped.setdefault(group_key, []).append(fpath)
    return grouped

In [78]:
# Build {experiment label: {path component: [file paths]}} from the four glob results.
raw_path_dict = {
    experiment: sort_into_variable_groups(experiment_paths)
    for experiment, experiment_paths in (
        ('picontrol', fpaths_picontrol),
        ('1000pgc', fpaths_1000PgC),
        ('1pct_rad', fpaths_1pct_rad),
        ('1pct_bgc', fpaths_1pct_bgc),
    )
}

In [79]:
# List the top-level (experiment) keys of raw_path_dict.
raw_path_dict.keys()

dict_keys(['picontrol', '1000pgc', '1pct_rad', '1pct_bgc'])

In [80]:
# For each experiment, print its label, then the group keys found for it, then a separator line.
for key, values in raw_path_dict.items():
    print(key, values.keys(), sep='\n')
    print('-----')

picontrol
dict_keys(['rsut', 'hus', 'pr', 'clwvi', 'tas', 'rlus', 'prc', 'rsdscs', 'ps', 'wap', 'prsn', 'cct', 'tauu', 'zg', 'prw', 'va', 'rsutcs', 'rsds', 'sfcWind', 'hfls', 'rsdt', 'hurs', 'psl', 'ts', 'vas', 'huss', 'rldscs', 'tauv', 'uas', 'hur', 'clivi', 'rlutcs', 'tasmin', 'rlds', 'rsuscs', 'hfss', 'rlut', 'ta', 'ua', 'evspsbl', 'clt', 'tasmax', 'rsus', 'cli', 'cl', 'fco2nat', 'fco2antt', 'fco2fos', 'tos'])
-----
1000pgc
dict_keys(['co2', 'pr', 'tas'])
-----
1pct_rad
dict_keys(['rsut', 'tas', 'rsdt', 'ts', 'rlut', 'fco2nat', 'fco2antt'])
-----
1pct_bgc
dict_keys(['rsut', 'tas', 'rsdt', 'ts', 'rlut', 'fco2nat', 'fco2antt'])
-----


In [81]:
# Save the grouped paths as JSON under <cwd>/Documents/PhD/data/raw_paths.json.
with open(
    os.path.join(os.getcwd(), 'Documents/PhD/data', 'raw_paths.json'),
    'w',
) as fp:
    fp.write(json.dumps(raw_path_dict))

In [None]:
# fpaths_ds = pd.DataFrame({'picontrol': fpaths_picontrol})
# fpaths_ds['1000pgc']= pd.Series(fpaths_1000PgC)
# fpaths_ds['1pct_rad']= pd.Series(fpaths_1pct_rad)
# fpaths_ds['1pct_bgc']= pd.Series(fpaths_1pct_bgc)

# fpaths_ds.head()

# fpaths_ds.to_csv(os.path.join(os.getcwd(),  'Documents/PhD', 'data/fpaths_picontrol.csv'))