# Getting ZECMIP File Paths

- Getting all the file paths for ZECMIP takes a while to load with glob. Saving all of these paths to a JSON file to improve speed.

In [None]:
from glob import glob
import pandas as pd
import sys, os
import numpy as np

from pprint import pprint
from typing import List, Dict
import copy
from collections import defaultdict
from pathlib import Path

In [None]:
# Root directory that is globbed for the ZECMIP branch experiment
# (brch-1000PgC) files.
ZECMIP_DIR = '/g/data/oi10/replicas/CMIP6/C4MIP'
# Root directory that is globbed for the piControl and 1pctCO2 files.
DECK_DIR = '/g/data/oi10/replicas/CMIP6/CMIP'

In [None]:
def sort_into_ensemble(path_list: List[str]) -> Dict[str, Dict[str, str]]:
    """
    Group a list of paths by model and then by ensemble member.

    Each path is split on '/' and the model and ensemble member are read from
    fixed positions (components 8 and 10 respectively), e.g.
    '/g/data/oi10/replicas/CMIP6/C4MIP/<institution>/<model>/<experiment>/<member>/...'.

    Args:
        path_list (List[str]): Paths laid out as above. Any iterable of
            strings is accepted.

    Returns:
        Dict[str, Dict[str, str]]: A dictionary where each model maps to a
        dictionary containing ensemble members as keys and their corresponding
        paths as values. If several paths share the same model and ensemble
        member, the last one in ``path_list`` is kept.

    Raises:
        IndexError: If a path has fewer than 11 '/'-separated components.
    """
    # Create a defaultdict with nested dictionaries as the default factory
    stor_obj = defaultdict(dict)

    for path in path_list:
        parts = path.split('/')
        model, ensemble_member = parts[8], parts[10]

        # A path is only ever stored under its *own* (model, member) key.
        # Members are never inferred from other entries of the list, so a
        # path cannot end up attached to a different member, and models whose
        # names are substrings of one another (e.g. 'GISS-E2-1-G' and
        # 'GISS-E2-1-G-CC') cannot be mixed up.
        stor_obj[model][ensemble_member] = path

    # Convert the defaultdict to a regular dictionary and return the result
    return dict(stor_obj)


In [None]:
def create_ensemble_summary(base_path_sorted_ensemble):
    """
    Re-key experiment-first ensemble data so that it is model-first.

    The input is keyed by experiment, then model, then ensemble member:
    {
        'experiment_1': {
            'model_1': {'ensemble_member_1': 'path', 'ensemble_member_2': 'path', ...},
            'model_2': {...},
            ...
        },
        '1000pgc': {...},
        ...
    }

    The output is keyed by model, then experiment, then ensemble member:
    {
        'model_1': {
            'experiment_1': {'ensemble_member_1': 'path', ...},
            '1000pgc': {'ensemble_member_1': 'path', ...},
        },
        ...
    }

    Only models present under the '1000pgc' experiment appear in the output.
    For each such model, an experiment is included only if that experiment
    has an entry for the model. The '1000pgc' entry is always added last.

    Args:
        base_path_sorted_ensemble (dict): Experiment -> model -> ensemble
            member -> path mapping. Must contain the key '1000pgc'.

    Returns:
        dict: Model -> experiment -> ensemble member -> path mapping.

    Raises:
        KeyError: If '1000pgc' is not a key of ``base_path_sorted_ensemble``.
    """
    reference_experiment = '1000pgc'
    summary = {}

    for model, reference_members in base_path_sorted_ensemble[reference_experiment].items():
        per_experiment = {}

        # Collect every other experiment that has data for this model
        for experiment, models in base_path_sorted_ensemble.items():
            if experiment == reference_experiment or model not in models:
                continue
            per_experiment[experiment] = models[model]

        # The reference experiment goes in last
        per_experiment[reference_experiment] = reference_members
        summary[model] = per_experiment

    return summary


In [None]:
# variables = ['tas', 'pr']
# Variable whose paths are collected in this run; it is interpolated into the
# glob patterns, the manually added paths and the output file name below.
variable = 'pr'

In [None]:
%%time
# Sample file name: tas_Amon_IPSL-CM5A2-INCA_piControl_r1i1p1f1_gr_185001-209912.nc
# Recursively collect every .nc file under ZECMIP_DIR whose name matches
# `variable`, 'Amon' and 'brch-1000PgC'.
fpaths_1000PgC = glob(ZECMIP_DIR + f'/**/*{variable}_*Amon*brch-1000PgC*.nc', recursive=True)

In [None]:
# Needs _piControl or else will also get esm-piControl
# (the leading underscore stops 'esm-piControl' file names from matching).
fpaths_picontrol = glob(DECK_DIR + f'/**/*{variable}_*Amon*_piControl*.nc', recursive=True)

In [None]:
# Note: some variables (e.g. precip) do not have an esm-piControl
# Searches the same root as the other DECK globs, so use the shared constant.
fpaths_esm_picontrol = glob(DECK_DIR + f'/**/*{variable}_*Amon*esm-piControl*.nc', recursive=True)

In [None]:
fpaths_esm_picontrol[:3]

In [None]:
# Recursively collect every .nc file under DECK_DIR whose name matches
# `variable`, 'Amon' and '1pctCO2'.
fpaths_1pct = glob(DECK_DIR + f'/**/*{variable}_*Amon*1pctCO2*.nc', recursive=True)

In [None]:
# The ACCESS model is not stored with the rest, add the access model in 
# NOTE: these entries are directories (with a trailing '/'), not .nc files.
# os.path.dirname is applied to every entry when base_paths is built, and for
# a path ending in '/' it simply drops the trailing slash, so each entry
# resolves to the version directory itself.
fpaths_1pct.append(
    f'/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/{variable}/gn/v20191115/'
)

fpaths_picontrol.append(
   f'/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/{variable}/gn/v20210316/'
)

fpaths_1000PgC.append(
    f'/g/data/fs38/publications/CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/{variable}/gn/v20191206/',
)

fpaths_esm_picontrol.append(
    f'/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/esm-piControl/r1i1p1f1/Amon/{variable}/gn/v20191115/'
)

In [None]:
fpaths_1pct[:3]

In [None]:
# Reduce every file path to its parent directory and keep one sorted,
# de-duplicated array of directories per experiment.
base_paths = {
    experiment_key: np.sort(np.unique([os.path.dirname(file_path) for file_path in file_paths]))
    for experiment_key, file_paths in (
        ('picontrol', fpaths_picontrol),
        ('esm-piControl', fpaths_esm_picontrol),
        ('1000pgc', fpaths_1000PgC),
        ('1pct', fpaths_1pct),
    )
}

In [None]:
# Sorts each experiment's list of directories into an ensemble dictionary based on
# the model and its associated ensemble members.
# Result layout: {experiment: {model: {ensemble_member: directory}}}
base_path_sorted_ensemble = {key: sort_into_ensemble(base_path_list) for key, base_path_list in base_paths.items()}

In [None]:
pprint(base_path_sorted_ensemble['1000pgc'])

In [None]:
# Creates an ensemble summary dictionary based on the given sorted ensemble data.
# Result layout: {model: {experiment: {ensemble_member: directory}}}; only
# models that have a '1000pgc' entry are kept.
base_path_sorted_ensemble_model = create_ensemble_summary(base_path_sorted_ensemble)

In [None]:
pprint(base_path_sorted_ensemble_model['CanESM5'])

In [None]:
pprint(base_path_sorted_ensemble_model['GISS-E2-1-G-CC'])

In [None]:
# Set the '1pct' entry for GISS-E2-1-G-CC by hand, pointing it at the
# GISS-E2-1-G 1pctCO2 directory.
base_path_sorted_ensemble_model['GISS-E2-1-G-CC']['1pct'] = {
    'r1i1p1f1': f'/g/data/oi10/replicas/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/1pctCO2/r1i1p1f1/Amon/{variable}/gn/v20180905'
}

In [None]:
pprint(base_path_sorted_ensemble_model['GISS-E2-1-G-CC'])

In [None]:
##### Manual Updates

# These overrides are applied for 'tas' only and replace the 'esm-piControl'
# entry of the two models with locally stored directories.
if variable == 'tas':
    # TODO: Why have I got GISS stored locally for esm-piControl AND why do I need to add 1pct run?
    base_path_sorted_ensemble_model['GISS-E2-1-G-CC']['esm-piControl'] = {
        'r1i1p1f1': '/g/data/w40/ab2313/PhD/zecmip/original/esm_picontrol/GISS'
    }

    # UKESM run has longer esm-piControl provided by Chris Jones.
    base_path_sorted_ensemble_model['UKESM1-0-LL']['esm-piControl'] = {
        'r1i1p1f1': '/g/data/w40/ab2313/PhD/zecmip/original/esm_picontrol/UKESM1-0-LL'
    }

In [None]:
base_path_sorted_ensemble_model['UKESM1-0-LL']

In [None]:
pprint(base_path_sorted_ensemble_model)

In [None]:
base_path_sorted_ensemble_model['CESM2']

In [None]:
base_path_sorted_ensemble_model['UKESM1-0-LL']

In [None]:
base_path_sorted_ensemble_model['ACCESS-ESM1-5']

In [None]:
len(base_path_sorted_ensemble_model)

In [None]:
base_path_sorted_ensemble_model.keys()

In [None]:
# `json` is imported here because nothing earlier in the notebook imports it;
# without this the dump below fails with a NameError.
import json

# Write the model -> experiment -> ensemble member -> path mapping to a JSON
# file, one file per variable. The location is relative to the current
# working directory.
with open(Path.cwd() / 'Documents' / 'GRL_ZECMIP_natural_variability_and_RCB'  /f'zecmip_experiment_paths_ensemble_sorted_{variable}.json', 'w') as fp:
    json.dump(base_path_sorted_ensemble_model, fp)