## Simulations availability
This notebook follows the data catalog generated by the [jsc-cordex-catalog repository](https://github.com/euro-cordex/jsc-cordex-catalog/tree/main).

In [95]:
from pathlib import Path

import intake
import pandas as pd

# Open the ESM datastore described by the JSON file on the main branch of the
# jsc-cordex-catalog repository, then list the dataset keys it exposes.
CATALOG_URL = "https://raw.githubusercontent.com/euro-cordex/jsc-cordex-catalog/refs/heads/main/CORDEX-CMIP6-JSC.json"
cat = intake.open_esm_datastore(CATALOG_URL)
cat.keys()


['CORDEX.EUR-11.CLMcom.ERAINT.evaluation.r0i0p0.CCLM4-8-17.v1.fx.v20140515',
 'CORDEX.EUR-11.CLMcom.ERAINT.evaluation.r1i1p1.CCLM4-8-17.v1.mon.v20140515',
 'CORDEX.EUR-11.CLMcom-ETH.ERAINT.evaluation.r0i0p0.COSMO-crCLIM-v1-1.v1.fx.v20191210',
 'CORDEX.EUR-11.CLMcom-ETH.ERAINT.evaluation.r1i1p1.COSMO-crCLIM-v1-1.v1.mon.v20191210',
 'CORDEX.EUR-11.CNRM.ERAINT.evaluation.r1i1p1.ALADIN53.v1.fx.v20150127',
 'CORDEX.EUR-11.CNRM.ERAINT.evaluation.r1i1p1.ALADIN53.v1.mon.v20150127',
 'CORDEX.EUR-11.CNRM.ERAINT.evaluation.r1i1p1.ALADIN63.v1.fx.v20191118',
 'CORDEX.EUR-11.CNRM.ERAINT.evaluation.r1i1p1.ALADIN63.v1.mon.v20191118',
 'CORDEX.EUR-11.DHMZ.ERAINT.evaluation.r1i1p1.RegCM4-2.v1.fx.v20160112',
 'CORDEX.EUR-11.DHMZ.ERAINT.evaluation.r1i1p1.RegCM4-2.v1.mon.v20150527',
 'CORDEX.EUR-11.DMI.ERAINT.evaluation.r1i1p1.HIRHAM5.v1.day.v20131119',
 'CORDEX.EUR-11.DMI.ERAINT.evaluation.r1i1p1.HIRHAM5.v1.fx.v20131119',
 'CORDEX.EUR-11.GERICS.ERAINT.evaluation.r0i0p0.REMO2015.v1.fx.v20180813',
 'CORDEX.

Check available simulations in the CMIP6 CORDEX evaluation experiment (regardless of the variables):

In [96]:
# Narrow the full catalog down to CMIP6-era entries of the evaluation experiment
cat_cmip6 = cat.search(
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)

In [97]:
# Build a combined "<source_id>_<institution_id>" label for every catalog row.
# astype(str) on its own would turn a missing value into the literal text "nan"
# (giving labels such as "nan_<institution>"), which later .dropna()/.nunique()
# calls cannot recognise as missing. Rows lacking either component are therefore
# masked back to NaN.
_label_parts = cat_cmip6.df[['source_id', 'institution_id']]
cat_cmip6.df['source_institution'] = (
    _label_parts['source_id'].astype(str) + '_' + _label_parts['institution_id'].astype(str)
).where(_label_parts.notna().all(axis=1))

In [98]:
# Alphabetical list of the distinct, non-missing source/institution labels
sorted(set(cat_cmip6.df["source_institution"].dropna()))

['ALARO1-SFX_RMIB-UGent',
 'CCLM6-0-1-URB-ESG_CLMcom-KUL',
 'CCLM6-0-1-URB_CLMcom-CMCC',
 'CCLM6-0-1_CLMcom-Hereon',
 'CNRM-ALADIN64E1_CNRM-MF',
 'HCLIM43-ALADIN_HCLIMcom-SMHI',
 'ICON-CLM-202407-1-1_CLMcom-Hereon',
 'RACMO23E_KNMI',
 'REMO2020-2-2-MR2_GERICS',
 'REMO2020-2-2-TEB_GERICS',
 'REMO2020-2-2-iMOVE-LUC_GERICS',
 'REMO2020-2-2-iMOVE_GERICS',
 'REMO2020-2-2_GERICS',
 'ROAM-NBS_DWD-BSH',
 'RegCM5-0_ICTP',
 'WRF451Q_AUTH',
 'WRF451Q_CESAM-UA',
 'WRF451Q_IDL-FCUL']

In [99]:
# Number of catalog rows per source/institution label, most frequent first
cat_cmip6.df["source_institution"].value_counts()

source_institution
CCLM6-0-1_CLMcom-Hereon              5149
ICON-CLM-202407-1-1_CLMcom-Hereon    3582
CNRM-ALADIN64E1_CNRM-MF              3072
RegCM5-0_ICTP                        2393
ROAM-NBS_DWD-BSH                     2125
WRF451Q_IDL-FCUL                     1921
CCLM6-0-1-URB_CLMcom-CMCC            1630
WRF451Q_AUTH                          982
REMO2020-2-2-iMOVE_GERICS             759
REMO2020-2-2-iMOVE-LUC_GERICS         759
REMO2020-2-2-TEB_GERICS               701
REMO2020-2-2_GERICS                   697
WRF451Q_CESAM-UA                      475
ALARO1-SFX_RMIB-UGent                 473
HCLIM43-ALADIN_HCLIMcom-SMHI          397
REMO2020-2-2-MR2_GERICS               373
RACMO23E_KNMI                         340
CCLM6-0-1-URB-ESG_CLMcom-KUL          210
Name: count, dtype: int64

In [100]:
# Report how many distinct source/institution labels the CMIP6 evaluation subset contains
n_rcms = cat_cmip6.df["source_institution"].nunique()
print(f"{n_rcms} number of different RCMs in CMIP6 evaluation experiment")

18 number of different RCMs in CMIP6 evaluation experiment


### Filter models with all the requested variables:

In [None]:
# Query the catalog for every required variable in a single search.
# Alternative selection kept for reference (annotated "14 available"):
#   required_vars = ["tas", "hurs", "sfcWind", "rsds"]
#   required_freq = ["day"]
required_vars = ["tas", "hurs"]
required_freq = ["1hr"]  # 13 available
cat_all = cat.search(
    variable_id=required_vars,
    frequency=required_freq,
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)

# Inspect the matching catalog entries
cat_all.df

Unnamed: 0,project_id,mip_era,activity_id,domain_id,institution_id,driving_source_id,driving_experiment_id,driving_variant_label,source_id,version_realization,frequency,version,time_range,variable_id,path
0,CORDEX-CMIP6,CMIP6,DD,EUR-12,KNMI,ERA5,evaluation,r1i1p1f1,RACMO23E,v1-r1,1hr,v20241216,199201010000-199212312300,tas,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
1,CORDEX-CMIP6,CMIP6,DD,EUR-12,KNMI,ERA5,evaluation,r1i1p1f1,RACMO23E,v1-r1,1hr,v20241216,199401010000-199412312300,tas,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
2,CORDEX-CMIP6,CMIP6,DD,EUR-12,KNMI,ERA5,evaluation,r1i1p1f1,RACMO23E,v1-r1,1hr,v20241216,199301010000-199312312300,tas,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
3,CORDEX-CMIP6,CMIP6,DD,EUR-12,KNMI,ERA5,evaluation,r1i1p1f1,RACMO23E,v1-r1,1hr,v20241216,198101010000-198112312300,tas,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
4,CORDEX-CMIP6,CMIP6,DD,EUR-12,KNMI,ERA5,evaluation,r1i1p1f1,RACMO23E,v1-r1,1hr,v20241216,199101010000-199112312300,tas,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334,CORDEX-CMIP6,CMIP6,DD,EUR-12,RMIB-UGent,ERA5,evaluation,r1i1p1f1,ALARO1-SFX,v1-r1,1hr,v20241009,2013010100-2013123123,hurs,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
1335,CORDEX-CMIP6,CMIP6,DD,EUR-12,RMIB-UGent,ERA5,evaluation,r1i1p1f1,ALARO1-SFX,v1-r1,1hr,v20241009,1983010100-1983123123,hurs,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
1336,CORDEX-CMIP6,CMIP6,DD,EUR-12,RMIB-UGent,ERA5,evaluation,r1i1p1f1,ALARO1-SFX,v1-r1,1hr,v20241009,1992010100-1992123123,hurs,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...
1337,CORDEX-CMIP6,CMIP6,DD,EUR-12,RMIB-UGent,ERA5,evaluation,r1i1p1f1,ALARO1-SFX,v1-r1,1hr,v20241009,1982010100-1982123123,hurs,/mnt/CORDEX_CMIP6_tmp/sim_data/CORDEX-CMIP6/DD...


In [102]:
# Work out which models provide the complete set of required variables.
# These columns together identify one "source"; adapt them if the catalog needs a different key.
group_cols = ['source_id', 'driving_source_id', 'institution_id']

# For each source, count the distinct required variables present in the search result
wanted = set(required_vars)
per_source_vars = cat_all.df.groupby(group_cols)['variable_id']
var_counts = per_source_vars.apply(
    lambda ids: len(wanted.intersection(ids))
).reset_index(name='var_count')

# Retain the sources for which no required variable is missing
has_every_var = var_counts['var_count'] == len(required_vars)
sources_with_all = var_counts[has_every_var]
print(f"Sources with all {len(required_vars)} required variables:")
sources_with_all

Sources with all 2 required variables:


Unnamed: 0,source_id,driving_source_id,institution_id,var_count
0,ALARO1-SFX,ERA5,RMIB-UGent,2
1,CCLM6-0-1,ERA5,CLMcom-Hereon,2
2,CCLM6-0-1-URB,ERA5,CLMcom-CMCC,2
3,CNRM-ALADIN64E1,ERA5,CNRM-MF,2
4,HCLIM43-ALADIN,ERA5,HCLIMcom-SMHI,2
5,ICON-CLM-202407-1-1,ERA5,CLMcom-Hereon,2
9,REMO2020-2-2-TEB,ERA5,GERICS,2
10,REMO2020-2-2-iMOVE,ERA5,GERICS,2
11,REMO2020-2-2-iMOVE-LUC,ERA5,GERICS,2
12,ROAM-NBS,ERA5,DWD-BSH,2


In [103]:
# Keep only the catalog rows that belong to sources providing every required variable.
# The inner merge runs unconditionally so that `cat_filtered_df` always exists (as an
# empty frame when no source qualifies) and the following cells cannot hit a NameError.
cat_filtered_df = cat_all.df.merge(sources_with_all[group_cols], on=group_cols, how='inner')

if len(sources_with_all) > 0:
    print(f"\nFiltered catalog has {len(cat_filtered_df)} entries from {len(sources_with_all)} sources")
else:
    print("No sources found with all required variables")

# A bare expression is rendered only when it is the last top-level statement of the
# cell, so the preview lives here rather than inside the `if` branch.
cat_filtered_df.head()


Filtered catalog has 1214 entries from 13 sources


### Create a CSV file with the simulation paths

In [104]:
# One row per distinct (source_id, institution_id) pair present in the filtered catalog
filter_cols = ['source_id', 'institution_id']
filtered_sources = cat_filtered_df.drop_duplicates(subset=filter_cols)[filter_cols]
filtered_sources


Unnamed: 0,source_id,institution_id
0,ICON-CLM-202407-1-1,CLMcom-Hereon
142,CCLM6-0-1,CLMcom-Hereon
266,CCLM6-0-1-URB,CLMcom-CMCC
350,HCLIM43-ALADIN,HCLIMcom-SMHI
438,ROAM-NBS,DWD-BSH
524,REMO2020-2-2-iMOVE-LUC,GERICS
608,REMO2020-2-2-TEB,GERICS
692,REMO2020-2-2-iMOVE,GERICS
776,WRF451Q,AUTH
818,WRF451Q,IDL-FCUL


In [105]:
# Keep only the rows of cat_all whose source/institution pair is listed in filtered_sources
cat_all_filtered = cat_all.df.merge(filtered_sources, on=filter_cols, how='inner')

# Write every catalog column for the retained sources to a CSV file.
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing.
csv_path = Path("../data_catalogs/cmip6_evaluation_1hr_heatstress_catalog.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
cat_all_filtered.to_csv(csv_path, index=False)


### Tests: Checking variables separately:

In [106]:
# Cross-check: datasets that provide hourly `tas` on its own.
# mip_era="CMIP6" mirrors the combined search used for `cat_all`, so this check
# covers the same simulations and cannot pick up the legacy CORDEX entries that
# the catalog also lists.
cat_tas = cat.search(
    variable_id=["tas"],
    frequency=["1hr"],
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)
cat_tas.keys()

['CORDEX-CMIP6.EUR-12.AUTH.ERA5.evaluation.r1i1p1f1.WRF451Q.v1-r3.1hr.v20250630',
 'CORDEX-CMIP6.EUR-12.CLMcom-CMCC.ERA5.evaluation.r1i1p1f1.CCLM6-0-1-URB.v1-r1.1hr.v20250201',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.CCLM6-0-1.v1-r1.1hr.v20230222',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.ICON-CLM-202407-1-1.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.CNRM-MF.ERA5.evaluation.r1i1p1f1.CNRM-ALADIN64E1.v1-r1.1hr.v20250505',
 'CORDEX-CMIP6.EUR-12.DWD-BSH.ERA5.evaluation.r1i1p1f1.ROAM-NBS.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2.v1-r1.1hr.v20241120',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-MR2.v1-r1.1hr.v20241120',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-TEB.v1-r1.1hr.v20251028',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE-LUC.v

In [107]:
# Cross-check: datasets that provide hourly `hurs` on its own.
# mip_era="CMIP6" mirrors the combined search used for `cat_all`, so this check
# covers the same simulations and cannot pick up the legacy CORDEX entries that
# the catalog also lists.
cat_hurs = cat.search(
    variable_id=["hurs"],
    frequency=["1hr"],
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)
cat_hurs.keys()

['CORDEX-CMIP6.EUR-12.AUTH.ERA5.evaluation.r1i1p1f1.WRF451Q.v1-r3.1hr.v20250630',
 'CORDEX-CMIP6.EUR-12.CLMcom-CMCC.ERA5.evaluation.r1i1p1f1.CCLM6-0-1-URB.v1-r1.1hr.v20250201',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.CCLM6-0-1.v1-r1.1hr.v20230222',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.ICON-CLM-202407-1-1.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.CNRM-MF.ERA5.evaluation.r1i1p1f1.CNRM-ALADIN64E1.v1-r1.1hr.v20250505',
 'CORDEX-CMIP6.EUR-12.DWD-BSH.ERA5.evaluation.r1i1p1f1.ROAM-NBS.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-TEB.v1-r1.1hr.v20251028',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE-LUC.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.HCLIMcom-SMHI.ERA5.evaluation.r1i1p1f1.HCLIM43-ALADIN.v1-r1.1hr.v20241205',
 'CORDEX-CMIP6.EUR-12.ICTP.ERA5.evaluation.r1i1p1f1.RegCM5-0.v1

In [108]:
# Cross-check: datasets that provide hourly `sfcWind` on its own.
# mip_era="CMIP6" mirrors the combined search used for `cat_all`, so this check
# covers the same simulations and cannot pick up the legacy CORDEX entries that
# the catalog also lists.
cat_sfcwind = cat.search(
    variable_id=["sfcWind"],
    frequency=["1hr"],
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)
cat_sfcwind.keys()

['CORDEX-CMIP6.EUR-12.AUTH.ERA5.evaluation.r1i1p1f1.WRF451Q.v1-r3.1hr.v20250630',
 'CORDEX-CMIP6.EUR-12.CLMcom-CMCC.ERA5.evaluation.r1i1p1f1.CCLM6-0-1-URB.v1-r1.1hr.v20250201',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.CCLM6-0-1.v1-r1.1hr.v20230222',
 'CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.ICON-CLM-202407-1-1.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.CNRM-MF.ERA5.evaluation.r1i1p1f1.CNRM-ALADIN64E1.v1-r1.1hr.v20250505',
 'CORDEX-CMIP6.EUR-12.DWD-BSH.ERA5.evaluation.r1i1p1f1.ROAM-NBS.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2.v1-r1.1hr.v20241120',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-MR2.v1-r1.1hr.v20241120',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE-LUC.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.HCLIMcom-SMHI.ERA5.evaluation.r1i1p1f1.HCLIM43-ALA

In [109]:
# Cross-check: datasets that provide hourly `rsds` on its own.
# mip_era="CMIP6" mirrors the combined search used for `cat_all`, so this check
# covers the same simulations and cannot pick up the legacy CORDEX entries that
# the catalog also lists.
cat_rsds = cat.search(
    variable_id=["rsds"],
    frequency=["1hr"],
    driving_experiment_id=["evaluation"],
    mip_era="CMIP6",
)
cat_rsds.keys()

['CORDEX-CMIP6.EUR-12.CLMcom-Hereon.ERA5.evaluation.r1i1p1f1.ICON-CLM-202407-1-1.v1-r1.1hr.v20240920',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-TEB.v1-r1.1hr.v20251028',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE.v1-r1.1hr.v20250515',
 'CORDEX-CMIP6.EUR-12.GERICS.ERA5.evaluation.r1i1p1f1.REMO2020-2-2-iMOVE-LUC.v1-r1.1hr.v20250515']