In [None]:
import pandas as pd
from cmip6_downscaling.data import cat
import intake

In [None]:
def return_valid_gcms(col, min_matches=9):
    """Find the (GCM, member) pairs with both historical and scenario daily data.

    The catalog is searched twice for daily, native-grid (``grid_label='gn'``)
    ``tasmax``/``tasmin``/``pr`` entries: once for the CMIP ``historical``
    experiment and once for every ScenarioMIP experiment. For each GCM and
    member id present in both results, the historical and scenario rows are
    inner-joined on ``member_id`` and ``variable_id``; the pair is kept when
    the join has at least ``min_matches`` rows.

    Parameters
    ----------
    col : object
        Catalog exposing ``search(**query)``. The returned object must have a
        ``df`` attribute holding a DataFrame with the columns
        ``experiment_id``, ``source_id``, ``member_id`` and ``variable_id``.
    min_matches : int, optional
        Minimum number of joined (historical, scenario) rows required for a
        member to be accepted. Defaults to 9.

    Returns
    -------
    pandas.DataFrame
        Columns ``GCM`` and ``member_id``, one row per accepted pair, ordered
        by GCM and then by member id so repeated runs give identical output.
    """
    variable_ids = ['tasmax', 'tasmin', 'pr']
    columns = ['experiment_id', 'source_id', 'member_id', 'variable_id']

    historical = col.search(
        activity_id='CMIP',
        experiment_id='historical',
        table_id='day',
        grid_label='gn',
        variable_id=list(variable_ids),
    )
    ssps = col.search(
        activity_id='ScenarioMIP',
        table_id='day',
        grid_label='gn',
        variable_id=list(variable_ids),
    )

    # The column subsets do not depend on the GCM, so build them once.
    historical_all = historical.df[columns]
    ssps_all = ssps.df[columns]

    # GCMs present in both the historical and the scenario results. Sorted
    # because set iteration order varies between interpreter runs; key=str
    # keeps the sort from failing if an id is not a string.
    gcm_ids = sorted(
        set(ssps_all['source_id'].unique()) & set(historical_all['source_id'].unique()),
        key=str,
    )

    gcm_list = []
    member_id_list = []
    for gcm_id in gcm_ids:
        ssps_df = ssps_all[ssps_all['source_id'] == gcm_id]
        historical_df = historical_all[historical_all['source_id'] == gcm_id]

        # Member ids shared between historical and scenario runs of this GCM.
        shared_member_ids = sorted(
            set(ssps_df['member_id'].unique()) & set(historical_df['member_id'].unique()),
            key=str,
        )
        print(gcm_id)
        print(shared_member_ids)

        for mem_id in shared_member_ids:
            member_historical_df = historical_df[historical_df['member_id'] == mem_id]
            member_ssps_df = ssps_df[ssps_df['member_id'] == mem_id]
            # Each historical row is paired with every scenario row of the
            # same variable, so the length counts (historical, scenario)
            # matches for this member.
            merge_df = member_historical_df.merge(
                member_ssps_df,
                on=['member_id', 'variable_id'],
                how='inner',
            )
            # NOTE(review): a row count alone does not prove that all three
            # variables are present (one variable in many scenarios, or
            # duplicate catalog rows, can also reach the threshold) - confirm
            # this is the intended validity rule.
            if len(merge_df) >= min_matches:
                gcm_list.append(gcm_id)
                member_id_list.append(mem_id)

    return pd.DataFrame({'GCM': gcm_list, 'member_id': member_id_list})

In [None]:
# Open the two CMIP6 catalogs that are compared below: the one provided by
# cmip6_downscaling's `cat` (presumably Azure-hosted, per the `az_` prefix -
# confirm) and the Pangeo catalog published on Google Cloud Storage.
GCS_CATALOG_URL = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"

az_col = cat.cmip6()
gcs_col = intake.open_esm_datastore(GCS_CATALOG_URL)

In [None]:
# Collect the valid (GCM, member) pairs from each catalog, prefixing the column
# names with the catalog they came from so both sides stay distinguishable.
az_valid_df = return_valid_gcms(az_col).rename(
    columns={'GCM': 'az_GCM', 'member_id': 'az_member_id'}
)
gcs_valid_df = return_valid_gcms(gcs_col).rename(
    columns={'GCM': 'gcs_GCM', 'member_id': 'gcs_member_id'}
)

# Outer join: pairs found in only one catalog are kept, with missing values on
# the other catalog's columns.
az_gcp_compare = az_valid_df.merge(
    gcs_valid_df,
    how='outer',
    left_on=['az_GCM', 'az_member_id'],
    right_on=['gcs_GCM', 'gcs_member_id'],
)
az_gcp_compare.to_csv('az_vs_gcp_catalog.csv', index=False)

In [None]:
import planetary_computer
import xarray as xr
import fsspec
import pystac_client

# Open the Planetary Computer STAC API, fetch the "nasa-nex-gddp-cmip6"
# collection and read the values listed under its "cmip6:model" summary.
STAC_API_URL = "https://planetarycomputer.microsoft.com/api/stac/v1/"
NEX_GDDP_COLLECTION_ID = "nasa-nex-gddp-cmip6"

catalog = pystac_client.Client.open(STAC_API_URL)
collection = catalog.get_collection(NEX_GDDP_COLLECTION_ID)
colection_list = collection.summaries.get_list("cmip6:model")

In [None]:
# GCM names of the valid pairs found in the `az_` catalog, as a plain list
# (one entry per valid member, so a GCM can appear more than once).
az_list = az_valid_df.loc[:, 'az_GCM'].tolist()

In [None]:
# Names that appear both in az_list and in the STAC collection's model list.
pc_az_overlap = set(az_list).intersection(colection_list)