In [1]:
# Import the required modules
from pyesgf.search import SearchConnection
import os
import sys
import importlib
import pandas as pd
import requests
from tqdm import tqdm

# Set the ESGF_PYCLIENT_NO_FACETS_STAR_WARNING environment variable to "on".
# NOTE(review): judging by its name this silences esgf-pyclient's warning about
# searches run without an explicit facets list — confirm against the pyesgf docs.
os.environ['ESGF_PYCLIENT_NO_FACETS_STAR_WARNING'] = "on"

In [2]:
# Make the directory that holds the local helper modules importable.
# NOTE(review): this is a hard-coded absolute path to one user's home directory;
# the notebook will not run elsewhere without editing it.
sys.path.append('/home/users/benhutch/downloading-data/DAMIP/')

# Pull every public name of the helper module into the notebook namespace.
# NOTE(review): later cells call find_valid_nodes, create_results_list and
# extract_file_context without a prefix — presumably they come from here.
from testing_download_functions import *

In [3]:
# Import the dictionaries
import dictionaries as dicts

In [42]:
# Development convenience: pick up edits to the local modules without
# restarting the kernel. Both modules must already have been imported once,
# otherwise the sys.modules lookups below raise KeyError.

# Re-execute the helper module in place
importlib.reload(sys.modules['testing_download_functions'])

# Re-run the star import so the notebook-level names point at the reloaded functions
from testing_download_functions import *

# Re-execute the dictionaries module in place
importlib.reload(sys.modules['dictionaries'])

# Re-bind the `dicts` alias to the dictionaries module
import dictionaries as dicts

In [29]:
# Open the search connection using the URL stored in dicts.search_connection.
# NOTE(review): distrib=True presumably asks the index node to distribute the
# query to its federated peers — confirm against the esgf-pyclient docs.
connection = SearchConnection(dicts.search_connection, distrib=True)

In [30]:
# Set up the base search parameters shared by the queries in the cells below
# (those cells add 'variable_id' and, for per-model queries, 'source_id').
# To speed up the check, only search for the sub-experiment s1961
# (presumably the hindcasts initialised in 1961 — confirm).
params = {
    'activity_id': 'DCPP',
    'experiment_id': 'dcppA-hindcast',
    'latest': True,
    'sub_experiment_id': 's1961',
    'project': 'CMIP6',
    'table_id': 'Amon',
}

In [8]:
# Check, for every (variable, model) pair, whether a search built from the
# base `params` returns at least one result, and tabulate the answers.
variables = dicts.variables
models = dicts.models

# One record per (model, variable) pair. The dataframe is built once after the
# loops instead of being re-created with pd.concat on every iteration.
availability_records = []

# Loop over the variables
for variable in variables:
    print("Checking whether models exist for variable: " + variable)

    # Loop over the models
    for model in models:
        print("Checking whether model: " + model + " exists for variable: " + variable)

        # Build the query from a copy so the shared `params` dict is not
        # mutated and no 'variable_id' / 'source_id' leaks into later cells
        query_params = {**params, 'variable_id': variable, 'source_id': model}

        # Query the database
        ctx = connection.new_context(**query_params)

        try:
            # Get the results from the query and count them
            results = ctx.search()
            num_results = len(results)
        except Exception as err:
            # Report the real failure (e.g. a network error) rather than
            # hiding it; the pair is still recorded as unavailable below.
            # `except Exception` lets KeyboardInterrupt stop the loop.
            print("Query failed for model: " + model + ", variable: " + variable
                  + " (" + repr(err) + ")")
            num_results = 0

        does_model_exist = num_results > 0

        if does_model_exist:
            print("Model: " + model + " exists for variable: " + variable)
        else:
            print("Model: " + model + " does not exist for variable: " + variable)

        availability_records.append(
            {'model': model, 'variable': variable, 'does_model_exist': does_model_exist}
        )

# Dataframe with three columns: model, variable, does_model_exist.
# Passing `columns` keeps the expected layout even when there are no records.
results_df = pd.DataFrame(availability_records,
                          columns=['model', 'variable', 'does_model_exist'])

# Display the dataframe
results_df

Checking whether models exist for variable: tas
Checking whether model: BCC-CSM2-MR exists for variable: tas
Model: BCC-CSM2-MR exists for variable: tas
Checking whether model: MPI-ESM1-2-HR exists for variable: tas
Model: MPI-ESM1-2-HR exists for variable: tas
Checking whether model: CanESM5 exists for variable: tas
Model: CanESM5 exists for variable: tas
Checking whether model: CMCC-CM2-SR5 exists for variable: tas
Model: CMCC-CM2-SR5 exists for variable: tas
Checking whether model: HadGEM3-GC31-MM exists for variable: tas
Model: HadGEM3-GC31-MM exists for variable: tas
Checking whether model: EC-Earth3 exists for variable: tas
Model: EC-Earth3 exists for variable: tas
Checking whether model: MPI-ESM1-2-LR exists for variable: tas
Model: MPI-ESM1-2-LR exists for variable: tas
Checking whether model: FGOALS-f3-L exists for variable: tas
Model: FGOALS-f3-L exists for variable: tas
Checking whether model: MIROC6 exists for variable: tas
Model: MIROC6 exists for variable: tas
Checking wh

Unnamed: 0,model,variable,does_model_exist
0,BCC-CSM2-MR,tas,True
1,MPI-ESM1-2-HR,tas,True
2,CanESM5,tas,True
3,CMCC-CM2-SR5,tas,True
4,HadGEM3-GC31-MM,tas,True
5,EC-Earth3,tas,True
6,MPI-ESM1-2-LR,tas,True
7,FGOALS-f3-L,tas,True
8,MIROC6,tas,True
9,IPSL-CM6A-LR,tas,True


In [26]:
# Map each variable name to the valid nodes returned by find_valid_nodes.
# Judging by the output displayed further down this notebook, each value is a
# list of dicts with the keys 'source_id', 'data_node' and 'num_results'.
variable_nodes = {}

# Loop over the variable -> models-list mapping
for var, models_list in dicts.var_models.items():
    print("Finding the valid nodes for variable: " + var)
    print("Models list: " + str(models_list))

    # Point the shared search parameters at the current variable
    params['variable_id'] = var

    # Find the valid nodes for this variable and its list of models
    valid_nodes = find_valid_nodes(params=params,
                                   models_list=models_list,
                                   conn=connection)

    # Store the result directly; the former `{}` placeholder was overwritten
    # immediately and has been dropped
    variable_nodes[var] = valid_nodes


Finding the valid nodes for variable: tas
Models list: ['BCC-CSM2-MR', 'MPI-ESM1-2-HR', 'CanESM5', 'CMCC-CM2-SR5', 'HadGEM3-GC31-MM', 'EC-Earth3', 'MPI-ESM1-2-LR', 'FGOALS-f3-L', 'MIROC6', 'IPSL-CM6A-LR', 'CESM1-1-CAM5-CMIP5', 'NorCPM1']
trying to find valid nodes for model: BCC-CSM2-MR
{'activity_id': 'DCPP', 'experiment_id': 'dcppA-hindcast', 'latest': True, 'sub_experiment_id': 's1961', 'project': 'CMIP6', 'table_id': 'Amon', 'variable_id': 'tas', 'source_id': 'BCC-CSM2-MR'}
16
{'esgf-data1.llnl.gov', 'cmip.bcc.cma.cn'}
trying to find valid files for node: esgf-data1.llnl.gov
8
trying to find valid files for node: cmip.bcc.cma.cn
8
trying to find valid nodes for model: MPI-ESM1-2-HR
{'activity_id': 'DCPP', 'experiment_id': 'dcppA-hindcast', 'latest': True, 'sub_experiment_id': 's1961', 'project': 'CMIP6', 'table_id': 'Amon', 'variable_id': 'tas', 'source_id': 'MPI-ESM1-2-HR'}
10
{'esgf-data1.llnl.gov'}
trying to find valid files for node: esgf-data1.llnl.gov
10
trying to find valid 

In [31]:
# Display the valid-node entries stored for the variable 'tas'
# (requires the cell that fills `variable_nodes` to have been run first)
variable_nodes['tas']

[{'source_id': 'BCC-CSM2-MR',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 8},
 {'source_id': 'MPI-ESM1-2-HR',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 10},
 {'source_id': 'CanESM5',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 40},
 {'source_id': 'CMCC-CM2-SR5',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 20},
 {'source_id': 'HadGEM3-GC31-MM',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 10},
 {'source_id': 'EC-Earth3',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 16},
 {'source_id': 'MPI-ESM1-2-LR',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 10},
 {'source_id': 'FGOALS-f3-L',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 9},
 {'source_id': 'MIROC6', 'data_node': 'aims3.llnl.gov', 'num_results': 10},
 {'source_id': 'IPSL-CM6A-LR',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 10},
 {'source_id': 'CESM1-1-CAM5-CMIP5',
  'data_node': 'esgf-data1.llnl.gov',
  'num_results': 40},
 {'source_id': 

In [53]:
# Build, for every variable, the list of search results for its valid nodes.
# `results` maps variable name -> whatever create_results_list returns.
results = dict()

for var in dicts.variables:
    print(f"Finding the results for variable: {var}")

    # Valid-node entries stored for this variable by the earlier cell
    valid_nodes = variable_nodes[var]

    # Point the shared search parameters at the current variable
    params['variable_id'] = var

    # Debug output: the container type and its contents
    print(f"Valid nodes type: {type(valid_nodes)}")
    print(f"Valid nodes: {valid_nodes}")

    # Run the helper for this variable using the nodes selected above
    var_results = create_results_list(
        params=params,
        max_results_list=valid_nodes,
        connection=connection,
    )

    # Store the results under the variable name
    results[var] = var_results

Finding the results for variable: tas
Valid nodes type: <class 'list'>
Valid nodes: [{'source_id': 'BCC-CSM2-MR', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 8}, {'source_id': 'MPI-ESM1-2-HR', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 10}, {'source_id': 'CanESM5', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 40}, {'source_id': 'CMCC-CM2-SR5', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 20}, {'source_id': 'HadGEM3-GC31-MM', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 10}, {'source_id': 'EC-Earth3', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 16}, {'source_id': 'MPI-ESM1-2-LR', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 10}, {'source_id': 'FGOALS-f3-L', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 9}, {'source_id': 'MIROC6', 'data_node': 'aims3.llnl.gov', 'num_results': 10}, {'source_id': 'IPSL-CM6A-LR', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 10}, {'source_id': 'CESM1-1-CAM5-CMIP5', 'data_node': 'esgf-data1.llnl.gov', '

In [54]:
# Dump the whole results mapping
print(f"Results: {results}")
# Report how many variables have an entry
print(f"Length of results: {len(results)}")
# Show the container type stored for 'tas'
print(type(results['tas']))

Results: {'tas': [<pyesgf.search.results.ResultSet object at 0x7f726fff7a90>, <pyesgf.search.results.ResultSet object at 0x7f726f63cca0>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb20>, <pyesgf.search.results.ResultSet object at 0x7f726f63d5d0>, <pyesgf.search.results.ResultSet object at 0x7f7375d42620>, <pyesgf.search.results.ResultSet object at 0x7f73741f29b0>, <pyesgf.search.results.ResultSet object at 0x7f73741f08e0>, <pyesgf.search.results.ResultSet object at 0x7f726f63c7f0>, <pyesgf.search.results.ResultSet object at 0x7f726f63e4a0>, <pyesgf.search.results.ResultSet object at 0x7f73741f2440>, <pyesgf.search.results.ResultSet object at 0x7f726f63e200>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb50>], 'psl': [<pyesgf.search.results.ResultSet object at 0x7f726f63cf70>, <pyesgf.search.results.ResultSet object at 0x7f726fb6dcf0>, <pyesgf.search.results.ResultSet object at 0x7f726fb6f070>, <pyesgf.search.results.ResultSet object at 0x7f726fb6e7d0>, <pyesgf.searc

In [55]:
# Map each variable name to the list of file contexts extracted from its results
results_file_context = {}

# The loop variable is deliberately NOT called `results`: reusing that name
# would rebind the global `results` dict to the last variable's list, so
# re-running this cell (or the summary cell above) would then fail.
for var, result_sets in results.items():
    print("Variable: " + var)
    print("Results: " + str(result_sets))

    # List collecting one file context per result for this variable
    file_context_list = []

    # Loop over the results
    for result in result_sets:
        print("Result: " + str(result))

        # Extract the file context
        file_context = extract_file_context(result)

        # Append the file context to the list
        file_context_list.append(file_context)

    # Store the file context list under the variable name
    results_file_context[var] = file_context_list

Variable: tas
Results: [<pyesgf.search.results.ResultSet object at 0x7f726fff7a90>, <pyesgf.search.results.ResultSet object at 0x7f726f63cca0>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb20>, <pyesgf.search.results.ResultSet object at 0x7f726f63d5d0>, <pyesgf.search.results.ResultSet object at 0x7f7375d42620>, <pyesgf.search.results.ResultSet object at 0x7f73741f29b0>, <pyesgf.search.results.ResultSet object at 0x7f73741f08e0>, <pyesgf.search.results.ResultSet object at 0x7f726f63c7f0>, <pyesgf.search.results.ResultSet object at 0x7f726f63e4a0>, <pyesgf.search.results.ResultSet object at 0x7f73741f2440>, <pyesgf.search.results.ResultSet object at 0x7f726f63e200>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb50>]
Result: <pyesgf.search.results.ResultSet object at 0x7f726fff7a90>
Extracting file context for 8 datasets...
Error: <pyesgf.search.results.DatasetResult object at 0x7f7375df1cf0>
Processed 2 out of 8 results.
Processed 3 out of 8 results.
Processed 4 out o

In [56]:
# Flatten every file context into one dataframe with a leading 'variable' column.
# The per-context frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
file_context_frames = []

for var, file_context_lists in results_file_context.items():
    print("Variable: " + var)

    # Loop over the file contexts
    for file_context in file_context_lists:
        # Convert the dictionary to a dataframe
        file_context_df = pd.DataFrame.from_dict(file_context)

        # Add a new column on the far left of the dataframe
        # containing the variable name
        file_context_df.insert(0, 'variable', var)

        file_context_frames.append(file_context_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# dataframe when there is nothing to combine
if file_context_frames:
    df = pd.concat(file_context_frames, ignore_index=True)
else:
    df = pd.DataFrame()

# Display the dataframe
df

Variable: tas
Variable: psl
Variable: uas
Variable: vas
Variable: rsds


Unnamed: 0,variable,filename,url
0,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r4i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
1,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r2i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
2,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r3i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
3,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r5i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
4,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r8i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
...,...,...,...
1637,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r3i2p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1638,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r2i2p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1639,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r2i1p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1640,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r10i1p1...,https://esgf-data1.llnl.gov/thredds/fileServer...
