In [1]:
# Import the required modules
from pyesgf.search import SearchConnection
import os
import sys
import importlib
import pandas as pd
import requests
from tqdm import tqdm

# Set ESGF_PYCLIENT_NO_FACETS_STAR_WARNING to "on" in the process environment.
# NOTE(review): going by the variable name, this presumably silences pyesgf's
# warning about unrestricted (facets='*') searches - confirm against the
# esgf-pyclient documentation.
os.environ['ESGF_PYCLIENT_NO_FACETS_STAR_WARNING'] = "on"

In [2]:
# Add the directory holding the local helper modules to the module search path
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs unchanged on a machine where this directory exists.
sys.path.append('/home/users/benhutch/downloading-data/DAMIP/')

# Pull every public name of the helper module into the notebook namespace
# NOTE(review): the wildcard import hides where the helpers called in later
# cells (find_valid_nodes, create_results_list, extract_file_context,
# check_file_exists_jasmin) come from - presumably this module; consider
# importing them by name.
from testing_download_functions import *

In [3]:
# Import the dictionaries
import dictionaries as dicts

In [4]:
# Refresh the local helper modules in place, so that edits made to them on
# disk are picked up without having to restart the kernel
for module_name in ('testing_download_functions', 'dictionaries'):
    importlib.reload(sys.modules[module_name])

# Re-bind the notebook-level names against the freshly reloaded modules
from testing_download_functions import *
import dictionaries as dicts

In [5]:
# Set up the ESGF search connection that the query cells of this notebook use.
# The first argument is read from dicts.search_connection (presumably the URL
# of the search service - verify in dictionaries.py).
# NOTE(review): distrib=True presumably requests a distributed search across
# the federation rather than a single index - confirm in the esgf-pyclient docs.
connection = SearchConnection(dicts.search_connection, distrib=True)

In [6]:
# Base search constraints shared by the ESGF queries in this notebook.
# To speed up the check, only the s1961 sub-experiment is searched.
params = dict(
    activity_id='DCPP',
    experiment_id='dcppA-hindcast',
    latest=True,
    sub_experiment_id='s1961',
    project='CMIP6',
    table_id='Amon',
)

In [7]:
# Check, for every (variable, model) pair, whether the ESGF search returns any
# datasets matching the base search params.
variables = dicts.variables_test
models = dicts.models

# One record per (variable, model) pair, with keys: model, variable,
# does_model_exist. The dataframe is built once after the loops instead of
# being re-allocated with pd.concat on every iteration.
availability_records = []

# Loop over the variables
for variable in variables:
    print("Checking whether models exist for variable: " + variable)

    # Loop over the models
    for model in models:
        print("Checking whether model: " + model + " exists for variable: " + variable)

        # Query with a per-pair copy of the base constraints, so the shared
        # `params` dict is not left holding this loop's last
        # variable_id/source_id when later cells reuse it.
        query_params = {**params, 'variable_id': variable, 'source_id': model}

        # Query the database
        ctx = connection.new_context(**query_params)

        try:
            # Number of datasets matching the query
            num_hits = len(ctx.search())
        except Exception as err:
            # A failed search is still recorded as "does not exist", but the
            # reason is reported rather than silently discarded. Catching
            # Exception (not a bare except) keeps the kernel interruptible.
            print("Search failed for model: " + model + " and variable: " + variable + " (" + repr(err) + ")")
            num_hits = 0

        model_exists = num_hits > 0

        if model_exists:
            print("Model: " + model + " exists for variable: " + variable)
        else:
            print("Model: " + model + " does not exist for variable: " + variable)

        availability_records.append({'model': model, 'variable': variable, 'does_model_exist': model_exists})

# Build the results dataframe in one go; passing the columns explicitly keeps
# the expected layout even when there were no pairs to check
results_df = pd.DataFrame(availability_records, columns=['model', 'variable', 'does_model_exist'])

# Display the dataframe
results_df

Checking whether models exist for variable: rsds
Checking whether model: BCC-CSM2-MR exists for variable: rsds
Model: BCC-CSM2-MR exists for variable: rsds
Checking whether model: MPI-ESM1-2-HR exists for variable: rsds
Model: MPI-ESM1-2-HR exists for variable: rsds
Checking whether model: CanESM5 exists for variable: rsds
Model: CanESM5 exists for variable: rsds
Checking whether model: CMCC-CM2-SR5 exists for variable: rsds
Model: CMCC-CM2-SR5 exists for variable: rsds
Checking whether model: HadGEM3-GC31-MM exists for variable: rsds
Model: HadGEM3-GC31-MM exists for variable: rsds
Checking whether model: EC-Earth3 exists for variable: rsds
Model: EC-Earth3 exists for variable: rsds
Checking whether model: MPI-ESM1-2-LR exists for variable: rsds
Model: MPI-ESM1-2-LR does not exist for variable: rsds
Checking whether model: FGOALS-f3-L exists for variable: rsds
Model: FGOALS-f3-L exists for variable: rsds
Checking whether model: MIROC6 exists for variable: rsds
Model: MIROC6 exists for

Unnamed: 0,model,variable,does_model_exist
0,BCC-CSM2-MR,rsds,True
1,MPI-ESM1-2-HR,rsds,True
2,CanESM5,rsds,True
3,CMCC-CM2-SR5,rsds,True
4,HadGEM3-GC31-MM,rsds,True
5,EC-Earth3,rsds,True
6,MPI-ESM1-2-LR,rsds,False
7,FGOALS-f3-L,rsds,True
8,MIROC6,rsds,True
9,IPSL-CM6A-LR,rsds,True


In [8]:
# Dictionary mapping each variable to the valid nodes found for its models
# (in the recorded run, find_valid_nodes returned a list of dicts with the
# keys 'source_id', 'data_node' and 'num_results')
variable_nodes = {}

# Find the valid nodes for each variable and models list combination
# dicts.var_models_test is iterated as variable -> list of models
for var, models_list in dicts.var_models_test.items():
    print("Finding the valid nodes for variable: " + var)
    print("Models list: " + str(models_list))

    # Set up the variable_id for the params
    params['variable_id'] = var

    # Find the valid nodes
    valid_nodes = find_valid_nodes(params=params,
                                   models_list=models_list,
                                   conn=connection)

    # Store the valid nodes under the variable. No placeholder entry is
    # written beforehand: it would be overwritten immediately, and would leave
    # a misleading empty entry behind if the lookup raised.
    variable_nodes[var] = valid_nodes


Finding the valid nodes for variable: rsds
Models list: ['BCC-CSM2-MR', 'MPI-ESM1-2-HR', 'CanESM5', 'CMCC-CM2-SR5', 'HadGEM3-GC31-MM', 'EC-Earth3', 'FGOALS-f3-L', 'MIROC6', 'IPSL-CM6A-LR', 'CESM1-1-CAM5-CMIP5', 'NorCPM1']
trying to find valid nodes for model: BCC-CSM2-MR
{'activity_id': 'DCPP', 'experiment_id': 'dcppA-hindcast', 'latest': True, 'sub_experiment_id': 's1961', 'project': 'CMIP6', 'table_id': 'Amon', 'variable_id': 'rsds', 'source_id': 'BCC-CSM2-MR'}
32
{'cmip.bcc.cma.cn', 'esgf.ceda.ac.uk', 'esgf-data1.llnl.gov', 'esgf3.dkrz.de'}
trying to find valid files for node: cmip.bcc.cma.cn
8
trying to find valid files for node: esgf.ceda.ac.uk
8
trying to find valid files for node: esgf-data1.llnl.gov
8
trying to find valid files for node: esgf3.dkrz.de
8
trying to find valid nodes for model: MPI-ESM1-2-HR
{'activity_id': 'DCPP', 'experiment_id': 'dcppA-hindcast', 'latest': True, 'sub_experiment_id': 's1961', 'project': 'CMIP6', 'table_id': 'Amon', 'variable_id': 'rsds', 'source_

In [9]:
# Build the search results for every variable.
# `results` maps a variable name to whatever create_results_list returns
results = {}

for var in dicts.variables_test:
    print(f"Finding the results for variable: {var}")

    # Valid nodes previously stored for this variable
    valid_nodes = variable_nodes[var]

    # Point the shared search params at the current variable
    params['variable_id'] = var

    # Debug output: show what is handed to create_results_list
    print(f"Valid nodes type: {type(valid_nodes)}")
    print(f"Valid nodes: {valid_nodes}")

    # Run the search for this variable
    var_results = create_results_list(
        params=params,
        max_results_list=valid_nodes,
        connection=connection,
    )

    # Store the results under the variable
    results[var] = var_results

Finding the results for variable: rsds
Valid nodes type: <class 'list'>
Valid nodes: [{'source_id': 'BCC-CSM2-MR', 'data_node': 'cmip.bcc.cma.cn', 'num_results': 8}, {'source_id': 'MPI-ESM1-2-HR', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 10}, {'source_id': 'CanESM5', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 20}, {'source_id': 'CMCC-CM2-SR5', 'data_node': 'esgf-node2.cmcc.it', 'num_results': 20}, {'source_id': 'HadGEM3-GC31-MM', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 10}, {'source_id': 'EC-Earth3', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 16}, {'source_id': 'FGOALS-f3-L', 'data_node': 'esg.lasg.ac.cn', 'num_results': 3}, {'source_id': 'MIROC6', 'data_node': 'esgf-data02.diasjp.net', 'num_results': 10}, {'source_id': 'IPSL-CM6A-LR', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 10}, {'source_id': 'CESM1-1-CAM5-CMIP5', 'data_node': 'esgf-data1.llnl.gov', 'num_results': 40}, {'source_id': 'NorCPM1', 'data_node': 'esgf.ceda.ac.uk', 'num_results': 20}]


  0%|          | 0/11 [00:00<?, ?it/s]

Querying for source_id: BCC-CSM2-MR and data_node: cmip.bcc.cma.cn


  9%|▉         | 1/11 [00:01<00:15,  1.51s/it]

Found 8 results.
Querying for source_id: MPI-ESM1-2-HR and data_node: esgf.ceda.ac.uk


 18%|█▊        | 2/11 [00:03<00:18,  2.06s/it]

Found 10 results.
Querying for source_id: CanESM5 and data_node: esgf.ceda.ac.uk


 27%|██▋       | 3/11 [00:06<00:18,  2.25s/it]

Found 20 results.
Querying for source_id: CMCC-CM2-SR5 and data_node: esgf-node2.cmcc.it


 36%|███▋      | 4/11 [00:09<00:17,  2.48s/it]

Found 20 results.
Querying for source_id: HadGEM3-GC31-MM and data_node: esgf.ceda.ac.uk


 45%|████▌     | 5/11 [00:11<00:14,  2.46s/it]

Found 10 results.
Querying for source_id: EC-Earth3 and data_node: esgf.ceda.ac.uk


 55%|█████▍    | 6/11 [00:14<00:12,  2.48s/it]

Found 16 results.
Querying for source_id: FGOALS-f3-L and data_node: esg.lasg.ac.cn


 64%|██████▎   | 7/11 [00:15<00:08,  2.09s/it]

Found 3 results.
Querying for source_id: MIROC6 and data_node: esgf-data02.diasjp.net


 73%|███████▎  | 8/11 [00:16<00:05,  1.87s/it]

Found 10 results.
Querying for source_id: IPSL-CM6A-LR and data_node: esgf.ceda.ac.uk


 82%|████████▏ | 9/11 [00:19<00:04,  2.12s/it]

Found 10 results.
Querying for source_id: CESM1-1-CAM5-CMIP5 and data_node: esgf-data1.llnl.gov


 91%|█████████ | 10/11 [00:21<00:02,  2.03s/it]

Found 40 results.
Querying for source_id: NorCPM1 and data_node: esgf.ceda.ac.uk


100%|██████████| 11/11 [00:24<00:00,  2.21s/it]

Found 20 results.





In [54]:
# Quick inspection of the search results gathered per variable
print("Results: " + str(results))
# print the length of the results
print("Length of results: " + str(len(results)))
# Report the container type for every variable that was actually searched.
# A hard-coded 'tas' key raises KeyError whenever 'tas' is not one of the
# searched variables (e.g. an rsds-only run).
for searched_var, searched_results in results.items():
    print(searched_var + ": " + str(type(searched_results)))

Results: {'tas': [<pyesgf.search.results.ResultSet object at 0x7f726fff7a90>, <pyesgf.search.results.ResultSet object at 0x7f726f63cca0>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb20>, <pyesgf.search.results.ResultSet object at 0x7f726f63d5d0>, <pyesgf.search.results.ResultSet object at 0x7f7375d42620>, <pyesgf.search.results.ResultSet object at 0x7f73741f29b0>, <pyesgf.search.results.ResultSet object at 0x7f73741f08e0>, <pyesgf.search.results.ResultSet object at 0x7f726f63c7f0>, <pyesgf.search.results.ResultSet object at 0x7f726f63e4a0>, <pyesgf.search.results.ResultSet object at 0x7f73741f2440>, <pyesgf.search.results.ResultSet object at 0x7f726f63e200>, <pyesgf.search.results.ResultSet object at 0x7f726f63cb50>], 'psl': [<pyesgf.search.results.ResultSet object at 0x7f726f63cf70>, <pyesgf.search.results.ResultSet object at 0x7f726fb6dcf0>, <pyesgf.search.results.ResultSet object at 0x7f726fb6f070>, <pyesgf.search.results.ResultSet object at 0x7f726fb6e7d0>, <pyesgf.searc

In [10]:
# Initialize an empty dictionary mapping each variable to the list of file
# contexts extracted from its result sets
results_file_context = {}

# For each of the variables.
# The loop variable must not be called `results`: rebinding that name would
# replace the dict being iterated with the last variable's list, which breaks
# any re-run of this cell and any later use of `results` as a dict.
for var, var_result_sets in results.items():
    print("Variable: " + var)
    print("Results: " + str(var_result_sets))

    # Initialize an empty list to store the file contexts for this variable
    file_context_list = []

    # Loop over the result sets
    for result in var_result_sets:
        print("Result: " + str(result))

        # Extract the file context
        file_context = extract_file_context(result)

        # Append the file context to the list
        file_context_list.append(file_context)

    # Store the file context list under the variable
    results_file_context[var] = file_context_list

Variable: rsds
Results: [<pyesgf.search.results.ResultSet object at 0x7fa61aee2170>, <pyesgf.search.results.ResultSet object at 0x7fa591ec7100>, <pyesgf.search.results.ResultSet object at 0x7fa591e86ec0>, <pyesgf.search.results.ResultSet object at 0x7fa61aee1b70>, <pyesgf.search.results.ResultSet object at 0x7fa61aee3f70>, <pyesgf.search.results.ResultSet object at 0x7fa591bce1a0>, <pyesgf.search.results.ResultSet object at 0x7fa591bce1d0>, <pyesgf.search.results.ResultSet object at 0x7fa591bce9b0>, <pyesgf.search.results.ResultSet object at 0x7fa591bceda0>, <pyesgf.search.results.ResultSet object at 0x7fa591bcef20>, <pyesgf.search.results.ResultSet object at 0x7fa591bcf1f0>]
Result: <pyesgf.search.results.ResultSet object at 0x7fa61aee2170>
Extracting file context for 8 datasets...


 12%|█▎        | 1/8 [00:02<00:18,  2.58s/it]

Processed 1 out of 8 results.


 25%|██▌       | 2/8 [00:03<00:11,  1.86s/it]

Processed 2 out of 8 results.


 38%|███▊      | 3/8 [00:05<00:07,  1.57s/it]

Processed 3 out of 8 results.


 50%|█████     | 4/8 [00:06<00:05,  1.43s/it]

Processed 4 out of 8 results.


 62%|██████▎   | 5/8 [00:07<00:04,  1.35s/it]

Processed 5 out of 8 results.


 75%|███████▌  | 6/8 [00:08<00:02,  1.31s/it]

Processed 6 out of 8 results.


 88%|████████▊ | 7/8 [00:10<00:01,  1.28s/it]

Processed 7 out of 8 results.


100%|██████████| 8/8 [00:11<00:00,  1.41s/it]


Processed 8 out of 8 results.
Result: <pyesgf.search.results.ResultSet object at 0x7fa591ec7100>
Extracting file context for 10 datasets...


 10%|█         | 1/10 [00:09<01:21,  9.06s/it]

Processed 1 out of 10 results.


 20%|██        | 2/10 [00:18<01:12,  9.11s/it]

Processed 2 out of 10 results.


 30%|███       | 3/10 [00:27<01:04,  9.15s/it]

Processed 3 out of 10 results.


 40%|████      | 4/10 [00:36<00:54,  9.10s/it]

Processed 4 out of 10 results.


 50%|█████     | 5/10 [00:45<00:45,  9.13s/it]

Processed 5 out of 10 results.


 60%|██████    | 6/10 [00:54<00:36,  9.10s/it]

Processed 6 out of 10 results.


 70%|███████   | 7/10 [01:03<00:27,  9.09s/it]

Processed 7 out of 10 results.


 80%|████████  | 8/10 [01:12<00:18,  9.12s/it]

Processed 8 out of 10 results.


 90%|█████████ | 9/10 [01:22<00:09,  9.12s/it]

Processed 9 out of 10 results.


100%|██████████| 10/10 [01:31<00:00,  9.12s/it]


Processed 10 out of 10 results.
Result: <pyesgf.search.results.ResultSet object at 0x7fa591e86ec0>
Extracting file context for 20 datasets...


  5%|▌         | 1/20 [00:09<02:53,  9.11s/it]

Processed 1 out of 20 results.


 10%|█         | 2/20 [00:18<02:43,  9.07s/it]

Processed 2 out of 20 results.


 15%|█▌        | 3/20 [00:24<02:11,  7.71s/it]

Error: <pyesgf.search.results.DatasetResult object at 0x7fa591bcf460>


 20%|██        | 4/20 [00:33<02:11,  8.24s/it]

Processed 4 out of 20 results.


In [56]:
# Flatten the per-variable file contexts into a single dataframe.
# The per-context frames are collected in a list and concatenated once;
# growing a dataframe with pd.concat inside the loop copies every
# accumulated row again on each iteration.
file_context_frames = []

for var, file_context_lists in results_file_context.items():
    print("Variable: " + var)

    # Loop over the file contexts
    for file_context in file_context_lists:
        # Convert the dictionary to a dataframe
        file_context_df = pd.DataFrame.from_dict(file_context)

        # Add a new column on the far left of the dataframe
        # containing the variable name
        file_context_df.insert(0, 'variable', var)

        file_context_frames.append(file_context_df)

# pd.concat raises on an empty list, so fall back to an empty dataframe
# when there was nothing to collect
if file_context_frames:
    df = pd.concat(file_context_frames, ignore_index=True)
else:
    df = pd.DataFrame()

# Display the dataframe
df

Variable: tas
Variable: psl
Variable: uas
Variable: vas
Variable: rsds


Unnamed: 0,variable,filename,url
0,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r4i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
1,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r2i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
2,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r3i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
3,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r5i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
4,tas,tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r8i1...,http://esgf-data1.llnl.gov/thredds/fileServer/...
...,...,...,...
1637,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r3i2p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1638,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r2i2p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1639,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r2i1p1f...,https://esgf-data1.llnl.gov/thredds/fileServer...
1640,rsds,rsds_Amon_NorCPM1_dcppA-hindcast_s1961-r10i1p1...,https://esgf-data1.llnl.gov/thredds/fileServer...


In [58]:
# Set up the directory for the DCPP data on JASMIN
dcpp_dir_badc = "/badc/cmip6/data/CMIP6/DCPP/"

# Check whether the files listed in df exist under that directory on JASMIN
# NOTE(review): the exact contents of the returned dataframe depend on
# check_file_exists_jasmin, which is defined outside this notebook.
jasmin_files_df = check_file_exists_jasmin(df=df,
                                            directory=dcpp_dir_badc)

BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r4i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r2i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r3i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r5i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r8i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r6i1p1f1_gn_196101-197012.nc
BCC
File exists for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r7i1p1f1_gn_196101-197012.nc
MPI-M
File exists for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r3i1p1f1_gn_196111-197112.nc
MPI-M
File exists for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r1i1p1f1_gn_196111-197112.nc
MPI-M
File exists for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r5i1p1f1_gn_196111-197112.nc
MPI-M
File exists for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r4i1p1f1_gn_

In [64]:
# Now set up the group workspace directory
# NOTE(review): this is a user-specific absolute path.
dcpp_dir_gws = "/gws/nopw/j04/canari/users/benhutch/"

# Check whether the files listed in df exist in the group workspace on JASMIN
jasmin_files_df_gws = check_file_exists_jasmin(df=df,
                                            directory=dcpp_dir_gws) 

BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r4i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r2i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r3i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r5i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r8i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r6i1p1f1_gn_196101-197012.nc
BCC
gws
File does not exist for tas_Amon_BCC-CSM2-MR_dcppA-hindcast_s1961-r7i1p1f1_gn_196101-197012.nc
MPI-M
gws
File does not exist for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r3i1p1f1_gn_196111-197112.nc
MPI-M
gws
File does not exist for tas_Amon_MPI-ESM1-2-HR_dcppA-hindcast_s1961-r1i1p1f1_gn_196111-197112.nc
MPI-M
gws
File does not exist for tas_Amon_MPI-ESM1-2-HR_dcppA-hi