In [1]:
# Testing the downloading data method from:
# https://claut.gitlab.io/man_ccia/lab2.html
# Import the required modules
from pyesgf.search import SearchConnection
import os
import sys
import importlib
import pandas as pd
import requests
from tqdm import tqdm

# Set the os environment to on
os.environ['ESGF_PYCLIENT_NO_FACETS_STAR_WARNING'] = "on"

In [2]:
# Import the functions
from testing_download_functions import query_data_esgf, extract_file_context, \
                                        download_file, extract_file_context_multithread, \
                                        check_file_exists_jasmin, query_models_esgf

In [3]:
# Import the functions again
# import the functions
_ = importlib.reload(sys.modules['testing_download_functions'])

# Import the functions
from testing_download_functions import query_data_esgf, extract_file_context, \
                                        download_file, extract_file_context_multithread, \
                                        check_file_exists_jasmin, query_models_esgf, find_valid_nodes

In [4]:
# set the search connection
# to the LLNL search node
conn = SearchConnection('https://esgf-node.llnl.gov/esg-search', distrib=True)

In [5]:
# Find the models which have data on the esgf node for the following constraints
# The constraints are:
experiment_id = 'dcppA-hindcast'
latest = True
variable_id = 'tas'
project = 'CMIP6'
table_id = 'Amon'
activity_id = 'DCPP'
sub_experiment_id = 's1985'

In [24]:
# Test the function which does the same as below
models_list = query_models_esgf(experiment_id=experiment_id,
                                variable_id=variable_id,
                                table_id=table_id,
                                activity_id=activity_id,
                                connection=conn,
                                latest=latest,
                                project=project,
                                sub_experiment_id=sub_experiment_id)

# Print the models list
print(models_list)

KeyboardInterrupt: 

In [7]:
# # Set up the params for the query
# params = {
#     "latest": latest,
#     "project": project,
#     "experiment_id": experiment_id,
#     "variable_id": variable_id,
#     "activity_id": activity_id,
#     "table_id": table_id
# }

# # Query the database
# query = conn.new_context(**params)

# # Get the results
# results = query.search()

# print(len(results))

# # Python
# # Form a list of the unique 'source_id' values from the results
# # Python
# # Form a set of the unique 'source_id' values from the results
# source_id_set = set(id for result in results for id in result.json['source_id'])

# # Print the set
# print(source_id_set)

In [6]:
# print(type(models_list))

# # # Convert the set to a list
# source_id_list = list(models_list)

# # Constrain to the model HadGEM3-GC31-LL and the model following it
# # find the index of the model
# index = source_id_list.index('HadGEM3-GC31-MM')

# source_id_list = ["BCC-CSM2-MR", "HadGEM3-GC31-MM"]
source_id_list = ["FGOALS-f3-L"]

source_id_set = source_id_list

In [7]:
# Set the params for the query
params = {
    "latest": latest,
    "project": project,
    "experiment_id": experiment_id,
    "variable_id": variable_id,
    "activity_id": activity_id,
    "table_id": table_id,
    "sub_experiment_id": sub_experiment_id,
    "variant_label": "r6i1p1f1"
}

In [8]:
# # Constrain the source_id_set to the first 1 model
# source_id_set = list(source_id_set)[0:1]

# # Print the set
# print(source_id_set)

# Initialize an empty dictionary to store the results
max_results = {'source_id': None, 'data_node': None, 'num_results': 0}

# Create a list for the max_results dictionaries
max_results_list = []

# Set up the max results per source dictionary
max_results_per_source = {}

# Loop through the source_id_set and query which nodes have data for each model
for source_id in source_id_set:
    print("trying to find valid nodes for model: {}".format(source_id))
    # Set the source_id constraint
    params['source_id'] = source_id
    print(params)
    # Query the database
    model_query = conn.new_context(**params)
    # Get the results
    model_results = model_query.search()
    # Print the number of results
    print(len(model_results))

    # if the len of the model results is not 0
    if len(model_results) != 0:
        # Print the first result
        print(model_results[0].json['id'])

    # Identify the unique nodes (data_node) which have data for the model
    data_node_set = set(result.json['data_node'] for result in model_results)

    # Print the set
    print(data_node_set)

    # Loop through the data_node_set and query how many files are available for each 
    # node
    for data_node in data_node_set:
        print("trying to find valid files for node: {}".format(data_node))
        
        # Set up the params for the query
        params_node = params.copy()
        
        # Set the data_node constraint
        params_node['data_node'] = data_node
        # Query the database
        node_query = conn.new_context(**params_node)
        # Get the results
        node_results = node_query.search()
        # Print the number of results
        print(len(node_results))

        # If this source_id is not in max_results_per_source or this data_node has more results, update the dictionary
        if source_id not in max_results_per_source or len(node_results) > max_results_per_source[source_id]:
            max_results = {'source_id': source_id, 'data_node': data_node, 'num_results': len(node_results)}
            max_results_per_source[source_id] = len(node_results)

            # Append the max_results dictionary to the list
            max_results_list.append(max_results)
        else:
            print("this data_node has less results than the previous one")
            continue

# Print the dictionary
print(max_results_list)





trying to find valid nodes for model: FGOALS-f3-L
{'latest': True, 'project': 'CMIP6', 'experiment_id': 'dcppA-hindcast', 'variable_id': 'tas', 'activity_id': 'DCPP', 'table_id': 'Amon', 'sub_experiment_id': 's1985', 'variant_label': 'r6i1p1f1', 'source_id': 'FGOALS-f3-L'}
2
CMIP6.DCPP.CAS.FGOALS-f3-L.dcppA-hindcast.s1985-r6i1p1f1.Amon.tas.gr.v20220325|esgf-data1.llnl.gov
{'esg.lasg.ac.cn', 'esgf-data1.llnl.gov'}
trying to find valid files for node: esg.lasg.ac.cn
1
trying to find valid files for node: esgf-data1.llnl.gov
1
this data_node has less results than the previous one
[{'source_id': 'FGOALS-f3-L', 'data_node': 'esg.lasg.ac.cn', 'num_results': 1}]


In [11]:
# # Test the function for finding the valid nodes
# max_results_list = find_valid_nodes(params=params, 
#                                     models_list=models_list,
#                                     conn=conn)

In [9]:
# Clean the max_results_list to remove duplicate source_id entries
# Keep the entry with the highest number of results (num_results)
# Initialize an empty list to store the unique source_id entries
unique_source_id_list = []

# Loop through the max_results_list and append the unique source_id entries
for result in max_results_list:
    if result['source_id'] not in unique_source_id_list:
        unique_source_id_list.append(result['source_id'])

# Print the list
print(unique_source_id_list)

# Initialize an empty list to store the unique max_results_list entries
unique_max_results_list = []

# Loop through the unique_source_id_list and only
# Append the max_results_list entries which match the source_id and have the highest num_results
for source_id in unique_source_id_list:
    print("source_id: {}".format(source_id))
    # Initialize an empty list to store the num_results
    num_results_list = []
    # Loop through the max_results_list and append the num_results to the list
    for result in max_results_list:
        if result['source_id'] == source_id:
            num_results_list.append(result['num_results'])
    # Get the max num_results
    max_num_results = max(num_results_list)
    # Loop through the max_results_list and append the entries which match the source_id and max_num_results
    for result in max_results_list:
        if result['source_id'] == source_id and result['num_results'] == max_num_results:
            unique_max_results_list.append(result)

['FGOALS-f3-L']
source_id: FGOALS-f3-L


In [13]:
# # Convert the unique_max_results_list to a dataframe
# unique_max_results_df = pd.DataFrame.from_dict(unique_max_results_list)

# # Print the dataframe
# unique_max_results_df

# # save the dataframe
# # save in current directory + save_data + filename
# save_dir = os.path.join(os.getcwd(), 'save_data')
# save_filename = 'unique_max_results_df_{}_{}_{}_{}_{}.csv'.format(experiment_id, variable_id, project, table_id, activity_id)

# # Form the save path
# save_path = os.path.join(save_dir, save_filename)

# # Check if the save directory exists
# if not os.path.exists(save_dir):
#     # Make the directory
#     os.makedirs(save_dir)

# # Save the dataframe
# unique_max_results_df.to_csv(save_path)

In [14]:
# save_dir = os.path.join(os.getcwd(), 'save_data')
# save_filename = 'unique_max_results_df_{}_{}_{}_{}_{}.csv'.format(experiment_id, variable_id, project, table_id, activity_id)

# # Form the save path
# save_path = os.path.join(save_dir, save_filename)

In [10]:
print(type(unique_max_results_list))

print(unique_max_results_list)

# Convert the unique_max_results_list to a dataframe
unique_max_results_df = pd.DataFrame.from_dict(unique_max_results_list)

# Print the dataframe
unique_max_results_df

<class 'list'>
[{'source_id': 'FGOALS-f3-L', 'data_node': 'esg.lasg.ac.cn', 'num_results': 1}]


Unnamed: 0,source_id,data_node,num_results
0,FGOALS-f3-L,esg.lasg.ac.cn,1


In [11]:
# Open the save_path as a dataframe
# unique_max_results_df = pd.read_csv(save_path)

# # Convert unique_max_results to a dataframe
# unique_max_results_df = pd.DataFrame.from_dict(unique_max_results_list)

results_list = []

# Loop over the dataframe to create a list of result sets
for i in range(len(unique_max_results_df)):
    
    # Get the source_id and data_node
    source_id = unique_max_results_df.loc[i, 'source_id']
    data_node = unique_max_results_df.loc[i, 'data_node']

    # Print the source_id and data_node
    print("source_id: {}, data_node: {}".format(source_id, data_node))

    # Print the experiment_id, variable_id, activity_id, and data_node
    print("experiment_id: {}, variable_id: {}, activity_id: {}, data_node: {}".format(experiment_id, variable_id, activity_id, data_node))

    results = query_data_esgf(conn,
                            source_id = source_id,
                            experiment_id = experiment_id,
                            variable_id = variable_id,
                            table_id = table_id,
                            project=project,
                            activity_id = activity_id,
                            data_node = data_node,
                            sub_experiment_id=sub_experiment_id
    )

    print(len(results))

    # Append the results to the results_list
    results_list.append(results)

# Print the results_list
print(results_list)

source_id: FGOALS-f3-L, data_node: esg.lasg.ac.cn
experiment_id: dcppA-hindcast, variable_id: tas, activity_id: DCPP, data_node: esg.lasg.ac.cn
9
[<pyesgf.search.results.ResultSet object at 0x7fd2085ef220>]


In [12]:
# Set up a list for the file context
file_context_list = []
failed_results_list = []

# # Constrain results_list to the first 3 results
# results_list = results_list[3:5]

# Loop through the results_list and print ther number of results
for results in results_list:
    print(len(results))

    # Extract the file context from the results
    file_context, failed_results = extract_file_context(results)

    # Append the file_context to the file_context_list
    file_context_list.append(file_context)

    # Append the failed_results to the failed_results_list
    failed_results_list.append(failed_results)


# Print the file_context_list
print(file_context_list)

# Print the failed_results_list
print(failed_results_list)

9
Extracting file context for 9 datasets...


  0%|          | 0/9 [00:00<?, ?it/s]

 11%|█         | 1/9 [00:03<00:30,  3.83s/it]

Processed 1 out of 9 results.


 22%|██▏       | 2/9 [00:06<00:20,  2.95s/it]

Processed 2 out of 9 results.


 33%|███▎      | 3/9 [00:07<00:14,  2.34s/it]

Processed 3 out of 9 results.


 44%|████▍     | 4/9 [00:09<00:09,  1.95s/it]

Processed 4 out of 9 results.


 56%|█████▌    | 5/9 [00:11<00:08,  2.24s/it]

Processed 5 out of 9 results.


 67%|██████▋   | 6/9 [00:13<00:06,  2.07s/it]

Processed 6 out of 9 results.


 78%|███████▊  | 7/9 [00:15<00:04,  2.11s/it]

Processed 7 out of 9 results.


 89%|████████▉ | 8/9 [00:19<00:02,  2.70s/it]

Processed 8 out of 9 results.


100%|██████████| 9/9 [00:21<00:00,  2.41s/it]

Processed 9 out of 9 results.
[[{'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r2i1p1f1/Amon/tas/gr/v20220202/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r1i1p1f1/Amon/tas/gr/v20220129/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r3i1p1f1/Amon/tas/gr/v20220212/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i




In [13]:
print(len(file_context_list))
print(type(file_context_list))
print(file_context_list)

for file_context in file_context_list:
    print(type(file_context))
    print(file_context)

1
<class 'list'>
[[{'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r2i1p1f1/Amon/tas/gr/v20220202/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r1i1p1f1/Amon/tas/gr/v20220129/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc', 'url': 'https://esg.lasg.ac.cn/thredds/fileServer/esg_dataroot/CMIP6/DCPP/CAS/FGOALS-f3-L/dcppA-hindcast/s1985-r3i1p1f1/Amon/tas/gr/v20220212/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc'}, {'filename': 'tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i1p1f1_gr_1985

In [14]:
# Create an empty DataFrame
df = pd.DataFrame()

# Loop through the file_context_list
for file_context in file_context_list:
    # Convert the dictionary to a DataFrame
    temp_df = pd.DataFrame(file_context)

    # Append the temp_df to the main df
    df = pd.concat([df, temp_df], ignore_index=True)

# Print the df
df

Unnamed: 0,filename,url
0,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
1,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
2,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
3,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
4,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r5i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
5,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r6i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
6,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r9i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
7,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r7i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...
8,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r8i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...


In [20]:
# # Python
# # Create an empty DataFrame
# df = pd.DataFrame()

# # Loop through the file_context_list
# for file_context in file_context_list:
#     # Convert the dictionary to a DataFrame
#     temp_df = pd.DataFrame.from_dict(file_context)

#     # Append the temp_df to the main df
#     df = pd.concat([df, temp_df], ignore_index=True)

In [15]:
# reload for modules
import importlib
import sys

# import the functions
_ = importlib.reload(sys.modules['testing_download_functions'])

from testing_download_functions import query_data_esgf, extract_file_context, \
                                        download_file, extract_file_context_multithread, \
                                        check_file_exists_jasmin

In [16]:
# Check whether the files exist on JASMIN
import glob

# Set up the file_context_df
file_context_df = df.copy()

# We want to verify whether these files exist on JASMIN
damip_dir = "/badc/cmip6/data/CMIP6/DCPP/"

# Test the function
# TODO
files_df = check_file_exists_jasmin(file_context_df, damip_dir)

files_df

CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r5i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r6i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r9i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r7i1p1f1_gr_198511-199512.nc
CAS
badc
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r8i1p1f1_gr_198511-199512.nc


Unnamed: 0,filename,url,file_exists,filepath
0,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
1,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
2,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
3,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
4,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r5i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
5,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r6i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
6,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r9i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
7,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r7i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,
8,tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r8i1...,https://esg.lasg.ac.cn/thredds/fileServer/esg_...,False,


In [17]:
# We only want to download the files which don't already exist on JASMIN
dcpp_dir_gws = "/gws/nopw/j04/canari/users/benhutch/"


# Now we want to check whether the files exist on JASMIN
files_df = check_file_exists_jasmin(files_df, dcpp_dir_gws)

CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r2i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r1i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r3i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r4i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r5i1p1f1_gr_198511-199512.nc
CAS
gws
File does not exist for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r6i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r9i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r7i1p1f1_gr_198511-199512.nc
CAS
gws
File exists for tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r8i1p1f1_gr_198511-199512.nc


In [18]:
# We only want to download the files which don't already exist on JASMIN
# Set up the download directory
download_dir = "/gws/nopw/j04/canari/users/benhutch/"

# Keep a copy of the files_df
files_df_copy = files_df.copy()

# Keep only the files which exist on JASMIN in the copy
files_df_copy = files_df_copy[files_df_copy['file_exists'] == True]

# Loop through files_df and download the files
# First constrain the dataframe to only the files which don't already exist on JASMIN
files_df = files_df[files_df['file_exists'] == False]

# Reset the index
files_df.reset_index(drop=True, inplace=True)

# Loop through the files_df and download the files
for i in tqdm(range(len(files_df))):
    # Get the file_url
    file_url = files_df.loc[i, 'url']

    # Get the filename
    filename = files_df.loc[i, 'filename']

    # Split the filename and extract the variable name
    variable = filename.split('_')[0]

    # Split the filename to get the experiment name
    experiment = filename.split('_')[3]

    # Set up the model
    model = filename.split('_')[2]

    # Set up the download directory
    # download_dir_loop = os.path.join(download_dir, experiment, variable, model)

    # Modify download dir to be inline with that already saved on canari
    download_dir_loop = os.path.join(download_dir, experiment, "data",
                                        variable, model)

    # If the download directory doesn't exist, make it
    if not os.path.exists(download_dir_loop):
        os.makedirs(download_dir_loop)

    # Set up the download path
    download_path = os.path.join(download_dir_loop, filename)

    # Assert that the download path doesn't already exist
    assert not os.path.exists(download_path), "The file {} already exists".format(download_path)

    # In the filepath column of the dataframe
    # replace the current file path with the download path
    files_df.loc[i, 'filepath'] = download_path

    # Replace the file_exists column with True
    files_df.loc[i, 'file_exists'] = True

    # Set up the request
    r = requests.get(file_url, stream=True)

    # Set up the total size
    total_size = int(r.headers.get('content-length', 0))
    
    # Set up the block size
    block_size = 1024

    # Download the file
    with open(download_path, 'wb') as f:
        for data in tqdm(r.iter_content(block_size), 
                        total = total_size//block_size, 
                        unit = 'KiB', 
                        unit_scale = True):
            f.write(data)

        # If the total size is no 0
        if total_size != 0:
            print("File is not empty")
            print("Download complete - file saved to {}".format(download_path))

# Assert that all rows in file_exists are True
assert all(files_df['file_exists'] == True), "Not all files have been downloaded"

# Assert that the filepath column doesn't contain any NaNs
assert not any(files_df['filepath'].isna()), "The filepath column contains NaNs"


# append the files_df to the files_df_copy
files_df_copy = pd.concat([files_df_copy, files_df], ignore_index=True)

# Assrt tha


24.7kKiB [00:05, 4.83kKiB/s]                          


File is not empty
Download complete - file saved to /gws/nopw/j04/canari/users/benhutch/dcppA-hindcast/data/tas/FGOALS-f3-L/tas_Amon_FGOALS-f3-L_dcppA-hindcast_s1985-r6i1p1f1_gr_198511-199512.nc


100%|██████████| 1/1 [00:34<00:00, 34.22s/it]


In [25]:
# Assert that all rows in file_exists are True
assert all(files_df_copy['file_exists'] == True), "Not all files have been downloaded"

# Assert that the filepath column doesn't contain any NaNs
assert not any(files_df_copy['filepath'].isna()), "The filepath column contains NaNs"

In [14]:
# Extract all of the file_context into dictionaries
# and then append them to a dataframe
# Initialize an empty list to store the dictionaries
# Create an empty dataframe
file_context_df = pd.DataFrame()


# For each file_context
for file_context in file_context_list:
    # Append the file_context to the dataframe
    file_context_df = file_context_df.append(file_context, ignore_index=True)

  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_context_df = file_context_df.append(file_context, ignore_index=True)
  file_conte

In [None]:
# Python
# Form a list of the unique 'source_id' values from the results
source_id_list = list(set([result.json['source_id'] for result in results]))

# Print the list
print(source_id_list)

In [5]:
# test the function for querying the database
results = query_data_esgf(conn,
                        source_id='E3SM-2-0',
                        experiment_id='hist-aer',
                        variable_id='tas',
                        table_id='Amon',
                        data_node='esgf-data2.llnl.gov',)

# print the len of the results
print(len(results))

# print the type of the results
print(type(results))

# print the results
print(results)

5
<class 'pyesgf.search.results.ResultSet'>
<pyesgf.search.results.ResultSet object at 0x7f87bf91f130>


In [6]:
# Print the details of the first result
print(results[0].json['id'])

CMIP6.DAMIP.E3SM-Project.E3SM-2-0.hist-aer.r2i1p1f1.Amon.tas.gr.v20220906|esgf-data2.llnl.gov


In [7]:
# Extract the file context
# files_list = extract_file_context(results)

# # # Turn the list into a dataframe
# # files_df = pd.DataFrame.from_dict(files_list)

# # files_df

In [8]:
files_list_mt = extract_file_context_multithread(results)

files_list_mt

Extracting file context for 5 datasets...


[{'filename': 'tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_185001-189912.nc',
  'url': 'https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/CMIP6/DAMIP/E3SM-Project/E3SM-2-0/hist-aer/r2i1p1f1/Amon/tas/gr/v20220906/tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_185001-189912.nc'},
 {'filename': 'tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_190001-194912.nc',
  'url': 'https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/CMIP6/DAMIP/E3SM-Project/E3SM-2-0/hist-aer/r2i1p1f1/Amon/tas/gr/v20220906/tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_190001-194912.nc'},
 {'filename': 'tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_195001-199912.nc',
  'url': 'https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/CMIP6/DAMIP/E3SM-Project/E3SM-2-0/hist-aer/r2i1p1f1/Amon/tas/gr/v20220906/tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_195001-199912.nc'},
 {'filename': 'tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_200001-201412.nc',
  'url': 'https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/CMIP6/DAMIP/E3SM-Project/

In [9]:
files_list = files_list_mt

In [11]:
# Print the type of the files list
print(type(files_list))

# Extract this into a dataframe
files_df = pd.DataFrame.from_dict(files_list)
files_df

# Assert that all filenames contrain the string "185001" and "202012"
# assert all(files_df['filename'].str.contains('185001')), "Not all filenames contain the string 185001"
# assert all(files_df['filename'].str.contains('202012')), "Not all filenames contain the string 202012"

files_df

<class 'list'>


Unnamed: 0,filename,url
0,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
1,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
2,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_195001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
3,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_200001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
4,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
5,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
6,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_195001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
7,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_200001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
8,tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...
9,tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...


In [12]:
import glob

# We want to verify whether these files exist on JASMIN
damip_dir = "/badc/cmip6/data/CMIP6/DAMIP/"

# Test the function
files_df = check_file_exists_jasmin(files_df, damip_dir)

files_df

E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_185001-189912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_190001-194912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_195001-199912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_200001-201412.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_185001-189912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_190001-194912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_195001-199912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_200001-201412.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_185001-189912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_190001-194912.nc
E3SM-Project
File does not exist for tas_Amon_E3SM-2-0_hist-aer_r3i1p1

Unnamed: 0,filename,url,file_exists
0,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
1,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
2,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_195001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
3,tas_Amon_E3SM-2-0_hist-aer_r2i1p1f1_gr_200001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
4,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
5,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
6,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_195001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
7,tas_Amon_E3SM-2-0_hist-aer_r1i1p1f1_gr_200001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
8,tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_185001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False
9,tas_Amon_E3SM-2-0_hist-aer_r3i1p1f1_gr_190001-...,https://esgf-data2.llnl.gov/thredds/fileServer...,False


In [10]:
# Set up the directory to download to
download_dir = "/gws/nopw/j04/scenario/users/benhutch/DAMIP"

# Set up the variable
variable = 'tas'

# Set up the experiment id
experiment_id = 'hist-aer'

# Set up the model
model = 'CanESM5'

# Set up the directory
download_path = os.path.join(download_dir, experiment_id, 
                             variable, model)

# Print the download path
print(download_path)

# Use the download function to download a single file
download_file(files_df['url'][0], 
              files_df['filename'][0], download_path)

/gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5
Downloading tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r1i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc


53.1kKiB [00:46, 1.13kKiB/s]                          


Downloaded size does not match expected size!
 FYI, the status code was  200


In [11]:
# Download all the files
for i in tqdm(range(len(files_df))):
    download_file(files_df['url'][i], 
                  files_df['filename'][i], download_path)

  0%|          | 0/5 [00:00<?, ?it/s]

Downloading tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r1i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc


53.1kKiB [00:05, 10.5kKiB/s]                          
 20%|██        | 1/5 [00:05<00:23,  5.86s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r4i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc


53.1kKiB [00:02, 22.6kKiB/s]                          
 40%|████      | 2/5 [00:08<00:12,  4.25s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r2i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc


53.1kKiB [00:19, 2.73kKiB/s]                          


Downloaded size does not match expected size!
 FYI, the status code was  200


 60%|██████    | 3/5 [00:29<00:23, 11.58s/it]

Downloading tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r5i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc


53.1kKiB [00:02, 23.0kKiB/s]                          
 80%|████████  | 4/5 [00:32<00:08,  8.18s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r3i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc


53.1kKiB [00:13, 3.82kKiB/s]                          
100%|██████████| 5/5 [00:46<00:00,  9.39s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200



