In [1]:
# Testing the downloading data method from:
# https://claut.gitlab.io/man_ccia/lab2.html
# Import the required modules
from pyesgf.search import SearchConnection
import os
import pandas as pd
import requests
from tqdm import tqdm

# Set the os environment to on
os.environ['ESGF_PYCLIENT_NO_FACETS_STAR_WARNING'] = "on"

In [2]:
# Import the functions
from testing_download_functions import query_data_esgf, extract_file_context, \
                                        download_file, extract_file_context_multithread, \
                                        check_file_exists_jasmin

In [3]:
# set the search connection
# to the LLNL search node
conn = SearchConnection('https://esgf-node.llnl.gov/esg-search', distrib=True)

In [4]:
print(type(conn))

<class 'pyesgf.search.connection.SearchConnection'>


In [5]:
# test the function for querying the database
results = query_data_esgf(conn,
                        source_id='CanESM5',
                        experiment_id='hist-aer',
                        variable_id='tas',
                        table_id='Amon',
                        member_id='r1i1p1f1, r2i1p1f1, r3i1p1f1, r4i1p1f1, r5i1p1f1, r15i1p1f1, r14i1p1f1, r15i1p2f1, r14i1p2f1'
                        data_node='crd-esgf-drc.ec.gc.ca',)

# print the len of the results
print(len(results))

# print the type of the results
print(type(results))

# print the results
print(results)

5
<class 'pyesgf.search.results.ResultSet'>
<pyesgf.search.results.ResultSet object at 0x7f200941ba60>


In [6]:
# Print the details of the first result
print(results[0].json['id'])

CMIP6.DAMIP.CCCma.CanESM5.hist-aer.r1i1p1f1.Amon.tas.gn.v20190429|crd-esgf-drc.ec.gc.ca


In [7]:
# # Use the function to extract the file context
# files_list_mt = extract_file_context_multithread(results)

# # Turn the list into a dataframe
# files_df_mt = pd.DataFrame.from_dict(files_list_mt)

# files_df_mt

In [8]:
# Extract the file context
files_list = extract_file_context(results)

# # Turn the list into a dataframe
# files_df = pd.DataFrame.from_dict(files_list)

# files_df

Extracting file context for 5 datasets...


In [9]:
# Print the type of the files list
print(type(files_list))

# Extract this into a dataframe
files_df = pd.DataFrame.from_dict(files_list)
files_df

# Assert that all filenames contrain the string "185001" and "202012"
assert all(files_df['filename'].str.contains('185001')), "Not all filenames contain the string 185001"
assert all(files_df['filename'].str.contains('202012')), "Not all filenames contain the string 202012"

files_df

<class 'list'>


Unnamed: 0,filename,url
0,tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...
1,tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...
2,tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...
3,tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...
4,tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...


In [16]:
import glob

# We want to verify whether these files exist on JASMIN
damip_dir = "/badc/cmip6/data/CMIP6/DAMIP/"

# Create a new column in the dataframe to store whether the file exists
files_df['file_exists'] = False

# Loop over the dataframe
for i in range(len(files_df)):
    # Get the filename
    filename = files_df['filename'][i]

    # Split the filename by _
    filename_split = filename.split('_')

    # Extract the variable name
    variable_name = filename_split[0]

    # Extract the time window
    time_window = filename_split[1]

    # Extract the model name
    model_name = filename_split[2]

    # Split the url by /
    # and extract the last 7th element
    # which is the directory name
    url_split = files_df['url'][i].split('/')
    model_group = url_split[9]

    print(model_group)

    # Extract the experiment name
    experiment_name = filename_split[3]

    # Extract the ensemble member name
    ensemble_member_name = filename_split[4]

    # Extract the grid name
    grid_name = filename_split[5]

    # Form the pattern
    pattern = os.path.join(damip_dir, model_group, model_name, experiment_name,
                                ensemble_member_name, time_window, variable_name,
                                grid_name, "files", "d*", filename)
    
    # Print the pattern
    print(pattern)

    # Get a list of all paths that match the pattern
    filepaths = glob.glob(pattern)

    # If filepath is greater than 0, the file exists
    if len(filepaths) > 0:
        print("File exists for " + filename)
        files_df.loc[i, 'file_exists'] = True
    elif len(filepaths) == 0:
        print("File does not exist for " + filename)
        files_df.loc[i, 'file_exists'] = False
    elif len(filepaths) > 1:
        print("More than one file found for " + filename)
        AssertionError("More than one file found for " + filename)
    else:
        print("Something went wrong with " + filename)
        AssertionError("Something went wrong with " + filename)

files_df

CCCma
/badc/cmip6/data/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r1i1p1f1/Amon/tas/gn/files/d*/tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
File exists for tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
CCCma
/badc/cmip6/data/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r4i1p1f1/Amon/tas/gn/files/d*/tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc
File exists for tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc
CCCma
/badc/cmip6/data/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r2i1p1f1/Amon/tas/gn/files/d*/tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc
File exists for tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc
CCCma
/badc/cmip6/data/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r5i1p1f1/Amon/tas/gn/files/d*/tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc
File exists for tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc
CCCma
/badc/cmip6/data/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r3i1p1f1/Amon/tas/gn/files/d*/tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc
F

Unnamed: 0,filename,url,file_exists
0,tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...,True
1,tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...,True
2,tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...,True
3,tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...,True
4,tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-2...,http://crd-esgf-drc.ec.gc.ca/thredds/fileServe...,True


In [10]:
# Set up the directory to download to
download_dir = "/gws/nopw/j04/scenario/users/benhutch/DAMIP"

# Set up the variable
variable = 'tas'

# Set up the experiment id
experiment_id = 'hist-aer'

# Set up the model
model = 'CanESM5'

# Set up the directory
download_path = os.path.join(download_dir, experiment_id, 
                             variable, model)

# Print the download path
print(download_path)

# Use the download function to download a single file
download_file(files_df['url'][0], 
              files_df['filename'][0], download_path)

/gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5
Downloading tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r1i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc


53.1kKiB [00:46, 1.13kKiB/s]                          


Downloaded size does not match expected size!
 FYI, the status code was  200


In [11]:
# Download all the files
for i in tqdm(range(len(files_df))):
    download_file(files_df['url'][i], 
                  files_df['filename'][i], download_path)

  0%|          | 0/5 [00:00<?, ?it/s]

Downloading tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r1i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r1i1p1f1_gn_185001-202012.nc


53.1kKiB [00:05, 10.5kKiB/s]                          
 20%|██        | 1/5 [00:05<00:23,  5.86s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r4i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r4i1p1f1_gn_185001-202012.nc


53.1kKiB [00:02, 22.6kKiB/s]                          
 40%|████      | 2/5 [00:08<00:12,  4.25s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r2i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r2i1p1f1_gn_185001-202012.nc


53.1kKiB [00:19, 2.73kKiB/s]                          


Downloaded size does not match expected size!
 FYI, the status code was  200


 60%|██████    | 3/5 [00:29<00:23, 11.58s/it]

Downloading tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r5i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r5i1p1f1_gn_185001-202012.nc


53.1kKiB [00:02, 23.0kKiB/s]                          
 80%|████████  | 4/5 [00:32<00:08,  8.18s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200
Downloading tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc from http://crd-esgf-drc.ec.gc.ca/thredds/fileServer/esgE_dataroot/AR6/CMIP6/DAMIP/CCCma/CanESM5/hist-aer/r3i1p1f1/Amon/tas/gn/v20190429/tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc
Saving to /gws/nopw/j04/scenario/users/benhutch/DAMIP/hist-aer/tas/CanESM5tas_Amon_CanESM5_hist-aer_r3i1p1f1_gn_185001-202012.nc


53.1kKiB [00:13, 3.82kKiB/s]                          
100%|██████████| 5/5 [00:46<00:00,  9.39s/it]

Downloaded size does not match expected size!
 FYI, the status code was  200



