In [1]:
import requests
import xml.etree.ElementTree as ET
import wget
from pathlib import Path

import urllib # urllib.request, urllib.request.urlretrieve

In [2]:
# try to make downloads in parallel (especially would be useful because some servers are very slow)
import multiprocessing
from itertools import product
# reference: [[https://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments]]

In [3]:
def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    client = requests.session()
    payload = search
    payload["project"] = project
    payload["type"]= "File"
    if local_node:
        payload["distrib"] = "false"
    if use_csrf:
        client.get(server)
        if 'csrftoken' in client.cookies:
            # Django 1.6 and up
            csrftoken = client.cookies['csrftoken']
        else:
            # older versions
            csrftoken = client.cookies['csrf']
        payload["csrfmiddlewaretoken"] = csrftoken

    payload["format"] = format

    offset = 0
    numFound = 10000
    all_files = []
#     files_type = files_type.upper()
    while offset < numFound:
        payload["offset"] = offset
        url_keys = [] 
        for k in payload:
            url_keys += ["{}={}".format(k, payload[k])]

        url = "{}/?{}".format(server, "&".join(url_keys))
#         print(url)
        r = client.get(url)
        r.raise_for_status()
        resp = r.json()["response"]
        numFound = int(resp["numFound"])
        resp = resp["docs"]
        offset += len(resp)
        for d in resp:
            if verbose:
                for k in d:
                    print("{}: {}".format(k,d[k]))
            url = d["url"]
            for f in d["url"]:
                sp = f.split("|")
                if sp[-1] == files_type:
                    all_files.append(sp[0].split(".html")[0])
#                 else:
#                     print(f"other file type: {sp[-1]}")
    return sorted(all_files)

# ORIGIN:
# https://medium.com/pangeo/cmip6-in-the-cloud-five-ways-96b177abe396
# https://nbviewer.jupyter.org/github/pangeo-data/pangeo-cmip6-examples/blob/master/search_and_load_with_esgf_opendap.ipynb
# NOTES:
# files_type: OPENDAP, HTTPServer, GridFTP, Globus

# r.json has three keys: dict_keys(['responseHeader', 'response', 'facet_counts'])
# responseHeader has some info about the search and response, but not really useful info for us.
# facet_counts is a dict of dicts, but they are empty:
#     {'facet_queries': {},
#      'facet_fields': {},
#      'facet_ranges': {},
#      'facet_intervals': {},
#      'facet_heatmaps': {}}
# response is a dict with 4 keys: dict_keys(['numFound', 'start', 'maxScore', 'docs'])
# start and maxScore don't seem to be useful, numFound and docs are both used here; numFOund is just the total number of results
# docs has everything; it's a list; length 10 in my example
# the entries of the list are dicts; each appears to be a search result hit
# 52 entries in them


In [4]:
def available_experiments(MODEL, server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    """Returns a list of all experiments available for MODEL."""
    client = requests.session()
    payload = search
    payload["project"] = project
    payload["type"]= "File"
    if local_node:
        payload["distrib"] = "false"
    if use_csrf:
        client.get(server)
        if 'csrftoken' in client.cookies:
            # Django 1.6 and up
            csrftoken = client.cookies['csrftoken']
        else:
            # older versions
            csrftoken = client.cookies['csrf']
        payload["csrfmiddlewaretoken"] = csrftoken

    payload["format"] = format
    # make this a facet search:
    payload['source_id'] = MODEL
    payload["facets"] = "experiment_id"
    payload["limit"] = 0 # return no results
    all_files = []
    url_keys = [] 
    for k in payload:
        url_keys += ["{}={}".format(k, payload[k])]

    url = "{}/?{}".format(server, "&".join(url_keys))
    r = client.get(url)
    r.raise_for_status()
    j = r.json()
    resp = j["response"]
    numFound = int(resp["numFound"])
    experiments = j['facet_counts']['facet_fields']['experiment_id']
    print(f"Total found items: {numFound}")
    return list(zip(experiments[0:None:2], experiments[1:None:2]))


In [5]:
def parse_result_urls(r):
    """Construct a list of filenames from URLs returned from esgf_search"""
    names = [i.split("/")[-1] for i in r]
    # get URLs of each unique filename
    ndx = [names.index(elem) for elem in set(names)]
    return [r[n] for n in ndx]

In [6]:
def wget_list(r, downloads=None):
    """Download list of URLs into location downloads.
       Default location will be `~/Downloads`.
    """
    if downloads is None:
        dest = Path("~/Downloads").expanduser()
    else:
        dest = Path(downloads)
    for url in r:
        name = url.split("/")[-1]
        print(f"Downloading {name}")
        wget.download(url, str(dest/name))
    print("Downloads finished.")

In [7]:
def get_multi(inlist, location):
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.starmap(wget.download, product(inlist, location))

In [8]:
def wget_needed(r, downloads=None, dry_run=False, parallel=False):
    if downloads is None:
        dest = Path("~/Downloads").expanduser()
    else:
        dest = Path(downloads)
    # this function will check whether we have the file:
    urls = []
    for url in r:
        name = url.split("/")[-1]
        oname = dest/name
        if not oname.is_file():
            print(f"Add {name} to download list.")
            urls.append(url)
        else:
            print(f"File already present: {name} ----> Skipping.")
    # now try to get all the files:
    if not dry_run:
        if parallel:
            get_multi(wget.download, product(urls, str(dest/name)))
        else:
            for u in urls:
                print(f"Going to download {u}")
                wget.download(u, downloads)
    else:
        print("dry run; no downloading")

    print("Downloading complete.")

In [9]:
def wget_needed_tryall(r, downloads=None):
    """Given list of URLs (r), download if the the file is not in specified location (downloads). 
    
    When provided a list with multiple locations for the same file (different servers), will try in order. If a server doesn't provide the file, print a message and move to next URL.
    
    return: the list of downloaded file names.
    
    
    notes
    -----
        Use the urllib package instead of wget in order to get the return code when URL retrieve fails.
        (I think wget uses urllib anyway, so same result.)
    """
    print(f"We begin with a list of {len(r)} URLs.")
    if downloads is None:
        dest = Path("~/Downloads").expanduser()
    else:
        dest = Path(downloads)
    # this function will check whether we have the file:
    downloaded = []
    failed = 0
    for url in r:
        name = url.split("/")[-1]
        oname = dest/name
        if not oname.is_file():
            try:
                urllib.request.urlretrieve(url, oname)
                downloaded.append(name)
            except Exception as e:
                print('\tThe server couldn\'t fulfill the request.')
                print('\tError code: ', e.code)
                failed += 1
#             try:
#                 wget.download(u, downloads)
#                 downloaded.append(url)
#             except:
#                 print("Something happened, moving to next URL.")
#                 continue
        else:
            print(f"File already present: {name} ----> Skipping.")
    print(f"Downloading complete. Was able to complete downloading for {len(downloaded)} files. Failed to retrieve {failed} URLs.")
    return downloaded

In [10]:
def download_variable_from_experiment(experiment, variable, downloads=None, table_id=None, dry_run=False):
    """Specify experiment name and variable name,
       download all of the files found.
       Options: specify download directory, specify MIP table name."""
    opts = {'variable_id': variable, 'experiment_id': experiment, 'latest': True}
    if table_id is not None:
        opts['table_id']=table_id
    results = esgf_search(local_node=False, files_type="HTTPServer", **opts)
    if len(results) > 0:
        print(f"NUMBER OF RESULTS: {len(results)}")
        url_list = parse_result_urls(results)
        if dry_run:
            print("LIST OF URLS: ")
            print(url_list)
        wget_needed(url_list, downloads=downloads, dry_run=dry_run)
    else:
        print("No search results were found.")

In [42]:
esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="HTTPServer", local_node=False, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, variable_id='pr', experiment_id='amip-lwoff', table_id='day')

['http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CFMIP/CNRM-CERFACS/CNRM-CM6-1/amip-lwoff/r1i1p1f2/day/pr/gr/v20190711/pr_day_CNRM-CM6-1_amip-lwoff_r1i1p1f2_gr_19790101-20141231.nc',
 'http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CFMIP/IPSL/IPSL-CM6A-LR/amip-lwoff/r1i1p1f1/day/pr/gr/v20180928/pr_day_IPSL-CM6A-LR_amip-lwoff_r1i1p1f1_gr_19790101-20141231.nc',
 'http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CFMIP/NCAR/CESM2/amip-lwoff/r1i1p1f1/day/pr/gn/v20200210/pr_day_CESM2_amip-lwoff_r1i1p1f1_gn_19790101-19881231.nc',
 'http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CFMIP/NCAR/CESM2/amip-lwoff/r1i1p1f1/day/pr/gn/v20200210/pr_day_CESM2_amip-lwoff_r1i1p1f1_gn_19890101-19981231.nc',
 'http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CFMIP/NCAR/CESM2/amip-lwoff/r1i1p1f1/day/pr/gn/v20200210/pr_day_CESM2_amip-lwoff_r1i1p1f1_gn_19990101-20081231.nc',
 'http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CFMI

In [11]:
# download_variable_from_experiment("aqua-control", "wap", downloads="/Volumes/Glyph6TB/CMIP6", table_id="day")

In [11]:
# result = esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
#                 local_node=False, files_type="HTTPServer", project="CMIP6",
#                 verbose=False, format="application%2Fsolr%2Bjson",
#                 use_csrf=False, variable_id='wap', experiment_id='aqua-control', table_id='day', source_id='IPSL-CM6A-LR')

var_to_download = 'ta'
table = 'Amon'
experiments = ('amip', 'amip-lwoff', 'aqua-control', 'aqua-control-lwoff', 'aqua-4xCO2', 'amip-4xCO2',) # ('amip', 'amip-lwoff', 'aqua-control', 'aqua-control-lwoff')
models_to_download =[("HadGEM3-GC31-LL", "r1i1p1f3"), ("HadGEM3-GC31-LL", "r5i1p1f2"), 
                    ('MRI-ESM2-0', 'r1i1p1f1'),
                    ('CNRM-CM6-1', 'r1i1p1f2'),
                    ('IPSL-CM6A-LR', 'r1i1p1f1'),
                    ('CESM2', 'r1i1p1f1'),
                    ]
# models_to_download = [('MRI-ESM2-0', 'r1i1p1f1')]

downloads="/Volumes/Jedha/CMIP6"

for model, member in models_to_download:
    for ex in experiments:        
        result = esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                        local_node=False, files_type="HTTPServer", project="CMIP6",
                        verbose=False, format="application%2Fsolr%2Bjson",
                        use_csrf=False, variable_id=var_to_download, experiment_id=ex, 
                        table_id=table, source_id=model, variant_label=member)
        print(result)
        print(f"Start trying to download for {model}, {member}, {ex} ... Found {len(result)} URLs to try.")
        if len(result) > 0:
            print(f"NUMBER OF RESULTS: {len(result)}")
            completed = wget_needed_tryall(result, downloads=downloads) # wget_needed(url_list, downloads=downloads, dry_run=False)
        else:
            print("No search results were found.")

['http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/MOHC/HadGEM3-GC31-LL/amip/r1i1p1f3/Amon/ta/gn/v20190617/ta_Amon_HadGEM3-GC31-LL_amip_r1i1p1f3_gn_197901-201412.nc', 'http://esgf-data3.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/MOHC/HadGEM3-GC31-LL/amip/r1i1p1f3/Amon/ta/gn/v20190617/ta_Amon_HadGEM3-GC31-LL_amip_r1i1p1f3_gn_197901-201412.nc', 'http://esgf-data3.diasjp.net/thredds/fileServer/esg_dataroot/CMIP6/CMIP/MOHC/HadGEM3-GC31-LL/amip/r1i1p1f3/Amon/ta/gn/v20190617/ta_Amon_HadGEM3-GC31-LL_amip_r1i1p1f3_gn_197901-201412.nc', 'http://esgf3.dkrz.de/thredds/fileServer/cmip6/CMIP/MOHC/HadGEM3-GC31-LL/amip/r1i1p1f3/Amon/ta/gn/v20190617/ta_Amon_HadGEM3-GC31-LL_amip_r1i1p1f3_gn_197901-201412.nc']
Start trying to download for HadGEM3-GC31-LL, r1i1p1f3, amip ... Found 4 URLs to try.
NUMBER OF RESULTS: 4
We begin with a list of 4 URLs.
File already present: ta_Amon_HadGEM3-GC31-LL_amip_r1i1p1f3_gn_197901-201412.nc ----> Skipping.
File already present: ta_Amon_HadGEM3-GC3

In [None]:
result = esgf_search(files_type="HTTPServer", activity_id='CMIP', table_id='CFmon', variable_id='cltcalipso', experiment_id='amip',
                  institution_id="NCAR", source_id="CESM2")

# NOTES:
# All these are the optional kwargs to the search, and really are just passed to the RESTFul API by constructing a URL string.
# Options include:
# - member_id="r10i1p1f1"
# - source_id="CESM2"
# - institution_id="NCAR"
# - experiment_id='amip'
# - variable_id='cltcalipso'
# - table_id='CFmon'
# - activity_id='CMIP'
# - 
result

In [None]:
parse_result_urls(result)

In [None]:
wget_list(parse_result_urls(result))

In [None]:
cllcalipso = esgf_search(local_node=False, files_type="HTTPServer", table_id='CFmon', variable_id='cllcalipso', experiment_id='amip', latest=True)
parse_result_urls(cllcalipso)
wget_list(parse_result_urls(cllcalipso), downloads="/Volumes/Glyph6TB/CMIP6/")

In [None]:
parse_result_urls(cllcalipso)


In [None]:
diffresult = esgf_search(local_node=False, files_type="HTTPServer", table_id='CFmon', variable_id='cltcalipso', experiment_id='amip', latest=True)
parse_result_urls(diffresult)

In [None]:
wget_list(parse_result_urls(diffresult))

In [None]:
Path('~').expanduser()

In [None]:
Path("~/Downloads").expanduser()

In [None]:
cesm_cfmip_result = esgf_search(local_node=False, files_type="HTTPServer", table_id='CFmon', variable_id='clcalipso', latest=True, institution_id="NCAR")
parse_result_urls(cesm_cfmip_result)

In [None]:
for r in cesm_cfmip_result:
    out = []
    s = r.split("/")
    out.appendprint(s[-7])

In [None]:
set([r.split("/")[-7] for r in cesm_cfmip_result])

In [None]:
esgf_search(local_node=False, files_type="HTTPServer", experiment_id='a4SST', activity_id='CFMIP', table_id='Amon')

In [16]:
import os
SOMEHOST = "https://esgf-data1.llnl.gov"
HOST_UP  = True if os.system("ping -c 5 " + SOMEHOST.strip(";")) == 0 else False
HOST_UP

False

In [17]:
os.system("ping -c 5 " + SOMEHOST)

17408

In [21]:

print(urllib.request.urlopen("http://aims3.llnl.gov").getcode())  # == 200 for serving being alive, apparently

200


In [23]:
rtst = requests.get('http://aims3.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/amip/r10i1p1f1/day/wap/gr/v20181109/wap_day_IPSL-CM6A-LR_amip_r10i1p1f1_gr_19580101-20141231.nc')
print(rtst)

<Response [503]>


In [24]:
dir(rtst)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [27]:
rtst.content

b'<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>503 Service Unavailable</title>\n</head><body>\n<h1>Service Unavailable</h1>\n<p>The server is temporarily unable to service your\nrequest due to maintenance downtime or capacity\nproblems. Please try again later.</p>\n</body></html>\n'

In [28]:
rtst.status_code

503

In [29]:
rtst2 = requests.get('https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/amip/r12i1p1f1/day/wap/gr/v20191121/wap_day_IPSL-CM6A-LR_amip_r12i1p1f1_gr_19580101-20141231.nc')

In [31]:
rtst2.content

b'Error {\n    code = 400;\n    message = "Unrecognized request";\n};\n'

In [38]:
pload = {'project':"CMIP6", 'type':"File", 'format':"application%2Fsolr%2Bjson", 'variable_id':"pr", 
 'experiment_id':"amip-p4K-lwoff", 'table_id':"day", 'source_id':"CESM2"}

In [39]:
r = requests.get("https://esgf-node.llnl.gov/esg-search/search", params=pload)
print(r)

<Response [501]>


In [40]:
r.url

'https://esgf-node.llnl.gov/esg-search/search?project=CMIP6&type=File&format=application%252Fsolr%252Bjson&variable_id=pr&experiment_id=amip-p4K-lwoff&table_id=day&source_id=CESM2'

In [42]:
r.text

'<!doctype html><html lang="en"><head><title>HTTP Status 501 – Not Implemented</title><style type="text/css">h1 {font-family:Tahoma,Arial,sans-serif;color:white;background-color:#525D76;font-size:22px;} h2 {font-family:Tahoma,Arial,sans-serif;color:white;background-color:#525D76;font-size:16px;} h3 {font-family:Tahoma,Arial,sans-serif;color:white;background-color:#525D76;font-size:14px;} body {font-family:Tahoma,Arial,sans-serif;color:black;background-color:white;} b {font-family:Tahoma,Arial,sans-serif;color:white;background-color:#525D76;} p {font-family:Tahoma,Arial,sans-serif;background:white;color:black;font-size:12px;} a {color:black;} a.name {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 501 – Not Implemented</h1><hr class="line" /><p><b>Type</b> Status Report</p><p><b>Message</b> Invalid requested format: application%2Fsolr%2Bjson</p><p><b>Description</b> The server does not support the functionality required to fulfi

In [43]:
pload['format']

'application%2Fsolr%2Bjson'

In [13]:
# result = esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
#                 local_node=False, files_type="HTTPServer", project="CMIP6",
#                 verbose=False, format="application%2Fsolr%2Bjson",
#                 use_csrf=False, variable_id='wap', experiment_id='aqua-control', table_id='day', source_id='IPSL-CM6A-LR')

var_to_download = 'pr'
table = 'day'
experiments = ('aqua-4xCO2', 'aqua-p4K', 'aqua-p4K-lwoff', 'amip-p4K', 'amip-p4K-lwoff', 'amip-4xCO2')
models_to_download = ("HadGEM3-GC31-LL",)

downloads="/Volumes/Jedha/CMIP6"

for model in models_to_download:
    for ex in experiments:        
        result = esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                        local_node=False, files_type="HTTPServer", project="CMIP6",
                        verbose=False, format="application%2Fsolr%2Bjson",
                        use_csrf=False, variable_id=var_to_download, experiment_id=ex, 
                        table_id=table, source_id=model)
#         print(f"Start trying to download for {model}, {member}, {ex} ... Found {len(result)} URLs to try.")
        if len(result) > 0:
            print(f"NUMBER OF RESULTS: {len(result)}")
            completed = wget_needed_tryall(result, downloads=downloads) # wget_needed(url_list, downloads=downloads, dry_run=False)
        else:
            print(f"No search results were found for {model} - {ex}.")

NUMBER OF RESULTS: 3
We begin with a list of 3 URLs.
File already present: pr_day_HadGEM3-GC31-LL_aqua-4xCO2_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
File already present: pr_day_HadGEM3-GC31-LL_aqua-4xCO2_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
File already present: pr_day_HadGEM3-GC31-LL_aqua-4xCO2_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
Downloading complete. Was able to complete downloading for 0 files. Failed to retrieve 0 URLs.
NUMBER OF RESULTS: 3
We begin with a list of 3 URLs.
File already present: pr_day_HadGEM3-GC31-LL_aqua-p4K_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
File already present: pr_day_HadGEM3-GC31-LL_aqua-p4K_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
File already present: pr_day_HadGEM3-GC31-LL_aqua-p4K_r1i1p1f3_gn_19790101-19881230.nc ----> Skipping.
Downloading complete. Was able to complete downloading for 0 files. Failed to retrieve 0 URLs.
NUMBER OF RESULTS: 2
We begin with a list of 2 URLs.
File already present: pr_da