Skip to content
This repository has been archived by the owner on May 12, 2021. It is now read-only.

CLIMATE-316 Add ESGF Download Script to repository #500

Merged
merged 2 commits into from Mar 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 34 additions & 24 deletions examples/esgf_integration_example.py
Expand Up @@ -30,36 +30,46 @@

"""

import ocw.data_source.esgf as esgf
from getpass import getpass
from __future__ import print_function

import ssl
import sys
from getpass import getpass

import ocw.data_source.esgf as esgf


def main():
    """
    An example of using the OCW ESGF library. Connects to an ESGF
    server and downloads a dataset.

    Prompts on the console for an ESGF OpenID and password, loads one
    variable of a fixed dataset through ``esgf.load_dataset`` and prints
    the variable name, the shape of the values and one sample value.
    """
    # NOTE(review): this replaces the ssl module's default HTTPS context
    # factory with the unverified one, which turns off certificate
    # verification for the whole process. It keeps the example short but is
    # insecure; do not copy it into production code.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context

    dataset_id = 'obs4mips.CNES.AVISO.zos.mon.v20110829|esgf-data.jpl.nasa.gov'
    variable = 'zosStderr'

    # raw_input() only exists on Python 2; input() is the Python 3 equivalent.
    if sys.version_info[0] >= 3:
        username = input('Enter your ESGF OpenID:\n')
    else:
        username = raw_input('Enter your ESGF OpenID:\n')

    # getpass() keeps the password from being echoed to the terminal.
    password = getpass(prompt='Enter your ESGF Password:\n')

    # Multiple datasets are returned in a list if the ESGF dataset is
    # divided into multiple files.
    datasets = esgf.load_dataset(dataset_id, variable, username, password)

    # For this example, our dataset is only stored in a single file so
    # we only need to look at the 0-th value in the returned list.
    dataset = datasets[0]

    print('\n--------\n')
    print('Variable: ', dataset.variable)
    print('Shape: ', dataset.values.shape)
    print('A Value: ', dataset.values[100][100][100])


if __name__ == '__main__':
    main()
66 changes: 33 additions & 33 deletions ocw/data_source/esgf.py
Expand Up @@ -16,9 +16,26 @@
# specific language governing permissions and limitations
# under the License.
#
"""
A set of functions to wrap downloading ESGF datasets into an OCW dataset object.

*** Note *** The ESGF data source requires that the user have certain credentials downloaded from
the ESGF. The current version of the module should download these automatically. Older versions of
the library will not download them. The solution is to use the WGET script from the ESGF to download
a test dataset to get the credentials. The data source should then work as expected.

"""
import os
import sys

import requests
from bs4 import BeautifulSoup

import ocw.data_source.local as local
from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
from ocw.esgf.download import download
from ocw.esgf.logon import logon

if sys.version_info[0] >= 3:
from urllib.error import HTTPError
else:
Expand All @@ -27,15 +44,6 @@
# might be around one day
from urllib2 import HTTPError

from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
from ocw.esgf.download import download
from ocw.esgf.logon import logon
from ocw.esgf.search import SearchClient
import ocw.data_source.local as local

from bs4 import BeautifulSoup
import requests


def load_dataset(dataset_id,
variable_name,
Expand All @@ -44,9 +52,8 @@ def load_dataset(dataset_id,
search_url=DEFAULT_ESGF_SEARCH,
elevation_index=0,
name='',
save_path='/tmp',
**additional_constraints):
''' Load an ESGF dataset.
save_path='/tmp'):
""" Load an ESGF dataset.

:param dataset_id: The ESGF ID of the dataset to load.
:type dataset_id: :mod:`string`
Expand Down Expand Up @@ -74,52 +81,45 @@ def load_dataset(dataset_id,
:param save_path: (Optional) Path to where downloaded files should be saved.
:type save_path: :mod:`string`

:param additional_constraints: (Optional) Additional key,value pairs to
pass as constraints to the search wrapper. These can be anything found
on the ESGF metadata page for a dataset.

:returns: A :class:`list` of :class:`dataset.Dataset` containing the
requested dataset. If the dataset is stored in multiple files, each will
be loaded into a separate :class:`dataset.Dataset`.

:raises ValueError: If no dataset can be found for the supplied ID and
variable, or if the requested dataset is a multi-file dataset.
'''
download_data = _get_file_download_data(url=search_url,
dataset_id=dataset_id,
variable=variable_name)
"""
download_data = \
_get_file_download_data(url=search_url, dataset_id=dataset_id, variable=variable_name)

datasets = []

for url, var in download_data:
_download_files([url],
esgf_username,
esgf_password,
download_directory=save_path)
_download_files([url], esgf_username, esgf_password, download_directory=save_path)

file_save_path = os.path.join(save_path, url.split('/')[-1])
datasets.append(local.load_file(file_save_path,
var,
name=name,

datasets.append(local.load_file(file_save_path, var, name=name,
elevation_index=elevation_index))

origin = {
'source': 'esgf',
'dataset_id': dataset_id,
'variable': variable_name
}
for ds in datasets:
ds.origin = origin

for dataset in datasets:
dataset.origin = origin

return datasets


def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
''''''
""""""
url += '?type=File&dataset_id={}&variable={}'
url = url.format(dataset_id, variable)

r = requests.get(url)
xml = BeautifulSoup(r.content, "html.parser")
raw_data = requests.get(url)
xml = BeautifulSoup(raw_data.content, "html.parser")

dont_have_results = not bool(xml.response.result['numfound'])

Expand All @@ -141,7 +141,7 @@ def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):


def _download_files(file_urls, username, password, download_directory='/tmp'):
''''''
""""""
try:
logon(username, password)
except HTTPError:
Expand Down
2 changes: 1 addition & 1 deletion ocw/esgf/constants.py
Expand Up @@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
'''Module containing constant parameters for ESGF RCMES integration.'''
"""Module containing constant parameters for ESGF RCMES integration."""

# default location of ESGF user credentials
ESGF_CREDENTIALS = "~/.esg/credentials.pem"
Expand Down
53 changes: 33 additions & 20 deletions ocw/esgf/download.py
Expand Up @@ -16,12 +16,18 @@
# specific language governing permissions and limitations
# under the License.
#
'''
"""
OCW module to download a file from ESGF.

'''
"""

from __future__ import print_function

import sys
from os.path import expanduser, join

from ocw.esgf.constants import ESGF_CREDENTIALS

if sys.version_info[0] >= 3:
from http.client import HTTPSConnection
from urllib.request import build_opener
Expand All @@ -35,50 +41,57 @@
from urllib2 import build_opener
from urllib2 import HTTPCookieProcessor
from urllib2 import HTTPSHandler
from os.path import expanduser, join

from ocw.esgf.constants import ESGF_CREDENTIALS


class HTTPSClientAuthHandler(HTTPSHandler):
    """
    HTTP handler that transmits an X509 certificate as part of the request
    """

    def __init__(self, key, cert):
        """
        Store the client credentials used for every connection.

        :param key: Path to the private key file.
        :param cert: Path to the certificate file.
        """
        HTTPSHandler.__init__(self)
        self.key = key
        self.cert = cert

    def https_open(self, req):
        """
        Opens the https connection.

        :param req: The https request object.
        :return: The response produced by ``do_open`` for the request.
        """
        # Hand do_open our own connection factory so that each connection
        # is created with the client key/certificate attached.
        return self.do_open(self.getConnection, req)

    def getConnection(self, host, timeout=300):
        """
        Create an HTTPSConnection object.

        :param host: The ESGF server to connect to.
        :param timeout: Connection timeout in seconds.
        :return: An HTTPSConnection configured with the stored key and
            certificate files.
        """
        # NOTE(review): the key_file/cert_file arguments were removed from
        # http.client.HTTPSConnection in Python 3.12; newer interpreters
        # need an ssl.SSLContext with load_cert_chain() instead.
        return HTTPSConnection(host, key_file=self.key, cert_file=self.cert, timeout=timeout)


def download(url, toDirectory="/tmp"):
    """
    Function to download a single file from ESGF.

    The file is saved under the last path component of ``url`` inside
    ``toDirectory``. The request is authenticated with the certificate
    stored at ``ESGF_CREDENTIALS``, which is used as both key and cert.

    :param url: the URL of the file to download
    :param toDirectory: target directory where the file will be written
    """
    # Imported here so the block does not rely on a module-level import.
    import shutil

    # setup HTTP handler
    cert_file = expanduser(ESGF_CREDENTIALS)
    opener = build_opener(HTTPSClientAuthHandler(cert_file, cert_file))
    opener.add_handler(HTTPCookieProcessor())

    local_file_path = join(toDirectory, url.split('/')[-1])
    print("\nDownloading url: %s to local path: %s ..." % (url, local_file_path))

    try:
        # Open the remote side first so a failed request does not leave an
        # empty (or truncated) local file behind.
        web_file = opener.open(url)
        try:
            # The response yields bytes, so the target must be opened in
            # binary mode; text mode raises TypeError on Python 3.
            with open(local_file_path, 'wb') as local_file:
                # Stream in chunks instead of reading the whole payload
                # into memory at once.
                shutil.copyfileobj(web_file, local_file)
        finally:
            web_file.close()
    finally:
        # cleanup
        opener.close()
    print("... done")
16 changes: 8 additions & 8 deletions ocw/esgf/logon.py
Expand Up @@ -16,28 +16,28 @@
# specific language governing permissions and limitations
# under the License.
#
'''
"""
RCMES module to log on to the ESGF.
'''
"""
import os

from pyesgf.logon import LogonManager

from ocw.esgf.constants import JPL_MYPROXY_SERVER_DN, JPL_HOSTNAME
from ocw.esgf.constants import JPL_HOSTNAME, JPL_MYPROXY_SERVER_DN


def logon(openid, password):
    """
    Function to retrieve a short-term X.509 certificate that can be used to authenticate with ESGF.
    The certificate is written in the location ~/.esg/credentials.pem.
    The trusted CA certificates are written in the directory ~/.esg/certificates.

    :param openid: The user's ESGF OpenID.
    :param password: The password for that OpenID.
    :return: The result of ``LogonManager.is_logged_on()`` after the logon attempt.
    """
    # Must configure the DN of the JPL MyProxy server if using a JPL openid
    if JPL_HOSTNAME in openid:
        os.environ['MYPROXY_SERVER_DN'] = JPL_MYPROXY_SERVER_DN

    # Use a single manager for both the logon and the status check, so the
    # logon is attempted only once.
    logon_manager = LogonManager()
    logon_manager.logon_with_openid(openid, password, bootstrap=True)

    return logon_manager.is_logged_on()