A basic THREDDS crawler that generates a flat tree: each crawled catalog URL maps to the list of its child elements (each child records its parent's ID), but the nested tree of the TDS response is not retained.

Runs a GET against each catalog URL and a HEAD (so the actual data files are not downloaded) against the leaf Dataset items.

To run, start a local HTTP server that serves the files in notebooks/xml_samples:

```
python -m SimpleHTTPServer 8082
```

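Note: this doesn't handle every TDS structure (for example, a `urlPath` supplied on a child element rather than as an attribute of the dataset is not handled yet).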


In [76]:
import os
import json
import sys
from lxml import etree

import urlparse
import requests

from datetime import datetime

import logging
# Notebook hack: reload the logging module before configuring it
# (presumably so that re-running this cell does not stack up handlers - confirm).
reload(logging)

logger = logging.getLogger(__name__)
# FileHandler opens its file immediately and fails if the directory is
# missing, so make sure the relative "logs" directory exists first.
if not os.path.isdir("logs"):
    os.makedirs("logs")
# One log file per minute of start time; appended to if it already exists.
handler = logging.FileHandler(filename="logs/tds_%s.log" % datetime.now().strftime('%Y%m%d-%H%M'), mode="a")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

def generate_path(extension):
    '''
    Return the location of a sample file on disk: extension joined
    onto the xml_samples directory.
    '''
    sample_root = 'xml_samples'
    return os.path.join(sample_root, extension)

# NOTE: starting with just the catalog.xml requests
def generate_url(parent_url, rel_path):
    '''
    Build the URL to request for rel_path as referenced from parent_url.

    - If rel_path carries its own scheme it is returned untouched.
    - Otherwise a leading slash is dropped from rel_path and its first
      segment is looked up in the path segments of parent_url:
        * found: the parent path is cut off after the first occurrence
          of that segment and rel_path is joined onto the result;
        * not found: rel_path is joined onto parent_url with any
          'catalog.xml' text removed.
    '''
    logger.debug('Parsing {0}, {1}'.format(parent_url, rel_path))

    # already absolute, nothing to resolve
    if urlparse.urlparse(rel_path).scheme:
        logger.debug('Contained full path {0}'.format(rel_path))
        return rel_path

    parent = urlparse.urlparse(parent_url)

    if rel_path.startswith('/'):
        rel_path = rel_path[1:]
    first_segment = rel_path.split('/')[0]
    parent_segments = parent.path.split('/')

    try:
        overlap = parent_segments.index(first_segment)
    except ValueError:
        # it does not intersect, just combine
        base = parent_url.replace('catalog.xml', '')
    else:
        # urljoin replaces the last segment of the base path, i.e. the
        # overlapping segment itself, with rel_path
        base = urlparse.urlunparse((
            parent.scheme,
            parent.netloc,
            '/'.join(parent_segments[:overlap + 1]),
            None,
            None,
            None
        ))

    new_url = urlparse.urljoin(base, rel_path)
    logger.debug('Generated url = {0}'.format(new_url))
    return new_url

# def request(path):
#     if not os.path.exists(path):
#         return ''
#     with open(path, 'r') as f:
#         text = f.read()
#     return text

def head(url):
    '''
    Send a HEAD request to url and return (status_code, '').

    The second item is always an empty string, mirroring the
    (status_code, content) pair returned by get(). Any exception raised
    by the request is logged and re-raised.
    '''
    try:
        response = requests.head(url)
    except Exception as ex:
        logger.error('head: {0}: {1}'.format(url, ex))
        raise
    return response.status_code, ''

def get(url):
    '''
    Send a GET request to url and return (status_code, content).

    Any exception raised by the request is logged and re-raised.
    '''
    try:
        response = requests.get(url)
    except Exception as ex:
        logger.error('get: {0}: {1}'.format(url, ex))
        raise
    return response.status_code, response.content
    
def parse_xml(text):
    '''
    Parse text as XML and return the root element, or None on failure.

    The parser is configured with remove_blank_text, remove_comments,
    remove_pis and recover all switched on. Callers must handle a None
    return value.
    '''
    parser = etree.XMLParser(
        remove_blank_text=True,
        remove_comments=True,
        recover=True,
        remove_pis=True
    )
    try:
        return etree.fromstring(text, parser=parser)
    except Exception:
        # A bare "except:" would also swallow KeyboardInterrupt/SystemExit;
        # catch ordinary errors only, and leave a trace of what went wrong.
        logger.exception('parse_xml: unable to parse response')
        return None

def extract_element_tag(tag):
    '''
    Strip the '{namespace}' prefix from an element tag.

    Returns whatever follows the last '}' in tag, the whole tag when it
    contains no '}', and '' for an empty or missing tag.
    '''
    return tag.rpartition('}')[2] if tag else ''
    
class Crawl(object):
    '''
    Crawl a THREDDS catalog tree starting from one catalog.xml URL.

    path is the catalog URL to start from. When from_root is true the
    start is first moved to the shortest ancestor catalog.xml URL that
    answers a HEAD request with 200 or 304.

    The constructor runs the crawl once and keeps the result in
    self.tree; crawl() can be called again to repeat it.
    '''

    def __init__(self, path, from_root=False):
        self.path = path
        if from_root:
            root_path = self._find_root_url()
            if root_path != self.path:
                logger.debug('Adjusted for root catalog.xml: {0}'.format(root_path))
                self.path = root_path
        # keep the result instead of throwing it away, so callers do not
        # have to crawl a second time just to get at the tree
        self.tree = self.crawl()

    def _find_root_url(self):
        '''
        before parsing the larger tree, check that the catalog_url
        is the root node - return the shortest url that's good
        (self.path is returned when no candidate responds with 200/304)
        '''

        parts = urlparse.urlparse(self.path)
        route_parts = [r for r in parts.path.split('/') if r and r != 'catalog.xml']

        founds = []
        # try catalog.xml at every level, from the full route down to the
        # server root
        for end in range(len(route_parts), -1, -1):
            route = urlparse.urlunparse(
                (parts.scheme,
                 parts.netloc,
                 '/'.join(route_parts[:end] + ['catalog.xml']),
                 parts.params,
                 parts.query,
                 parts.fragment)
            )
            req = requests.head(route)
            if req.status_code in [200, 304]:
                founds.append(route)

        # a plain min() compares the strings alphabetically, which is not
        # the shortest url; compare by length instead
        return min(founds, key=len) if founds else self.path

    def _get(self, path):
        '''
        GET path, log the outcome and return (status_code, content).
        '''
        status_code, response = get(path)

        logger.info('GET Request {0}: {1}'.format(path, status_code))

        return status_code, response

    def _parse(self, response, path=''):
        '''
        Parse a catalog response body; returns the root element, or None
        when parse_xml could not parse it. path is only used for logging.
        '''
        xml = parse_xml(response)

        logger.debug('parsed file {0}'.format(path))
        return xml

    def crawl(self):
        '''
        Start at self.path, follow the catalogRefs and return the flat
        tree: a dict keyed by catalog URL. Each value holds the GET
        "status" and, for a 200 response, the "children" descriptions
        (one per catalogRef/dataset element found in that catalog).
        '''
        fetched = set()
        to_fetch = [self.path]

        tree = {}

        while to_fetch:

            path = to_fetch.pop()
            if path in fetched:
                logger.debug('Previously crawled today: {0}'.format(path))
                continue

            logger.debug('starting paths to follow: {0} : {1}'.format(len(to_fetch), path))

            status_code, response = self._get(path)
            # requested once, whatever the outcome: do not ask again
            fetched.add(path)

            if status_code != 200:
                # bail but note the error
                logger.debug('Get URL failed: {0} with {1}'.format(path, status_code))
                tree[path] = {"status": status_code}
                continue

            xml = self._parse(response, path)

            logger.debug('Has XML = {0}'.format(xml is not None))

            if xml is None:
                # nothing to walk; record the catalog as childless rather
                # than failing on xml.xpath below
                logger.warning('No parsable XML returned for {0}'.format(path))
                tree[path] = {"status": status_code, "children": []}
                continue

            services = xml.xpath('//*[local-name()="service" and @base != ""]/@*[local-name()="base"]')
            services = [s[:-1] if s.endswith('/') else s for s in services]

            logger.debug('Services found = {0}: {1}'.format(len(services), services))

            elements = xml.xpath('//*[local-name()="catalogRef" or local-name()="dataset"]')
            logger.debug('number of elements found {0}'.format(len(elements)))

            children = []
            for element in elements:
                tag = extract_element_tag(element.tag)

                # ID is read with a default: a missing attribute must not
                # abort the crawl
                logger.debug('{0} element in {1} ({2})'.format(
                    tag.upper(), path, element.attrib.get('ID', '')))

                # and then parse the specific bits
                if tag == 'dataset':
                    elem = Dataset(path, element, services)
                elif tag == 'catalogRef':
                    elem = CatalogRef(path, element)
                else:
                    # the xpath above only selects these two tags
                    continue

                href = elem.description.get('href', '')
                if href and not elem.terminus:
                    new_url = generate_url(path, href)
                    if new_url not in to_fetch:
                        to_fetch.append(new_url)

                children.append(elem.description)

            tree[path] = {"status": status_code, "children": children}

            logger.debug('fetched paths {0} ({1})'.format(len(fetched), len(to_fetch)))

        return tree

class CatalogRef(object):
    '''
    Description of one catalogRef element found in the catalog at path.

    description is the dict built by _parse(); terminus is always False
    for a catalogRef.
    '''

    def __init__(self, path, element):
        self.path = path
        self.element = element

        self.description = self._parse()

        self.terminus = False

    def _parse(self):
        '''
        Build the description dict: 'id', 'href' (the xlink:href
        attribute) and 'type' are always present; 'name' and 'title' are
        added when non-empty; 'parent'/'parent_type' are added when the
        parent element is not the catalog itself and has a non-empty ID.
        '''
        attrs = self.element.attrib

        description = {
            'id': attrs.get('ID', ''),
            'href': attrs.get('{http://www.w3.org/1999/xlink}href', ''),
            'type': extract_element_tag(self.element.tag)
        }

        # optional, only recorded when they carry a value
        for key in ('name', 'title'):
            value = attrs.get(key, '')
            if value:
                description[key] = value

        # link back to the enclosing element unless that is the catalog
        container = self.element.getparent()
        container_tag = extract_element_tag(container.tag)
        if container_tag != 'catalog':
            container_id = container.attrib.get('ID', '')
            if container_id:
                description['parent'] = container_id
                description['parent_type'] = container_tag

        return description

    
class Dataset(object):
    '''
    Description of one dataset element found in the catalog at path.

    services is the list of service base paths collected from the same
    catalog. Building the description sends one HEAD request per
    service when the element has a non-empty urlPath attribute.
    '''

    def __init__(self, path, element, services):
        self.path = path
        self.element = element
        self.services = services

        self.description = self._parse()

    def _parse(self):
        '''
        Build the description dict and set self.terminus.

        terminus is True when the element has a urlPath attribute.
        'id' is always present; 'urls' (a list of (url, status_code)
        tuples), 'name', 'date' and 'parent'/'parent_type' are added
        only when they have a value.
        '''
        attrs = self.element.attrib
        name = attrs.get('name', '')
        dataset_id = attrs.get('ID', '')
        url_path = attrs.get('urlPath', None)
        # TODO: deal with the urlPath as child element

        # the enclosing element, ignored when it is the catalog itself
        container = self.element.getparent()
        container_tag = extract_element_tag(container.tag)
        if container_tag == 'catalog':
            container_id = ''
        else:
            container_id = container.attrib.get('ID', '')

        self.terminus = url_path is not None

        urls = []
        if url_path:
            candidates = [
                generate_url(self.path, '/'.join([service, url_path]))
                for service in self.services
                if service
            ]
            # HEAD only - does the endpoint exist? These urls are not
            # followed, so the (url, status) pair is all that is kept.
            for candidate in candidates:
                status_code, _ = head(candidate)
                logger.info('HEAD Request {0}: {1}'.format(candidate, status_code))
                urls.append((candidate, status_code))

        # text of the first direct child named "date", if there is one
        dates = self.element.xpath('*[local-name()="date"]/text()')
        date = dates[0] if dates else ''

        description = {
            'id': dataset_id
        }
        for key, value in (('urls', urls), ('name', name), ('date', date)):
            if value:
                description[key] = value

        if container_id:
            description['parent'] = container_id
            description['parent_type'] = container_tag

        return description
  
    

In [75]:
# crawler = Crawl('catalog.xml')
# crawler = Crawl('TSData/catalog.xml')

# crawling from a local file store (the xml_samples set)
# via python -m SimpleHTTPServer 8082

# NOTE: the Crawl constructor already calls crawl() itself.
crawler = Crawl('http://localhost:8082/thredds/catalog/catalog.xml')

# This explicit call repeats the crawl and captures the returned tree
# (a dict keyed by catalog url).
tree = crawler.crawl()

print '***************'
print 'crawl complete:'

# last expression of the cell: the notebook displays the tree
tree

***************
crawl complete:


{'http://localhost:8082/thredds/catalog/TSdata/ARGO_MERCHANT/catalog.xml': {'children': [{'id': 'TSdata/ARGO_MERCHANT',
    'name': 'ARGO_MERCHANT'},
   {'date': '2014-12-02T20:32:46Z',
    'id': 'TSdata/ARGO_MERCHANT/1211-A1H.cdf',
    'name': '1211-A1H.cdf',
    'parent': 'TSdata/ARGO_MERCHANT',
    'parent_type': 'dataset',
    'urls': [('http://localhost:8082/thredds/dodsC/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/fileServer/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/wcs/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/ncss/grid/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/wms/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/iso/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:8082/thredds/ncml/TSdata/ARGO_MERCHANT/1211-A1H.cdf',
      404),
     ('http://localhost:

In [73]:
# Same crawl, pointed at the opendap/hyrax sample catalog on the local server.
# TODO: finish putting together test set (this fails because of a get 404)
# (the recorded output below shows the 404 for .../hyrax/Catalog/catalog.xml)
crawler = Crawl('http://localhost:8082/opendap/hyrax/catalog.xml')
tree = crawler.crawl()

# last expression of the cell: the notebook displays the tree
tree

{'http://localhost:8082/opendap/hyrax/Catalog/catalog.xml': {'status': 404},
 'http://localhost:8082/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3zref.008/catalog.xml': {'children': [{'id': '/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3zref.008/',
    'name': '/Nimbus7_TOMS_Level3/TOMSN7L3zref.008'},
   {'date': '2011-02-24T21:04:34',
    'id': '/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3zref.008/TOMSN7L3zref_19781101_19930506.tar.gz',
    'name': 'TOMSN7L3zref_19781101_19930506.tar.gz',
    'parent': '/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3zref.008/',
    'parent_type': 'dataset'}],
  'status': 200},
 'http://localhost:8082/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3ztoz.008/catalog.xml': {'children': [{'id': '/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3ztoz.008/',
    'name': '/Nimbus7_TOMS_Level3/TOMSN7L3ztoz.008'},
   {'date': '2011-02-24T21:04:34',
    'id': '/opendap/hyrax/Nimbus7_TOMS_Level3/TOMSN7L3ztoz.008/TOMSN7L3ztoz_19781101_19930506.tar.gz',
    'name': 'TOMSN7L3ztoz_19781101

Running against XML on disk with very basic classes

In [5]:
# Work list of catalog paths (relative to xml_samples) still to be parsed,
# seeded with the top-level catalog.
hrefs_to_parse = ['catalog.xml']
# On-disk paths that have been parsed so far.
hrefs_parsed = []

# Flat list of catref/dataset records collected by the loop below.
tree = []

class catref():
    '''
    Minimal record for a catalogRef: its own id, the id of its parent
    and the relative path it points to.
    '''

    def __init__(self, the_id, parent_id, rel_path):
        self.the_id, self.parent_id, self.rel_path = the_id, parent_id, rel_path

    def __repr__(self):
        template = '<CatRef {0} (parent: {1}, relative_path: {2})>'
        return template.format(self.the_id, self.parent_id, self.rel_path)
        
class dataset():
    '''
    Minimal record for a dataset: its own id, the id of its parent, its
    relative path and whether it is a terminus (a leaf with a urlPath).
    '''

    def __init__(self, the_id, parent_id, rel_path, terminus):
        self.the_id, self.parent_id = the_id, parent_id
        self.rel_path, self.terminus = rel_path, terminus

    def __repr__(self):
        template = '<Dataset {0} (parent: {1}, relative_path: {2}, terminus: {3})>'
        fields = (self.the_id, self.parent_id, self.rel_path, self.terminus)
        return template.format(*fields)

# Walk the catalogs stored on disk, starting from the seed in hrefs_to_parse:
# every catalogRef/dataset element that yields a path becomes a catref/dataset
# record in `tree`, and every non-terminus path is queued for parsing in turn.
while hrefs_to_parse:
    href_path = hrefs_to_parse.pop()
    path = generate_path(href_path)

    print '## PARSING : {0} from {1}'.format(path, href_path)

    # NOTE(review): the only `request` helper visible in this notebook is
    # commented out in the first cell; it has to be defined for this cell to run.
    response = request(path)
    xml = parse_xml(response)

    # directory part of the on-disk path, without its first component
    # (xml_samples) and without the file name
    prefix = '/'.join(path.split('/')[1:-1])

    elements = xml.xpath('//*[local-name()="catalogRef" or local-name()="dataset"]')

    print '\thas {0} elements'.format(len(elements))

    for element in elements:
        tag = extract_element_tag(element.tag)

        # a catalogRef points onwards through xlink:href, a dataset through urlPath
        if tag == 'catalogRef':
            new_path = element.attrib.get('{http://www.w3.org/1999/xlink}href', '')
        elif tag == 'dataset':
            new_path = element.attrib.get('urlPath', '')

        the_id = element.attrib.get('ID', '')

        # only a dataset that carries a urlPath is treated as a leaf
        terminus = tag == 'dataset' and 'urlPath' in element.attrib

        if not new_path:
            # e.g. an element with only name and ID attributes (see the output below)
            print '\t!!!! Failed to extract path', element.attrib.keys()
            continue

        # make the path relative to the current catalog's directory
        # NOTE(review): for datasets the recorded output shows the prefix twice
        # (TSdata/BARNEGAT/TSdata/BARNEGAT/...), so the urlPath apparently
        # already contains the directory - confirm before relying on rel_path.
        new_path = new_path.replace('/thredds/catalog/', '')
        new_path = '/'.join([prefix, new_path])
        # an empty prefix leaves a leading slash behind
        if new_path.startswith('/'):
            new_path = new_path[1:]

        if not terminus:
            hrefs_to_parse.append(new_path)
        else:
            print '\t--------Terminus : {0}'.format(new_path)

        # the parent recorded for each element is the catalog it was found in
        if tag == 'catalogRef':
            tree.append(catref(the_id, href_path, new_path))
        elif tag == 'dataset':
            tree.append(dataset(the_id, href_path, new_path, terminus))

    hrefs_parsed.append(path)
    print '## Left to parse : {0}'.format(len(hrefs_to_parse))
    
    


## PARSING : xml_samples/catalog.xml from catalog.xml
	has 1 elements
## Left to parse : 1
## PARSING : xml_samples/TSdata/catalog.xml from TSdata/catalog.xml
	has 3 elements
	!!!! Failed to extract path ['name', 'ID']
## Left to parse : 2
## PARSING : xml_samples/TSdata/BARNEGAT/catalog.xml from TSdata/BARNEGAT/catalog.xml
	has 30 elements
	!!!! Failed to extract path ['name', 'ID']
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9611ecp-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9612solot-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9613HRaqd-cal.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9614dw-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9615exo-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9616ecp-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9617solot-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9621dw-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9622ecn-a.nc
	--------Terminus : TS

In [6]:
# display the flat list of catref/dataset records built by the loop above
tree

[<CatRef TSdata (parent: catalog.xml, relative_path: TSdata/catalog.xml)>,
 <CatRef TSdata/ARGO_MERCHANT (parent: TSdata/catalog.xml, relative_path: TSdata/ARGO_MERCHANT/catalog.xml)>,
 <CatRef TSdata/BARNEGAT (parent: TSdata/catalog.xml, relative_path: TSdata/BARNEGAT/catalog.xml)>,
 <Dataset TSdata/BARNEGAT/9611ecp-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9611ecp-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9612solot-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9612solot-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9613HRaqd-cal.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9613HRaqd-cal.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9614dw-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9614dw-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9615exo-a.nc (parent: TSdata/BARNEGAT/catalog.xml, rela