Recursion testing using local files instead of URL requests

(can we manage the catalogRef vs dataset links regardless?)

In [2]:
import os
import json
import sys
from lxml import etree

import urlparse
import requests

from datetime import datetime

import logging
reload(logging)

logger = logging.getLogger(__name__)
handler = logging.FileHandler(filename="tds_%s.log" % datetime.now().strftime('%Y%m%d-%H%M'), mode="a")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

def generate_path(extension):
    return os.path.join('xml_samples', extension)

# NOTE: starting with just the catalog.xml requests
def generate_url(parent_url, rel_path):
    parts = urlparse.urlparse(parent_url)
    
    return urlparse.urlunparse((
        parts.scheme,
        parts.netloc,
        parts.path.replace('catalog.xml', rel_path),
        None,
        None,
        None
    ))

# def request(path):
#     if not os.path.exists(path):
#         return ''
#     with open(path, 'r') as f:
#         text = f.read()
#     return text

def request(url, is_terminus):
    # return the status code and content if get
    if is_terminus:
        try:
            req = requests.head(url)
            return req.status_code, ''
        except:
            logger.error('head: {0}: {1}'.format(url, ex))
            raise
    
    else:
        try:
            req = requests.get(url)
            return req.status_code, req.content
        except:
            logger.error('get: {0}: {1}'.format(url, ex))
            raise
    
def parse_xml(text):
    try:
        return etree.fromstring(text)
    except:
        return None

def extract_element_tag(tag):
    if not tag:
        return ''
    return tag.split('}')[-1]    
    
class Crawl(object):
    def __init__(self, path):
        self.path = path
        self.crawl()
    
    def _parse(self, path):
        # parse the xml
        url = generate_path(path)
        response = request(url)
        xml = parse_xml(response)
        
        logger.debug('parsed file {0}'.format(path))
        return xml
    
    def crawl(self):
        # start at the path, iterate through the catalogRefs
        # and generate the tree of links
        
        fetched = []
        to_fetch = [self.path]
        
        tree = {}
        
        while to_fetch:
            
            path = to_fetch.pop()
            logger.debug('starting paths to follow: {0} : {1}'.format(len(to_fetch), path))
            
            xml = self._parse(path)
            
            services = xml.xpath('//*[local-name()="service" and @base != ""]')
            
            elements = xml.xpath('//*[local-name()="catalogRef" or local-name()="dataset"]')
            logger.debug('number of elements found {0}'.format(len(elements)))
            
            for element in elements:
                tag = extract_element_tag(element.tag)
                
                logger.debug('{0} element in {1} ({2})'.format(tag.upper(), path, element.attrib['ID']))
                
                # and then parse the specific bits
                if tag == 'dataset':
                    elem = Dataset(path, element, services)
                elif tag == 'catalogRef':
                    elem = CatalogRef(path, element)
                
                # look for children links for any element
                new_rel_path = element.attrib.get('{http://www.w3.org/1999/xlink}href', '') if tag == 'catalogRef' 
                    else element.attrib.get('urlPath', '')
                
                if not new_rel_path:
                    logger.warning('no relative path')
                    continue
                
                new_url = generate_url(path, new_rel_path)
   
                # local_tree['children'].append(elem.description)
                to_fetch = list(set(to_fetch + elem.follows))
                
                logger.debug('adding paths to follow: {0} ({1})'.format(len(to_fetch), len(elem.follows)))

            #tree[path] = local_tree
            fetched.append(path)
            
            logger.debug('fetched paths {0} ({1})'.format(len(fetched), len(to_fetch)))
        
        return tree

class CatalogRef(object):
    def __init__(self, path, element):
        self.path = path
        self.element = element
        
        self.description, self.follows = self._parse()
        
    def _parse(self):
        # parse the xml
        
        name = self.element.attrib.get('name', '')
        cat_id = self.element.attrib.get('ID', '')
        title = self.element.attrib.get('title', '')
        href = self.element.attrib.get('{http://www.w3.org/1999/xlink}href', '')
        tag = extract_element_tag(self.element.tag)
        
        # get the parent 
        parent = self.element.getparent()
        parent_tag = extract_element_tag(parent.tag)
        parent_id = parent.attrib.get('ID', '') if parent_tag != 'catalog' else ''
        
        # nested catalogRefs
        # TODO: why is this empty and 
        follows = self.element.xpath('*[local-name()="catalogRef"]')
        follows = [f.attrib.get('{http://www.w3.org/1999/xlink}href') for f in follows]
        
        logger.debug('{0} has {1} links to follow'.format(cat_id, len(follows)))
        
        description = {
            'id': cat_id,
            'href': href,
            'type': tag
        }
        if name:
            description.update({'name': name})
        if title:
            description.update({'title': title})
        if parent_id:
            description.update({'parent': parent_id, 'parent_type': parent_tag})
               
        return description, follows
        
        
class Dataset(object):
    # where services is the list of bases for this example
    def __init__(self, path, element, services):
        self.path = path
        self.element = element
        self.services = services
        
        self.description, self.follows = self._parse()
    
    def _parse(self):
        # parse the xml
        name = self.element.attrib.get('name', '')
        dataset_id = self.element.attrib.get('ID', '')
        href = self.element.attrib.get('urlPath', '')
        
        # get the parent 
        parent = self.element.getparent()
        parent_tag = extract_element_tag(parent.tag)
        parent_id = parent.attrib.get('ID', '') if parent_tag != 'catalog' else ''
        
        # nested catalogRefs
        follows = self.element.xpath('*[local-name()="catalogRef"]/@href')
        
        logger.debug('{0} has {1} links to follow'.format(dataset_id, len(follows)))
        
        hrefs= []
        if href:
            # do something about the services
            hrefs = ['/'.join([service, href]) for service in services]
        
        date = next(iter(self.element.xpath('*[local-name()="date"]/text()')), '')
        
        description = {
            'id': dataset_id
        }
        if hrefs:
            description.update({'hrefs': hrefs})
        if name:
            description.update({'name': name})
        if date:
            description.update({'date': date})
            
        if parent_id:
            description.update({'parent': parent_id, 'parent_type': parent_tag})
            
        return description, follows
  
    

In [81]:
# crawler = Crawl('catalog.xml')
crawler = Crawl('TSData/catalog.xml')

tree = crawler.crawl()

print '***************'
print 'crawl complete:'


2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,473 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015

DEBUG:crawler:starting paths to follow: 0 : TSData/catalog.xml


2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,488 -

DEBUG:crawler:parsed file TSData/catalog.xml


2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,495 

DEBUG:crawler:number of elements found 3


2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,515 - [D

DEBUG:crawler:DATASET element in TSData/catalog.xml (TSdata)


2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope
2015-04-16 09:47:09,530 - [DEBUG] nope


DEBUG:crawler:nope


2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,542 - [DEBUG] TSdata has 0 links to

DEBUG:crawler:TSdata has 0 links to follow


2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,551 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,561 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,

DEBUG:crawler:CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)


2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,572 - [DEBUG] ARGO_M

DEBUG:crawler:ARGO_MERCHANT/catalog.xml


2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,581 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow

DEBUG:crawler:TSdata/ARGO_MERCHANT has 0 links to follow


2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,592 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,606 - [DEBUG] CATALOGREF element in TSData/catalog

DEBUG:crawler:CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)


2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,615 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16

DEBUG:crawler:BARNEGAT/catalog.xml


2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,623 - [DEBUG] TSdata/BARNEGAT has 0 links to

DEBUG:crawler:TSdata/BARNEGAT has 0 links to follow


2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,630 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,638 - [D

DEBUG:crawler:fetched paths 1 (0)


2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015-04-16 09:47:09,646 - [DEBUG] starting paths to follow: 0 : TSData/catalog.xml
2015

DEBUG:crawler:starting paths to follow: 0 : TSData/catalog.xml


2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 - [DEBUG] parsed file TSData/catalog.xml
2015-04-16 09:47:09,654 -

DEBUG:crawler:parsed file TSData/catalog.xml


2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 - [DEBUG] number of elements found 3
2015-04-16 09:47:09,661 

DEBUG:crawler:number of elements found 3


2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [DEBUG] DATASET element in TSData/catalog.xml (TSdata)
2015-04-16 09:47:09,669 - [D

DEBUG:crawler:DATASET element in TSData/catalog.xml (TSdata)


2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope
2015-04-16 09:47:09,675 - [DEBUG] nope


DEBUG:crawler:nope


2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to follow
2015-04-16 09:47:09,682 - [DEBUG] TSdata has 0 links to

DEBUG:crawler:TSdata has 0 links to follow


2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,689 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,697 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)
2015-04-16 09:47:09,

DEBUG:crawler:CATALOGREF element in TSData/catalog.xml (TSdata/ARGO_MERCHANT)


2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_MERCHANT/catalog.xml
2015-04-16 09:47:09,704 - [DEBUG] ARGO_M

DEBUG:crawler:ARGO_MERCHANT/catalog.xml


2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow
2015-04-16 09:47:09,713 - [DEBUG] TSdata/ARGO_MERCHANT has 0 links to follow

DEBUG:crawler:TSdata/ARGO_MERCHANT has 0 links to follow


2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,724 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)
2015-04-16 09:47:09,735 - [DEBUG] CATALOGREF element in TSData/catalog

DEBUG:crawler:CATALOGREF element in TSData/catalog.xml (TSdata/BARNEGAT)


2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16 09:47:09,742 - [DEBUG] BARNEGAT/catalog.xml
2015-04-16

DEBUG:crawler:BARNEGAT/catalog.xml


2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to follow
2015-04-16 09:47:09,748 - [DEBUG] TSdata/BARNEGAT has 0 links to

DEBUG:crawler:TSdata/BARNEGAT has 0 links to follow


2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding paths to follow: 0 (0)
2015-04-16 09:47:09,754 - [DEBUG] adding

DEBUG:crawler:adding paths to follow: 0 (0)


2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [DEBUG] fetched paths 1 (0)
2015-04-16 09:47:09,762 - [D

DEBUG:crawler:fetched paths 1 (0)


***************
crawl complete:


In [82]:
print json.dumps(tree, indent=4)

{
    "TSData/catalog.xml": {
        "children": [
            {
                "id": "TSdata", 
                "name": "Times Series Data"
            }, 
            {
                "parent_type": "dataset", 
                "href": "ARGO_MERCHANT/catalog.xml", 
                "type": "catalogRef", 
                "id": "TSdata/ARGO_MERCHANT", 
                "parent": "TSdata"
            }, 
            {
                "parent_type": "dataset", 
                "href": "BARNEGAT/catalog.xml", 
                "type": "catalogRef", 
                "id": "TSdata/BARNEGAT", 
                "parent": "TSdata"
            }
        ]
    }
}


In [5]:
hrefs_to_parse = ['catalog.xml']
hrefs_parsed = []

tree = []

class catref():
    def __init__(self, the_id, parent_id, rel_path):
        self.the_id = the_id
        self.parent_id = parent_id
        self.rel_path = rel_path

    def __repr__(self):
        return '<CatRef {0} (parent: {1}, relative_path: {2})>'.format(self.the_id, self.parent_id, self.rel_path)
        
class dataset():
    def __init__(self, the_id, parent_id, rel_path, terminus):
        self.the_id = the_id
        self.parent_id = parent_id
        self.rel_path = rel_path
        self.terminus = terminus

    def __repr__(self):
        return '<Dataset {0} (parent: {1}, relative_path: {2}, terminus: {3})>'.format(
            self.the_id, self.parent_id, self.rel_path, self.terminus
        )

while hrefs_to_parse:
    href_path = hrefs_to_parse.pop()
    path = generate_path(href_path)
    
    print '## PARSING : {0} from {1}'.format(path, href_path)
    
    response = request(path)
    xml = parse_xml(response)

    prefix = '/'.join(path.split('/')[1:-1])
    
    elements = xml.xpath('//*[local-name()="catalogRef" or local-name()="dataset"]')
    
    print '\thas {0} elements'.format(len(elements))
    
    for element in elements:
        tag = extract_element_tag(element.tag)
        
        if tag == 'catalogRef':
            new_path = element.attrib.get('{http://www.w3.org/1999/xlink}href', '')
        elif tag == 'dataset':
            new_path = element.attrib.get('urlPath', '')
            
        the_id = element.attrib.get('ID', '')
        
        terminus = tag == 'dataset' and 'urlPath' in element.attrib
        
        if not new_path:
            print '\t!!!! Failed to extract path', element.attrib.keys()
            continue
            
        new_path = new_path.replace('/thredds/catalog/', '')
        new_path = '/'.join([prefix, new_path])
        if new_path.startswith('/'):
            new_path = new_path[1:]
        
        if not terminus:
            hrefs_to_parse.append(new_path)
        else:
            print '\t--------Terminus : {0}'.format(new_path)
            
        if tag == 'catalogRef':
            tree.append(catref(the_id, href_path, new_path))
        elif tag == 'dataset':
            tree.append(dataset(the_id, href_path, new_path, terminus))

    hrefs_parsed.append(path)
    print '## Left to parse : {0}'.format(len(hrefs_to_parse))
    
    


## PARSING : xml_samples/catalog.xml from catalog.xml
	has 1 elements
## Left to parse : 1
## PARSING : xml_samples/TSdata/catalog.xml from TSdata/catalog.xml
	has 3 elements
	!!!! Failed to extract path ['name', 'ID']
## Left to parse : 2
## PARSING : xml_samples/TSdata/BARNEGAT/catalog.xml from TSdata/BARNEGAT/catalog.xml
	has 30 elements
	!!!! Failed to extract path ['name', 'ID']
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9611ecp-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9612solot-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9613HRaqd-cal.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9614dw-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9615exo-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9616ecp-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9617solot-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9621dw-a.nc
	--------Terminus : TSdata/BARNEGAT/TSdata/BARNEGAT/9622ecn-a.nc
	--------Terminus : TS

In [6]:
tree

[<CatRef TSdata (parent: catalog.xml, relative_path: TSdata/catalog.xml)>,
 <CatRef TSdata/ARGO_MERCHANT (parent: TSdata/catalog.xml, relative_path: TSdata/ARGO_MERCHANT/catalog.xml)>,
 <CatRef TSdata/BARNEGAT (parent: TSdata/catalog.xml, relative_path: TSdata/BARNEGAT/catalog.xml)>,
 <Dataset TSdata/BARNEGAT/9611ecp-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9611ecp-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9612solot-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9612solot-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9613HRaqd-cal.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9613HRaqd-cal.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9614dw-a.nc (parent: TSdata/BARNEGAT/catalog.xml, relative_path: TSdata/BARNEGAT/TSdata/BARNEGAT/9614dw-a.nc, terminus: True)>,
 <Dataset TSdata/BARNEGAT/9615exo-a.nc (parent: TSdata/BARNEGAT/catalog.xml, rela