This notebook contains working notes for identifying types of OpenSearch OSDD responses (catalog | dataset | granule | other) and requirements for semi-generic parsing. So no explicit namespacing and liberal use of xpath.

hanging on to this link for esri cdata testing: http://uaf.nodc.noaa.gov/geoportal/openSearchDescription.

### Prep Work 

In [14]:
import requests
import json
from lxml import etree
import urlparse
import urllib
from bs4 import BeautifulSoup
from itertools import chain

def extract_urls(xml, mimetype='atom+xml'):
    return xml.xpath('//*[local-name()="Url" and (@*[local-name()="type"]="application/%(mimetype)s" or @*[local-name()="type"]="text/%(mimetype)s")]' % {'mimetype': mimetype})

def extract_template(url, append_limit=True):
    # get the base url from the template
    template_parts = urlparse.urlparse(url)
    
    if not template_parts.scheme:
        return '', '', {}, False
    
    base_url = urlparse.urlunparse((
        template_parts.scheme,
        template_parts.netloc,
        template_parts.path,
        None,
        None,
        None
    ))

    qp = {k: v[0] for k, v in urlparse.parse_qs(template_parts.query).iteritems()}

    # get the hard-coded params
    defaults = {k:v for k, v in qp.iteritems() 
            if not v.startswith('{') 
            and not v.endswith('}')}
    
    # a flag for some hard-coded response format type to manage
    # accept headers or no
    format_defined = len([v for k, v in defaults.iteritems() if 'atom' in v.lower() or 'rss' in v.lower()]) > 0

    # get the rest (and ignore the optional/namespaces)
    parameters = {k: v[1:-1] for k, v in qp.iteritems() 
            if v.startswith('{') 
            and v.endswith('}')}
    
    if append_limit:
        terms = extract_parameter_key('count', parameters)
        if terms:
            defaults = dict(
                chain(defaults.items(), {k: 5 for k in terms.keys()}.items())
            )
            
    # note: not everyone manages url-encoded query parameter delimiters
    #       and not everyone manages non-url-encoded values so yeah. we are
    #       ignoring the non-url-encoded group tonight.
    # return the base, defaults, parameters as dict
    return base_url, defaults, parameters, format_defined

def extract_parameter_key(value, params):
    # sort out the query parameter name for a parameter
    # and don't send curly bracketed things, please
    return {k: v.split(':')[-1].replace('?', '') for k, v 
                in params.iteritems() 
                if value in v}

def extract_parameter_defs(url_elem, defined_terms):
    # could just go with a namespace check but
    # namespaces are included and not used more
    # than i'd like. safety first.
    params = url_elem.xpath('*[local-name()="Parameter"]')
    
    # and go crazy pedant with a) does each query param value
    # have a defined parameter element? (don't know if it should) 
    # and b) required or not parameters?
    for p in params:
        p_value = p.attrib.get('value', '')
        if not p_value:
            continue
        
        qps = {k:v for k, v in defined_terms.iteritems() if p_value[1:-1] in v}
        
        if not qps:
            continue
        
        #todo: sort out required v not and return things

def extract_query_terms(xml, param_name):
    # find a query element that contains an example
    # for the provided param_name (no namespace, no optional flag)
    example_queries = {}
    xp = '//*[local-name()="Query" and @*[local-name()="role"]="example"]/@*[local-name()="{0}"]'.format(param_name)
    try:
        example_queries = xml.xpath(xp)
    except:
        print 'failed example query: ', xp
        return []
    
    return example_queries
    
def extract_links_from_description(elem):
    # for the html blobs embedded in the text()
    # we don't really have much info about the links
    # but at least it ignores cdata.
    soup = BeautifulSoup(elem.text)
    return [h.get('href') for h in soup.find_all('a')]

def extract_search_rel(xml):
    elem = next(
        iter(
                xml.xpath('/*/*[local-name()="link" and (@*[local-name()="rel"]="search" or @*[local-name()="rel"]="http://esipfed.org/ns/fedsearch/1.0/search#")]')
            ), None
        )
    return elem

def extract_item_links(xml):
    # item or entry links from a secondary search
    return xml.xpath('//*[local-name()="entry" or local-name()="item"]/*[local-name()="link"]')

def extract_response_stats(xml):
    total = next(iter(xml.xpath('//*[local-name()="totalResults"]/text()')), 'Unknown')
    subset = next(iter(xml.xpath('//*[local-name()="itemsPerPage"]/text()')), 'Unknown')
    
    if subset == 'Unknown':
        subset = len(xml.xpath('//*[local-name()="entry" or local-name()="item"]'))
    
    return subset, total

def generate_requests(url_elem):
    # rebuild the url(s)
    url_base, defaults, params, format_defined = extract_template(url_elem.attrib.get('template'))
    accept_type = url_elem.attrib.get('type', '')
    
    headers = {'Accept': accept_type} if format_defined == False else {}
    
    search_terms = extract_parameter_key('searchTerms', params)
    
    search_urls = []
    
    # build the empty search
    if search_terms:
        qps = dict(
            chain(
                defaults.items(),
                {search_terms.keys()[0]: ''}.items()
            )
        )
        
        search_urls.append(url_base + '?' + urllib.urlencode(qps.items()))
    
    # try to build an example search
    query_examples = list(
        chain.from_iterable(
            [extract_query_terms(url_elem.getparent(), p) for p in search_terms.values()]
        )
    )
    
    if query_examples:
        test_query = {search_terms.keys()[0]: query_examples[0]}
        qps = dict(
            chain(
                defaults.items(),
                test_query.items()
            )
        )
        
        search_urls.append(url_base + '?' + urllib.urlencode(qps.items()))
    
    return search_urls, headers

def execute_request(url, headers={}):
    try:
        req = requests.get(url, headers=headers)
    except:
        logger.error('\tSkipping connection issue\'s')
        return '-999', '', ''
    
    return req.status_code, req.content, req.headers

def parse_response(content, headers={}):
    # see if it has content, see if the xml parses, see if it's even xml
    if not content:
        return {'error': 'No content'}
    
    if 'html' in headers.get('content-type'):
        return {'error': 'HTML response'}

    try:
        xml = etree.fromstring(content)
    except:
        return {'error': 'XML Parse error'}

    return {'xml': xml}
    

# from doug, see notes re: uptime
cwic_links = [
    'http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml',
    'http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic',
    'http://podaac.jpl.nasa.gov/ws/search/dataset/osd.xml',
    # 'http://nsidc.org/api/opensearch/1.1/dataset/description',  # we're just not going to run this
    'http://ghrc.nsstc.nasa.gov/hydro/ghost.xml',
    'http://mirador.gsfc.nasa.gov/mirador_dataset_opensearch.xml',
    'http://eo-virtual-archive4.esa.int/search/ER02_SAR_RAW_0P/description',
    'http://www1.usgs.gov/erddap/opensearch1.1/description.xml',
    # 'http://bison.usgs.ornl.gov/doc/api.jsp',  # this is now a dead link
    # 'http://ceocat.ccrs.nrcan.gc.ca/opensearch_description_document.xml',  # this is 403 access forbidden
    # 'http://rs211980.rs.hosteurope.de/mule/os-description/',  # 503 service down
    'http://geo.spacebel.be/opensearch/description.xml',  # from the fedeo documentation page listed
    'http://lance-modis.eosdis.nasa.gov/user_services/dataset_opensearch.xml'
]

In [165]:
# processing the cwic services
# Question 1: do we all have searchTerms parameters?

for cwic_link in cwic_links:
    print 'Processing {0}'.format(cwic_link)
    
    req = requests.get(cwic_link)
    if req.status_code != 200:
        print '\tFailed request'
        continue
    
    xml = etree.fromstring(req.content)
    
    extracted_url_elems = extract_urls(xml)
    
    for extracted_elem in extracted_url_elems:
        template_base, template_defaults, template_params, format_defined = extract_template(extracted_elem.attrib.get('template'))
        accept_type = extracted_elem.attrib.get('type', '')
        
        # let's see if the url includes some enumerations as Parameter children
        # parameter_definitions = extract_parameter_defs(extracted_elem, template_params)
        
        catalog_params = extract_parameter_key('searchTerms', template_params)
        print '\tCatalog searchTerms: ', catalog_params
    
        if not catalog_params:
            continue
        
        query_examples = list(chain.from_iterable([extract_query_terms(xml, p) for p in catalog_params.values()]))
        
        print '\tExample queries: ', query_examples
  
        # try two requests, one without the example text (just the empty key) and
        # one with the term
        qps = '&'.join([template_defaults, catalog_params.keys()[0] + '='])
        if qps.startswith('&'):
            qps = qps[1:]
        first_request = template_base + '?' + qps
        
        print '\tProcessing {0}'.format(first_request)
        
        headers = {'Accept': accept_type} if format_defined == False else {}
        first_req = requests.get(first_request, headers=headers) 
        if first_req.status_code != 200:
            print '\t\tFailed request: {0}'.format(first_req.content)
            continue
            
        if not first_req.content:
            print '\t\tEmpty response!'
        
        else:
            extract_xml = etree.fromstring(first_req.content)
            subset, total = extract_response_stats(extract_xml)
            print '\t\tRequested {0} out of {1}'.format(subset, '{:,}'.format(int(total)) if total != 'Unknown' else total)

            # see if there's a reference to the parent catalog request 
            parent_search_elem = next(
                iter(
                    extract_xml.xpath('/*/*[local-name()="link" and (@*[local-name()="rel"]="search" or @*[local-name()="rel"]="http://esipfed.org/ns/fedsearch/1.0/search#")]')), None
                )
            print '\t\tCatalog reference: ', parent_search_elem.attrib.get('href') if parent_search_elem is not None else 'Not found'
        
        if query_examples:
            test_query = {catalog_params.keys()[0]: query_examples[0]}
            qps = '&'.join(
                # TODO: urlencode just the values
                [template_defaults, urllib.urlencode(test_query.items())]
            )
            if qps.startswith('&'):
                qps = qps[1:]
            second_request = template_base + '?' + qps
            
            print '\t\tProcessing {0}'.format(second_request)
            
            # reuse the headers
            second_req = requests.get(second_request, headers=headers)
            if second_req.status_code != 200:
                print '\t\t\tFailed request: {0}'.format(second_req.content)
                continue
            
            if not second_req.content:
                print '\t\tEmpty response!'
                continue

            extract_xml = etree.fromstring(second_req.content)
            subset, total = extract_response_stats(extract_xml)
            print '\t\tRequested {0} out of {1}'.format(subset, '{:,}'.format(int(total)) if total != 'Unknown' else total)

Processing http://dap.onc.uvic.ca/erddap/opensearch1.1/description.xml
	Catalog searchTerms:  {'searchTerms': 'searchTerms'}
	Example queries:  ['temperature']
	Processing http://dap.onc.uvic.ca/erddap/opensearch1.1/search?itemsPerPage=5&format=atom&searchTerms=
		Requested 5 out of 19
		Catalog reference:  Not found
		Processing http://dap.onc.uvic.ca/erddap/opensearch1.1/search?itemsPerPage=5&format=atom&searchTerms=temperature
			Requested 5 out of 16
Processing http://gcmd.gsfc.nasa.gov/KeywordSearch/default/openSearch.jsp?Portal=cwic
	Catalog searchTerms:  {'searchTerms': 'searchTerms'}
	Example queries:  ['modis']
	Processing http://gcmd.gsfc.nasa.gov/KeywordSearch/OpenSearch.do?count=5&output=atom&MetadataType=0&Portal=cwic&clientId=fromgcmd&searchTerms=
		Empty response!
		Processing http://gcmd.gsfc.nasa.gov/KeywordSearch/OpenSearch.do?count=5&output=atom&MetadataType=0&Portal=cwic&clientId=fromgcmd&searchTerms=modis
			Requested 5 out of 137
Processing http://podaac.jpl.nasa.

## Notes


### Parameter Constraints

From this set, we see two options - using the OpenSearch [Parameters extension](http://a9.com/-/spec/opensearch/extensions/parameters/1.0/) (ex: FEDEO service) or a custom [extension](http://podaac.jpl.nasa.gov/opensearch/) (ex: PO.DAAC service).

Using the Parameter extension:

```
<Url type="application/rss+xml" indexOffset="1" pageOffset="1"  rel="collection" template="http://geo.spacebel.be:80/opensearch/request/?httpAccept=application/rss%2Bxml&amp;parentIdentifier={eo:parentIdentifier?}&amp;subject={dc:subject?}&amp;query={searchTerms?}&amp;startRecord={startIndex?}&amp;startPage={startPage?}&amp;maximumRecords={count?}&amp;startDate={time:start?}&amp;endDate={time:end?}&amp;type={dc:type?}&amp;title={dc:title?}&amp;publisher={dc:publisher?}&amp;bbox={geo:box?}&amp;name={geo:name?}&amp;lat={geo:lat?}&amp;lon={geo:lon?}&amp;radius={geo:radius?}&amp;uid={geo:uid?}&amp;organisationName={eo:organisationName?}&amp;productType={eo:productType?}&amp;platform={eo:platform?}&amp;instrument={eo:instrument?}&amp;classifiedAs={semantic:classifiedAs?}&amp;recordSchema={sru:recordSchema?}">
            
    <param:Parameter name="recordSchema" value="{sru:recordSchema}">
        <param:Option value="iso" label="ISO"/>             
    </param:Parameter>
    
    <param:Parameter name="type" value="{dc:type}">
        <param:Option value="dataset" label="dataset"/>     
        <param:Option value="collection" label="dataset series"/>       
        <param:Option value="service" label="service"/>             
    </param:Parameter>
    
    <param:Parameter name="parentIdentifier" value="{eo:parentIdentifier}">     
        <param:Option value="EOP:ESA:FEDEO" label="EOP:ESA:FEDEO"/>     
        <param:Option value="EOP:ESA:FEDEO:COLLECTIONS" label="EOP:ESA:FEDEO:COLLECTIONS"/>     
        <param:Option value="EOP:ESA:GPOD-EO" label="EOP:ESA:GPOD-EO"/>     
        <param:Option value="EOP:ESA:EO-VIRTUAL-ARCHIVE4" label="EOP:ESA:EO-VIRTUAL-ARCHIVE4"/>     
        <param:Option value="EOP:NASA:ECHO" label="EOP:NASA:ECHO"/>             
    </param:Parameter>
</Url>
```

Using the custom extension:

```
<Url type="application/atom+xml" template="http://podaac.jpl.nasa.gov/ws/search/dataset?keyword={searchTerms}&startIndex={startIndex?}&itemsPerPage={count?}&bbox={georss:box?}&startTime={time:start?}&endTime={time:end?}&datasetId={podaac:datasetId?}&instrument={podaac:instrument}&satellite={podaac:satellite}&fileFormat={podaac:fileFormat}&status={podaac:status}&processLevel={podaac:processLevel}&full={podaac:full}&sortBy={podaac:sortBy?}&pretty={podaac:pretty?}&format=atom"/>

<Query role="http://esipfed.org/ns/discovery/1.1/#validpatterns" podaac:instrument="\[a-zA-Z0-9-]*" podaac:satellite="\[a-zA-Z0-9-]*" podaac:fileFormat="HDF|NetCDF" podaac:status="OPEN|PREVIEW|SIMULATED|RETIRED" podaac:processLevel="2|2P|3|4" 
podaac:full="true|false" podaac:datasetId="\[a-zA-Z0-9-]*" podaac:sortBy="timeAsc|timeDesc" podaac:pretty="true|false"/>
```

The things that immediately jump out to me are 1) the PO.DAAC structure using ESIP "valid patterns" seems ill-suited for large enumerations, 2) for a parameter like "instrument" that is assumed to be a known list, a generic regex pattern (alphanumeric of any case and length) doesn't provide enough understanding to generate API documentation or a user-friendly interface, 3) the Parameter element is limited in scope (no format definitions, no range constraints, etc) and 4) I, at least, am unclear on when a Parameter with enumeration is included for a query parameter and when it isn't.  

But almost all of this is part of the CWIC best practices document. One update would maybe be related to the use of Accept headers versus query parameters for the response format. And a more consistent error response would be fantastic.

Issues (in no particular order): 

- url encoding support (must, must not be, either way) (Url/@template values not always url-encoded in the responses)
- accept headers (accept headers vs accept format query param - issues with parsable errors)
- response result set total/subset values not always present
- if we assume that a searchTerms parameter is supported for the catalog service, ie "Return all datasets", not all services support an empty value in that query parameter (ex. EOSDIS Lance)
- not all dataset-level responses include another OSDD request but that could be dataset-as-granule and not 

Possible EUMETSAT URL:

http://vnavigator.eumetsat.int/discovery/os-description.xml

although the response is a template that isn't correctly executed:

```
template="$WEBAPP_SERVER_URL/OpenSearch
```


# So what happens with our list of OSDD responses?

In [16]:
import os
import pandas as pd
from datetime import datetime
import logging
from copy import deepcopy

reload(logging)

pd.set_option('precision', 2)

logger = logging.getLogger(__name__)
handler = logging.FileHandler(filename="opensearch_hrefs.log", mode="a")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

def get_new_stat():
    return {
        'catalog_osdd': '',
        'catalog_osdd_status_code': -999,
        'catalog_osdd_atom_urls': 0, 
        'catalog_osdd_rss_urls': 0, 
        'search_url': '', 
        'search_content_type': '',
        'search_status_code': -999, 
        'search_item_total': -999, 
        'search_links': 0, 
        'search_rel_osdd': False,
        'error': ''
    }

with open('opensearch_hrefs.txt', 'r') as f:
    urls = [u.strip() for u in f.readlines()]
    
stats = []

logger.info('Output generated at {0}\n\n\n'.format(datetime.now()))

for index, url in enumerate(urls):
    if index % 100 == 0:
        print '{0} complete'.format(index)
        
    # init an empty row
    stat = get_new_stat()
    
    stat['catalog_osdd'] = url
    
    logger.info('Processing {0}: {1}'.format(index, url))
    
    status, content, headers = execute_request(url)
    
    logger.info('\tResponse: {0}'.format(status))
    
    stat['catalog_osdd_status_code'] = status
    if status != 200:
        stats.append(stat)
        continue
    
    parsed = parse_response(content, headers)
    if 'error' in parsed:
        logger.info('\tInvalid response: {0}'.format(parsed['error']))
        stat['error'] = parsed['error']
        stats.append(stat)
        continue
    
    xml = parsed['xml']
    
    extract_atoms = extract_urls(xml)
    stat['catalog_osdd_atom_urls'] = len(extract_atoms)
    
    for extract_atom in extract_atoms:
        searches, headers = generate_requests(extract_atom)
        
        for search in searches:
            search_status, search_content, search_headers = execute_request(search, headers)
            
            new_stat = deepcopy(stat)
            new_stat['search_url'] = search
            new_stat['search_status_code'] = search_status
            new_stat['search_content_type'] = headers.get('content-type', '')
            
            parsed = parse_response(search_content, search_headers)
            if 'error' in parsed:
                new_stat['error'] = parsed['error']
                stats.append(new_stat)
                continue
            
            xml = parsed['xml']
            subset, total = extract_response_stats(xml)
            rel_elem = extract_search_rel(xml)
            links = extract_item_links(xml)
            
            new_stat['search_item_total'] = total
            new_stat['search_rel_osdd'] = rel_elem is not None
            new_stat['search_links'] = len(links)
            
            stats.append(new_stat)
    
    extract_rsss = extract_urls(xml, 'rss+xml')
    stat['catalog_osdd_rss_urls'] = len(extract_rsss)
    
    logger.info('\tFound {0} atom links and {1} rss links'.format(len(extract_atoms), len(extract_rsss)))
    
    for extract_rss in extract_rsss:
        searches, headers = generate_requests(extract_rss)
    
        for search in searches:
            search_status, search_content, search_headers = execute_request(search, headers)
            
            new_stat = deepcopy(stat)
            new_stat['search_url'] = search
            new_stat['search_status_code'] = search_status
            new_stat['search_content_type'] = headers.get('content-type', '')
            
            parsed = parse_response(search_content, search_headers)
            if 'error' in parsed:
                new_stat['error'] = parsed['error']
                stats.append(new_stat)
                continue
            
            xml = parsed['xml']
            subset, total = extract_response_stats(xml)
            rel_elem = extract_search_rel(xml)
            links = extract_item_links(xml)
            
            new_stat['search_item_total'] = total
            new_stat['search_rel_osdd'] = rel_elem is not None
            new_stat['search_links'] = len(links)
            
            stats.append(new_stat)
     
    stats.append(stat)

0 complete
100 complete
200 complete
300 complete


In [17]:
import csv

with open('opensearch_hrefs_stats.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=stats[0].keys())
    
    writer.writeheader()
    for stat in stats:
        writer.writerow(stat)

len(stats)

513

In [18]:
df = pd.read_csv('opensearch_hrefs_stats.csv')
df

Unnamed: 0,catalog_osdd_rss_urls,search_url,search_status_code,catalog_osdd_atom_urls,search_rel_osdd,catalog_osdd,search_links,search_content_type,error,search_item_total,catalog_osdd_status_code
0,1,http://academiccommons.columbia.edu/catalog.rs...,200,0,False,http://academiccommons.columbia.edu/catalog/op...,500,,,Unknown,200
1,1,,-999,0,False,http://academiccommons.columbia.edu/catalog/op...,0,,,-999,200
2,0,http://admissions.chem.ox.ac.uk/SearchResults....,200,1,True,http://admissions.chem.ox.ac.uk/SearchEngineIn...,0,,,0,200
3,0,,-999,1,False,http://admissions.chem.ox.ac.uk/SearchEngineIn...,0,,,-999,200
4,0,,-999,0,False,http://arsf-dan.nerc.ac.uk/trac/search/opensearch,0,,,-999,200
5,0,,-999,0,False,http://astrodocs.wr.usgs.gov/opensearch_desc.php,0,,,-999,200
6,0,,-999,0,False,http://aura.abdn.ac.uk/open-search/description...,0,,,-999,200
7,1,http://bahainyc.org/syndication.axd?q=,200,0,False,http://bahainyc.org/opensearch.axd,10,,,Unknown,200
8,1,,-999,0,False,http://bahainyc.org/opensearch.axd,0,,,-999,200
9,0,,-999,0,False,http://bakerinstitute.org/search/open-search/,0,,,-999,200


reliability of namespaces for os feed identification

```
<feed xml:lang="en" 
    xmlns="http://www.w3.org/2005/Atom" 
    xmlns:time="http://a9.com/-/opensearch/extensions/time/1.0/" 
    xmlns:os="http://a9.com/-/spec/opensearch/1.1/" 
    xmlns:dc="http://purl.org/dc/elements/1.1/" 
    xmlns:georss="http://www.georss.org/georss" 
    xmlns:gml="http://www.opengis.net/gml" 
    xmlns:geo="http://a9.com/-/opensearch/extensions/geo/1.0/" 
    xmlns:metalink="urn:ietf:params:xml:ns:metalink" 
    xmlns:xlink="http://www.w3.org/1999/xlink">
    <title>Catalogue Search Feed for ERS-2 SAR Image SAR Annotated Raw Data Product Level 0 (ER02_SAR_RAW_0P)</title>
```



In [9]:
import os
import requests
import json
from lxml import etree
import csv

def extract_namespaces(xml):
    document_namespaces = dict(xml.xpath('/*/namespace::*'))
    if None in document_namespaces:
        document_namespaces['default'] = document_namespaces[None]
        del document_namespaces[None]

    # now run through any child namespace issues
    all_namespaces = xml.xpath('//namespace::*')
    for i, ns in enumerate(all_namespaces):
        if ns[1] in document_namespaces.values():
            continue
        new_key = ns[0] if ns[0] else 'default%s' % i
        document_namespaces[new_key] = ns[1]

    return document_namespaces

os_ns = 'http://a9.com/-/spec/opensearch/1.1/'

outputs = []
with open('opensearch_hrefs_stats.csv', 'r') as csvfile:
    recs = csv.reader(csvfile)
    next(recs, None)
    for rec in recs:
        search_url = rec[1]
        
        if not search_url:
            continue
        
        print search_url
           
        assumed_atom = rec[3]
        # print '\tIs assumed ATOM? ', int(assumed_atom) == 1 
        
        try:
            req = requests.get(search_url, timeout=10)
        except requests.exceptions.ReadTimeout:
            continue
        if req.status_code != 200:
            continue
        
        content = req.content
        
        try:
            xml = etree.fromstring(content)
        except:
            print '\tFAILED XML'
            continue
        namespaces = extract_namespaces(xml)
        tag = xml.tag.split('}')[-1]
        
        if os_ns in namespaces.values():
            print '\tgood namespace, tag=', tag, 'ATOM? ', tag == 'feed' and int(assumed_atom) == 1 
        else:
            print '\tno namespace, tag=', tag
            
        outputs.append(','.join([search_url, tag, str(os_ns in namespaces.values()), str(tag == 'feed'), str(int(assumed_atom) == 1)]))
        
with open('opensearch_identity_text.csv', 'w') as f:
    f.write('search_url,tag,contains_ns,atom_root,assumed_atom\n')
    f.write('\n'.join(outputs))

    

http://academiccommons.columbia.edu/catalog.rss?q=
	no namespace, tag= rss
http://admissions.chem.ox.ac.uk/SearchResults.ashx?q=
	good namespace, tag= feed ATOM?  True
http://bahainyc.org/syndication.axd?q=
	no namespace, tag= rss
http://caspian.iwlearn.org/opensearch_atom.xml?b_size%3Aint=5&SearchableText=
	good namespace, tag= feed ATOM?  True
http://caspian.iwlearn.org/opensearch_atom.xml?b_size%3Aint=5&SearchableText=None
	good namespace, tag= feed ATOM?  True
http://citizenscienceassociation.org/?feed=atom&s=
	no namespace, tag= feed
http://citizenscienceassociation.org/?feed=atom&s=photography
	no namespace, tag= feed
http://climateaudit.org/?feed=atom&s=
	no namespace, tag= feed
http://climateaudit.org/?feed=atom&s=photography
	no namespace, tag= feed
http://coastwatch.pfeg.noaa.gov/erddap/opensearch1.1/search?itemsPerPage=5&searchTerms=&format=atom
	good namespace, tag= feed ATOM?  True
http://coastwatch.pfeg.noaa.gov/erddap/opensearch1.1/search?itemsPerPage=5&searchTerms=tempe