# Extract copyright links from SHERPA/RoMEO

This notebook downloads information on publishers from the [SHERPA/RoMEO API](http://www.sherpa.ac.uk/romeo/apimanual.php), extracts each publisher's copyright links, and saves them to a TSV file.

In [1]:
import collections
import html
import urllib.request

import lxml.etree
import pandas

## Download XML

In [2]:
# Request the publisher listing from the RoMEO API and store the XML response on disk.
url = 'http://www.sherpa.ac.uk/romeo/api29.php?all=yes&showfunder=none'
path, headers = urllib.request.urlretrieve(
    url,
    filename='data/romeo-publishers.xml',
)
# Show the response's Date header as a record of when the data was retrieved.
headers.get('Date')

'Thu, 19 Apr 2018 20:10:28 GMT'

## Read and process XML

In [3]:
# Parse the XML file written by the download step into an element tree.
xml_path = 'data/romeo-publishers.xml'
tree = lxml.etree.parse(xml_path)

In [4]:
# Gather every <publisher> element found under a <publishers> child of the root,
# then display how many were found.
publishers = list(tree.iterfind('publishers/publisher'))
len(publishers)

2973

In [5]:
def get_publisher_info(publisher):
    """
    Get basic information from a SHERPA/RoMEO publisher XML element.

    Parameters
    ----------
    publisher : XML element
        Element exposing ``.get`` and ``.findtext``; its ``id`` attribute and
        its ``name`` and ``alias`` children are read.

    Returns
    -------
    collections.OrderedDict
        Mapping with keys ``romeo_id``, ``publisher_name`` and
        ``publisher_alias`` (in that order). Text values are stripped of
        surrounding whitespace and HTML-unescaped; a missing child element
        yields an empty string rather than raising.
    """
    def clean_text(tag):
        # findtext returns None when the child is absent; default='' keeps
        # the subsequent .strip() from raising AttributeError.
        text = publisher.findtext(tag, default='')
        return html.unescape(text.strip())

    info = {
        'romeo_id': publisher.get('id'),
        'publisher_name': clean_text('name'),
        # Unescaped the same way as the name so both columns are consistent.
        'publisher_alias': clean_text('alias'),
    }
    return collections.OrderedDict(info)

In [6]:
# Build one row per (publisher, copyright link) pair; publishers without any
# copyright link contribute no rows.
rows = list()
for publisher in publishers:
    info = get_publisher_info(publisher)
    for link in publisher.findall('copyrightlinks/copyrightlink'):
        row = info.copy()
        # default='' keeps .strip() from failing when a link lacks its text or URL child
        row['link_type'] = link.findtext('copyrightlinktext', default='').strip()
        row['link_url'] = link.findtext('copyrightlinkurl', default='').strip()
        rows.append(row)
link_df = pandas.DataFrame(rows)
len(link_df)

2799

In [7]:
# Preview the first two rows of the copyright-link table
link_df.head(2)

Unnamed: 0,romeo_id,publisher_name,publisher_alias,link_type,link_url
0,3031,АБВ-пресс,ABV-press,Policy,http://agx.abvpress.ru/jour/about/submissions#...
1,2014,Česká pedagogická společnost,Czech Pedagogical Society,Policy,http://www.ped.muni.cz/pedor/index.php?option=...


In [8]:
# Write the link table to disk as tab-separated values, omitting the row index
link_df.to_csv(
    'data/romeo-publisher-links.tsv',
    index=False,
    sep='\t',
)

In [9]:
# Ten most frequent values of the link_type column, with their counts
link_type_counts = link_df['link_type'].value_counts()
link_type_counts.head(10)

Policy                     1818
Example Policy              167
NIH policy                   56
publication rights           56
Other funding agencies       38
Instructions to authors      28
Other funding agenies        18
                             14
Self-archiving               10
NIH Policy                   10
Name: link_type, dtype: int64