In [1]:
import requests
import json
from bs4 import BeautifulSoup

##### in: ID of Eprint record  
##### out: URL to JSON representation of it

In [2]:
def getRecUrl(record_id):
    """Build the URL of the JSON export of a single Eprints record.

    in:  ID of the Eprints record (interpolated into the path and the file name)
    out: URL string pointing at the record's JSON representation
    """
    export_url = f"http://cedadocs.ceda.ac.uk/cgi/export/eprint/{record_id}/JSON/ceda-eprint-{record_id}.js"
    return export_url

##### in: ID of Eprint record
##### out: JSON representation of record

In [3]:
def getRecJson(record_id, timeout=30):
    """Download one Eprints record and return its decoded JSON export.

    in:  ID of the Eprints record; (optional) request timeout in seconds
    out: the parsed JSON document of the record

    Raises requests.RequestException on connection failure, timeout or an
    HTTP error status, and ValueError (or a subclass) when the response body
    is not valid JSON.
    """
    # Without a timeout `requests` waits indefinitely on a stalled server.
    response = requests.get(getRecUrl(record_id), timeout=timeout)
    # Surface 4xx/5xx as an HTTP error instead of a confusing JSON decode
    # failure on the server's error page.
    response.raise_for_status()
    return response.json()

##### in: resumptionToken generated by OAI
##### out: link to XML representation

In [4]:
def get_url_by_token(token_part):
    """Build the OAI `ListRecords` URL for a given resumption token.

    in:  resumptionToken generated by OAI (inserted into the query string as-is)
    out: URL of the XML page the token points at
    """
    list_records_url = f'http://cedadocs.ceda.ac.uk/cgi/oai2?verb=ListRecords&resumptionToken={token_part}'
    return list_records_url

##### in: resumptionToken generated by OAI
##### out: XML representation 

In [5]:
def get_source_by_token(token, timeout=30):
    """Fetch one OAI `ListRecords` page and parse it with BeautifulSoup.

    in:  resumptionToken generated by OAI; (optional) request timeout in seconds
    out: BeautifulSoup tree of the response body

    The HTTP status is intentionally not checked: whatever body comes back is
    parsed and returned. Raises requests.RequestException on connection
    failure or timeout.
    """
    # Without a timeout `requests` waits indefinitely on a stalled server.
    response = requests.get(get_url_by_token(token), timeout=timeout)
    # The XML is parsed with 'html.parser'; the harvesting cell addresses
    # elements by lower-case names (e.g. `resumptiontoken`), so keep this parser.
    return BeautifulSoup(response.text, 'html.parser')

##### Builds the list of valid record IDs from the paged OAI responses

In [6]:
# Harvest every record ID by walking the paged OAI ListRecords responses.
ids = []
token = 'metadataPrefix%3Doai_dc%26offset%3D0'
while token:
    page = get_source_by_token(token)
    for identifier_tag in page.find_all('identifier'):
        # The numeric record ID is the third ':'-separated part of the identifier.
        ids.append(int(identifier_tag.text.split(':')[2]))
    # A missing or empty <resumptionToken> marks the last page. Testing for it
    # explicitly (instead of catching AttributeError) avoids one extra request
    # with an empty token and no longer hides unrelated attribute errors.
    token_tag = page.resumptiontoken
    next_token = token_tag.text if token_tag is not None else ''
    token = next_token if next_token.strip() else ''


In [7]:
# Drop the first harvested ID.
# NOTE(review): not idempotent — every re-run of this cell removes one more ID.
# The reason the first entry is skipped is not shown here; confirm and document it.
ids = ids[1:]

##### in: JSON representation of Eprints record
##### out: dictionary of Zenodo attributes describing the given type

In [8]:
# Eprints record type -> Zenodo "<type>[/<publication_type>]".
# Monographs are keyed as "monograph/<monograph_type>".
_EPRINTS_TO_ZENODO_TYPE = {
    'artefact': 'physicalobject',
    'article': 'publication/article',
    'book': 'publication/book',
    'book_section': 'publication/section',
    'conference_item': 'publication/conferencepaper',
    'exhibition': 'other',
    'image': 'image',  # image_type?
    'other': 'other',
    'teaching_resource': 'lesson',
    'video': 'video',
    'audio': 'video',
    'dataset': 'dataset',
    'composition': 'other',
    'performance': 'other',
    'experiment': 'other',
    'patent': 'publication/patent',
    'thesis': 'publication/thesis',
    # Zenodo's controlled vocabulary spells this 'workingpaper'.
    'monograph/work_paper': 'publication/workingpaper',
    # Stock Eprints spelling of the same monograph type.
    'monograph/working_paper': 'publication/workingpaper',
    'monograph/other': 'publication/other',
    'monograph/structured_metadata': 'publication/other',
    'monograph/discussion_paper': 'publication/other',
    'monograph/documentation': 'publication/other',
    'monograph/manual': 'publication/other',
    'monograph/minutes': 'publication/other',
    'monograph/annual_report': 'publication/report',
    'monograph/project_report': 'publication/report',
    'monograph/technical_report': 'publication/report',
}


def convertType(eprintsJSON):
    """Translate the type of an Eprints record into Zenodo type attributes.

    in:  JSON representation (dict) of an Eprints record; must contain 'type'
    out: dict with 'type' and, depending on it, 'publication_type'
         (plus 'partof_title' for book sections that have a 'book_title')
         or 'image_type'

    A monograph without a usable 'monograph_type' is treated as
    'monograph/other'. Raises KeyError when 'type' is missing or when the
    (sub)type has no entry in the mapping table.
    """
    recordType = eprintsJSON['type']
    if recordType == 'monograph':
        monographType = eprintsJSON.get('monograph_type') or 'other'
        recordType = f"monograph/{monographType}"

    try:
        zenodoType = _EPRINTS_TO_ZENODO_TYPE[recordType]
    except KeyError:
        raise KeyError(f"No Zenodo mapping for Eprints type {recordType!r}") from None

    uploadType, _, subType = zenodoType.partition('/')

    result = {'type': uploadType}
    if uploadType == 'publication':
        result['publication_type'] = subType
        if subType == 'section' and 'book_title' in eprintsJSON:
            result['partof_title'] = eprintsJSON['book_title']
    elif uploadType == 'image':
        result['image_type'] = 'other'

    return result

##### in: list of creator entries taken from the JSON representation of an Eprints record
##### out: list of creators in Zenodo format

In [9]:
def convertCreators(creatorsListJSON):
    """Convert Eprints creator entries into Zenodo creator dicts.

    in:  list of creator entries, each with a 'name' dict holding
         'family' and/or 'given'
    out: list of {'name': 'Family, Given'} dicts

    A missing or null name part is left out instead of being rendered as the
    text "None" (e.g. an entry with only a family name becomes just that
    name). Entries without any usable name part are skipped.
    """
    result = []
    for entry in creatorsListJSON:
        name = entry.get('name') or {}
        parts = [part for part in (name.get('family'), name.get('given')) if part]
        if not parts:
            continue
        # TODO: what about the creator ID in Eprints?
        result.append({'name': ', '.join(parts)})
    return result

##### in: list of contributor entries taken from the JSON representation of an Eprints record
##### out: list of contributors in Zenodo format

In [10]:
def convertContributors(contributorsListJSON):
    """Convert Eprints contributor entries into Zenodo contributor dicts.

    in:  list of contributor entries, each with a 'name' dict holding
         'family' and/or 'given'
    out: list of {'name': 'Family, Given', 'type': 'Other'} dicts

    A missing or null name part is left out instead of being rendered as the
    text "None". Entries without any usable name part are skipped.
    """
    result = []
    for entry in contributorsListJSON:
        name = entry.get('name') or {}
        parts = [part for part in (name.get('family'), name.get('given')) if part]
        if not parts:
            continue
        # TODO: what about the contributor ID in Eprints?
        result.append({
            'name': ', '.join(parts),
            'type': 'Other',  # most of the types in Eprints are null
        })
    return result

##### in: name of the field to be found
##### out: ID of the first record containing given field

In [11]:
def getRecWithField(fieldName):
    """Find the first record in `ids` whose JSON has a top-level `fieldName`.

    in:  name of the field to be found
    out: ID of the first record containing the field, or None when no record has it

    Every candidate record is downloaded with `getRecJson`; records that
    cannot be fetched, parsed or indexed by `fieldName` are skipped.
    """
    for rec_id in ids:
        try:
            getRecJson(rec_id)[fieldName]
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C /
            # kernel interrupt still stops this network-bound loop.
            continue
        return rec_id
    return None

##### out: dict of record IDs and their content

In [12]:
def generateListJSON():
    """Download the JSON export of every record listed in `ids`.

    out: dict mapping record ID -> parsed JSON of that record

    Best effort: records that cannot be fetched or parsed are left out of
    the result.
    """
    output = dict()
    for rec_id in ids:
        try:
            output[rec_id] = getRecJson(rec_id)
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C /
            # kernel interrupt still stops this long download loop.
            continue
    return output

In [13]:
# Download every record in `ids` once (one HTTP request per ID) and keep the
# results in memory, keyed by record ID.
myData = generateListJSON()

##### in: name of the field to be found
##### out: ID and value of the first record containing given field

In [14]:
def getRecFromData(fieldName, data=None, recordIds=None):
    """Find the first record that has a value at the given field path.

    in:  fieldName - '/'-separated path into the record JSON; segments made
                     only of decimal digits are used as integer (list) indices,
                     e.g. 'creators/0/name/family'
         data      - (optional) dict of record ID -> record JSON;
                     defaults to the global `myData`
         recordIds - (optional) iterable of IDs to scan, in order; defaults to
                     the global `ids`, or to the keys of `data` when `data`
                     is passed explicitly
    out: tuple (record ID, value found at the path), or -1 when no record
         has the path
    """
    # `isdecimal` matches exactly the strings `int()` accepts as digits.
    path = [int(part) if part.isdecimal() else part for part in fieldName.split('/')]

    if recordIds is None:
        recordIds = ids if data is None else list(data)
    if data is None:
        data = myData

    for rec_id in recordIds:
        try:
            value = data[rec_id]
            for key in path:
                value = value[key]
        except (KeyError, IndexError, TypeError):
            # Record not downloaded, or the path does not exist in it.
            continue
        return rec_id, value
    return -1

##### in: name of the field to be found
##### out: set of (record type, value) pairs found under the given key

In [37]:
def getSetOfValues(fieldName, data=None, recordIds=None):
    """Collect every distinct (record type, value) pair found at a field path.

    in:  fieldName - '/'-separated path into the record JSON; segments made
                     only of decimal digits are used as integer (list) indices
         data      - (optional) dict of record ID -> record JSON;
                     defaults to the global `myData`
         recordIds - (optional) iterable of IDs to scan; defaults to the
                     global `ids`, or to the keys of `data` when `data` is
                     passed explicitly
    out: set of (record['type'], value) tuples

    Records are skipped when they are missing, have no 'type', lack the path,
    or hold an unhashable value (e.g. a list) at the path.
    """
    # `isdecimal` matches exactly the strings `int()` accepts as digits.
    path = [int(part) if part.isdecimal() else part for part in fieldName.split('/')]

    if recordIds is None:
        recordIds = ids if data is None else list(data)
    if data is None:
        data = myData

    output = set()
    for rec_id in recordIds:
        try:
            record = data[rec_id]
            recType = record['type']
            value = record
            for key in path:
                value = value[key]
            # Raises TypeError for unhashable values; those are skipped too.
            output.add((recType, value))
        except (KeyError, IndexError, TypeError):
            continue
    return output

##### in: JSON representation of Eprints record
##### out: dict of some metadata fields in Zenodo format

In [29]:
def convertBasicMetadata(eprintsJSON):
    """Copy the directly mappable Eprints fields into a Zenodo-style dict.

    in:  JSON representation (dict) of an Eprints record
    out: dict keyed by Zenodo field names; date, title and abstract get a
         placeholder when absent, the other fields are omitted when absent
    """
    # (Eprints field, Zenodo field, fallback used when the field is absent)
    fieldMap = (
        ('date', 'publication_date', '0000-00-00'),
        ('title', 'title', 'Title is missing'),
        ('abstract', 'description', 'Description is missing'),
        ('succeeds', 'related_identifiers', ''),
        ('isbn', 'imprint_isbn', ''),
        ('event_dates', 'conference_dates', ''),
        ('event_location', 'conference_place', ''),
        ('event_title', 'conference_title', ''),
    )

    result = dict()
    for eprintName, zenodoName, fallback in fieldMap:
        result.update(mapFunction(eprintsJSON, eprintName, zenodoName, fallback))
    return result

##### in: ePrints record in JSON format, name of the field in eprint format, name of the field in Zenodo format, (optional) alternative value in case the field is missing 
##### out: dict of Zenodo name as a key and eprints value as the value

In [26]:
def mapFunction(eprintsJSON, eprintName, zenodoName, alt=""):
    """Map one Eprints field onto its Zenodo name.

    in:  Eprints record (dict), field name in Eprints, field name in Zenodo,
         (optional) fallback value used when the Eprints field is absent
    out: {zenodoName: value} when the field exists, {zenodoName: alt} when it
         does not and `alt` is truthy, otherwise an empty dict
    """
    if eprintName not in eprintsJSON:
        return {zenodoName: alt} if alt else {}
    return {zenodoName: eprintsJSON[eprintName]}
        

## --------------------------------------------------------

In [39]:
# First downloaded record that has an `event_location` field, with its value.
getRecFromData('event_location')

(116, 'University of Exeter')

In [17]:
# Fetch record 1326 over the network for ad-hoc inspection.
r = getRecJson(1326)

In [28]:
# Spot-check the basic metadata mapping on record 200 (fetched live).
convertBasicMetadata(getRecJson(200))

{'publication_date': '2008-12-02',
 'title': 'HadGEM1 1pc run metadata',
 'description': 'Description is missing',
 'related_identifiers': 39}

In [55]:
# Probe one field: the first record carrying it, then every
# (record type, value) pair seen across the downloaded records.
field = 'publication'
print(getRecFromData(field))
print(getSetOfValues(field))

(67, 'Limnology and Oceanography')
{('article', 'Bioresource Technology'), ('article', 'Annalen der Meteorologie'), ('article', 'Ariadne'), ('article', 'Limnology and Oceanography'), ('article', 'International Journal of Digital Curation'), ('article', 'Health Statistics Quarterly'), ('book', 'Global Energy and Water Cycle Experiment (GEWEX) News'), ('monograph', 'OPCHANGE'), ('monograph', 'Vaisala Instrument Documentation'), ('monograph', 'Met Office'), ('article', 'RAL Space Remote Sensing Group - Technical Report'), ('monograph', 'Project Publication'), ('monograph', 'Met Office MIDAS User Guide')}


In [None]:
# NOTE(review): leftover unexecuted cell — a bare function reference only
# displays the function's repr; consider deleting it.
getRecFromData