In [1]:
import xml.etree.ElementTree as ET
from os import listdir

In [2]:
# Namespace prefixes in the "{uri}" form that ElementTree uses for qualified
# tag names; they are prepended to local names when searching the parsed XML.
# OAI-PMH protocol elements (ListRecords, header, metadata, resumptionToken).
oai = '{http://www.openarchives.org/OAI/2.0/}'
# Namespace of the `dc` container element looked up inside each record's metadata.
oai_dc = '{http://www.openarchives.org/OAI/2.0/oai_dc/}'
# Dublin Core element namespace, used to build the tag of the field being counted.
dc = '{http://purl.org/dc/elements/1.1/}'

In [3]:
def field_pctg_dir(directory, field):
    """Return the mean per-file share of records in `directory` that carry `field`.

    Every entry of `directory` is passed to ``field_pctg_f`` and the resulting
    per-file fractions are averaged.

    Args:
        directory: Path of a directory whose entries are all XML files that
            ``field_pctg_f`` can parse.
        field: Fully qualified tag name to look for, e.g. ``f'{dc}description'``.

    Returns:
        Float in [0, 1]. Returns 0.0 for an empty directory instead of raising
        ``ZeroDivisionError``.

    Note:
        This is the unweighted mean of the per-file fractions: a file with few
        records weighs as much as a file with many, so the value only equals
        the share over all records when the files are equally sized.
    """
    # listdir() yields entries in arbitrary order; sorting keeps the float
    # summation and the order of any diagnostics reproducible between runs.
    per_file = [
        field_pctg_f(f'{directory}/{name}', field)
        for name in sorted(listdir(directory))
    ]
    if not per_file:
        return 0.0
    return sum(per_file) / len(per_file)

In [6]:
def field_pctg_f(f, field, ignored_values=('Peer Reviewed',)):
    """Return the fraction of records in one ListRecords file that carry `field`.

    Args:
        f: Path or file object of the XML file; handed to ``ET.parse``.
        field: Fully qualified tag name searched among the direct children of
            each record's ``oai_dc`` ``dc`` element, e.g. ``f'{dc}description'``.
        ignored_values: Collection of texts that do not count as real content
            for the field (compared after stripping surrounding whitespace).
            Defaults to the "Peer Reviewed" placeholder.

    Returns:
        Float in [0, 1]: number of counted records with at least one
        non-empty, non-ignored occurrence of `field`, divided by the number of
        counted records. Returns 0.0 when the file has no ``ListRecords``
        element or contains no countable record.

    The ``resumptionToken`` element and records whose header has
    ``status="deleted"`` are not counted. Records without a
    ``metadata``/``dc`` element are reported on stdout and not counted.
    """
    root = ET.parse(f).getroot()
    list_records = root.find(f'{oai}ListRecords')
    if list_records is None:
        # No record list in this response, so there is nothing to count.
        return 0.0

    has_field = []
    for record in list_records:
        if record.tag == f'{oai}resumptionToken':
            continue
        header = record.find(f'{oai}header')
        if header is not None and header.get('status') == 'deleted':
            continue

        metadata = record.find(f'{oai}metadata')
        dc_root = metadata.find(f'{oai_dc}dc') if metadata is not None else None
        if dc_root is None:
            # Explicit check instead of catching AttributeError, so unrelated
            # attribute errors are no longer hidden; name the record skipped.
            identifier = (
                header.findtext(f'{oai}identifier') if header is not None else None
            )
            print(f'Skipping record without oai_dc metadata: {identifier}')
            continue

        included = False
        for this_field in dc_root.findall('*'):
            if this_field.tag != field:
                continue
            # Element.text is None for an empty element; treat that and
            # whitespace-only text as "field not filled in".
            text = (this_field.text or '').strip()
            if text and text not in ignored_values:
                included = True
                break
        has_field.append(included)

    if not has_field:
        # Only deleted/unusable records: avoid dividing by zero.
        return 0.0
    return sum(has_field) / len(has_field)

In [13]:
# Measure, for each repository's XML directory under ../data/xml, the rate of
# records that carry the `description` element of the `dc` namespace.
field = f'{dc}description'
edoc = field_pctg_dir('../data/xml/edoc', field)
depositonce = field_pctg_dir('../data/xml/depositonce', field)
refubium = field_pctg_dir('../data/xml/refubium', field)
# Report each rate rounded to two decimals.
print(f'Edoc: {round(edoc, 2)}')
print(f'Depositonce: {round(depositonce, 2)}')
print(f'Refubium: {round(refubium, 2)}')
# NOTE: "All" is the unweighted mean of the three repository rates, not a rate
# pooled over every record, so each repository weighs the same regardless of size.
print(f'All: {round((edoc + depositonce + refubium) / 3, 2)}')

Edoc: 0.56
Depositonce: 0.99
Refubium: 0.95
All: 0.83
