# Dataverse API workflow

Agenda:

-  Get the list of DOIs of R packages

-  For each package get a list of files that it contains

-  Download the files

In [1]:
import json
import os
import re

import requests

In [2]:
# Configuration constants for the Dataverse scrape.
#
# The API token is a credential, so it is read from the DATAVERSE_API_KEY
# environment variable rather than being committed in the notebook. Any token
# that was previously committed here should be treated as leaked and
# regenerated. When the variable is unset the value is None; requests omits
# None-valued query parameters, so calls are then made anonymously.
dataverse_key = os.environ.get("DATAVERSE_API_KEY")
max_retries = 5

# Search query selecting files whose content type is R syntax.
r_file_query = "fileContentType:type/x-r-syntax"

# Mutable scraping state: current result page and the DOIs collected so far.
page_num = 0
r_dois = []

# 1. Get the list of DOIs

In [3]:
# Fetch one page (up to 1000 hits) of file-level search results for the R query.
# `start` is the offset of the first hit, so page N begins at 1000 * N.
search_response = requests.get(
    "https://dataverse.harvard.edu/api/search/",
    params={
        "q": r_file_query,
        "type": "file",
        "key": dataverse_key,
        "start": str(1000 * page_num),
        "per_page": str(1000),
    },
    timeout=60,  # do not let a stalled connection hang the notebook forever
)
# Surface HTTP errors directly instead of as a KeyError / JSON error below.
search_response.raise_for_status()
myresults = search_response.json()['data']['items']

print("Parsing results from page {}...".format(page_num))

Parsing results from page 0...


## Extract the DOI (if any) from each result

In [4]:
# Collect the dataset DOI of every search hit into r_dois, one newline-terminated
# entry per hit; hits whose persistent id holds no "doi:..." token are skipped.
doi_pattern = re.compile("(doi:[^,]*)")
for myresult in myresults:
    doi_match = doi_pattern.search(myresult['dataset_persistent_id'])
    if not doi_match:
        continue
    r_dois.append(doi_match.group(1) + '\n')

In [5]:
# Inspect a single search hit (index 1) to see which fields the API returns.
myresults[1]

{u'checksum': {u'type': u'MD5', u'value': u'4b077329d83599e1f59cc779d9477a02'},
 u'dataset_citation': u'Dolezal, Martin; Ennser-Jedenastik, Laurenz; M\xc3\xbcller, Wolfgang C.; Winkler, Anna Katharina, 2016, "Replication data for: Analyzing Manifestos in their Electoral Context: A New Approach Applied to Austria, 2002\xe2\x80\x932008", https://doi.org/10.7910/DVN/27864, Harvard Dataverse, V1',
 u'dataset_id': u'46890',
 u'dataset_name': u'Replication data for: Analyzing Manifestos in their Electoral Context: A New Approach Applied to Austria, 2002\xe2\x80\x932008',
 u'dataset_persistent_id': u'doi:10.7910/DVN/27864',
 u'description': u'',
 u'file_content_type': u'type/x-r-syntax',
 u'file_id': u'2507267',
 u'file_persistent_id': u'doi:10.7910/DVN/27864/EDKVG8',
 u'file_type': u'R Syntax',
 u'md5': u'4b077329d83599e1f59cc779d9477a02',
 u'name': u'replication appendix.r',
 u'published_at': u'2016-03-11T17:54:53Z',
 u'size_in_bytes': 26833,
 u'type': u'file',
 u'url': u'https://dataverse.

In [6]:
# Pick one collected DOI to use in the following steps. Entries in r_dois are
# stored with a trailing '\n', so `doi` carries that newline too.
doi = r_dois[1]
doi

u'doi:10.7910/DVN/27864\n'

# 2. Get the list of files in the package

In [7]:
# Request the dataset's metadata export (dataverse_json format) for the chosen DOI.
# `doi` is stored with a trailing '\n'; strip it so the newline never reaches the
# query string, and pass every query argument through `params` so that requests
# URL-encodes them.
files = requests.get(
    "https://dataverse.harvard.edu/api/datasets/export",
    params={
        "exporter": "dataverse_json",
        "persistentId": doi.strip(),
        "key": dataverse_key,
    },
)
# Fail here on an HTTP error rather than later when the body is parsed as JSON.
files.raise_for_status()

In [12]:
# List the top-level keys of the JSON payload returned for the dataset.
files.json().keys()

[u'publisher',
 u'protocol',
 u'authority',
 u'datasetVersion',
 u'publicationDate',
 u'persistentUrl',
 u'identifier',
 u'id']

In [13]:
# Decode the export response body into a Python dict. requests' own JSON decoder
# is used, as in the key listing above, so no import is needed mid-notebook.
json_data = files.json()

## Find the list of files in the parsed JSON dict

In [14]:
# The file listing sits under the 'files' entry of the export's 'datasetVersion'.
dataset_version = json_data['datasetVersion']
files_list = dataset_version['files']

In [15]:
# Look at one entry of the file list (index 1) to see its structure.
files_list[1]

{u'dataFile': {u'contentType': u'text/plain; charset=US-ASCII',
  u'description': u'',
  u'filename': u'crimpunish.csv',
  u'id': 2507229,
  u'md5': u'7f9aec0fb547c6dd39edff91b5a85279',
  u'originalFormatLabel': u'UNKNOWN',
  u'storageIdentifier': u'266210'},
 u'datasetVersionId': 42985,
 u'description': u'',
 u'label': u'crimpunish.csv',
 u'version': 1}

# 3. Get each file contents

In [16]:
# Take one file entry and read its name and numeric id from the nested
# 'dataFile' record; the id is echoed as the cell output.
f = files_list[1]
data_file = f['dataFile']
filename, fileid = data_file['filename'], data_file['id']
fileid

2507229

In [17]:
# Download the file's contents through the data-access endpoint, addressed by file id.
datafile_url = "https://dataverse.harvard.edu/api/access/datafile/" + str(fileid)
response = requests.get(datafile_url, params={"key": dataverse_key})

In [18]:
# Display the Response object to check the HTTP status of the download request.
response

<Response [200]>