# Get metadata from Dataverse for each DOI

In [1]:
import requests as req
import json
import math
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
from string import Template
# Search-API URL template; ${doi} is filled in per dataset and facets are
# requested so the subject labels come back with the hits.
query_base = Template(
    'https://dataverse.harvard.edu/api/search'
    '?q="${doi}"&show_facets=true'
)

In [27]:
import pandas
# The DOI list file has no header row, so the single column is named here.
unique_dois = pandas.read_csv(
    "../get-dois/dataset_dois.txt",
    names=['DOI'],
)

In [28]:
# Preview the first five rows of the DOI table.
unique_dois.head(n=5)

Unnamed: 0,DOI
0,doi:10.7910/DVN/U3QJQZ
1,doi:10.7910/DVN/Z7H44N
2,doi:10.7910/DVN/HRLHA4
3,doi:10.7910/DVN/RJWU7A
4,doi:10.7910/DVN/FAAMAX


In [29]:
# Flatten the 'DOI' column into a plain Python list.
# NOTE: this rebinds unique_dois from a DataFrame to a list, so the cell
# cannot be re-run without first re-running the cell that loads the file.
unique_dois = unique_dois['DOI'].tolist()

In [30]:
# Number of DOIs that will be looked up.
len(unique_dois)

2170

In [31]:
# Try the search on the first DOI only, to inspect the response structure.
first_doi = unique_dois[0]
query = query_base.substitute({'doi': first_doi})
res = req.get(url=query)
res_dict = json.loads(res.text)

In [32]:
# Keep only the first four characters of the first hit's 'published_at'
# value (displayed below as the publication year).
first_hit = res_dict['data']['items'][0]
time = str(first_hit['published_at'][:4])
time

'2020'

In [None]:
def _extract_metadata_fields(doi, res_dict):
    """Build the CSV fields for one DOI from a decoded search response.

    Parameters
    ----------
    doi : str
        The DOI that was searched for.
    res_dict : dict
        Decoded JSON body returned by the search request.

    Returns
    -------
    list or None
        ``[doi, year, publisher, subject]`` taken from the first search hit,
        with commas inside publisher/subject replaced by ';' so they cannot
        break the comma-separated line. ``None`` when the response holds no
        hit at all, so the caller can skip the DOI instead of emitting a row
        filled with values left over from the previous DOI.
    """
    data = res_dict.get('data') or {}
    items = data.get('items') or []
    if not items:
        return None
    first_hit = items[0]

    # Only the first four characters of 'published_at' are kept.
    year = (first_hit.get('published_at') or 'NA')[:4]
    publisher = first_hit.get('identifier_of_dataverse') or 'NA'

    try:
        labels = data['facets'][0]['subject_ss']['labels']
        # The first key of each label dict is used as the subject name.
        subject = ";".join(list(label)[0] for label in labels)
    except (KeyError, IndexError):
        # No subject facet in the response: record an explicit placeholder
        # rather than reusing the subject of the previously processed DOI.
        subject = "NA"

    return [doi.strip(), year, publisher.replace(",", ";"), subject.replace(",", ";")]


# Collect one CSV line per DOI and join once at the end.
metadata_lines = [','.join(["DOI", "publicationDate", "publisher", "subject"])]
for doi in unique_dois:
    query = query_base.substitute(doi=doi)

    res = req.get(query)
    res_dict = json.loads(res.text)

    fields = _extract_metadata_fields(doi, res_dict)
    if fields is None:
        print("No Dataverse search hit for %s; skipping" % doi)
        continue
    metadata_lines.append(','.join(fields))

# Plain '\n' terminators: the string is later written through a file handle,
# and a platform-specific os.linesep would be translated a second time there.
all_metadata = "\n".join(metadata_lines) + "\n"

In [34]:
import pandas as pd 
from StringIO import StringIO

# Parse the in-memory CSV text (header row included) into a DataFrame
# and preview the first five rows.
metadata_buffer = StringIO(all_metadata)
df = pd.read_csv(metadata_buffer)
df.head(n=5)

Unnamed: 0,DOI,publicationDate,publisher,subject
0,doi:10.7910/DVN/U3QJQZ,2020,harvard,Medicine; Health and Life Sciences
1,doi:10.7910/DVN/Z7H44N,2017,PSRM,Social Sciences
2,doi:10.7910/DVN/HRLHA4,2016,pan,Mathematical Sciences;Social Sciences
3,doi:10.7910/DVN/RJWU7A,2015,intertransferuschina,Social Sciences
4,doi:10.7910/DVN/FAAMAX,2018,BLS-PNAS,Social Sciences


In [35]:
# Number of metadata rows that were parsed from all_metadata.
len(df)

285

In [None]:
# encode() yields bytes, so the file is opened in binary mode: a text-mode
# handle rejects bytes on Python 3 and re-translates line endings on Windows.
with open('data/all_metadata.txt', 'wb') as out_file:
    out_file.write(all_metadata.encode('utf8'))

# Get publisher metadata from Dataverse

In [1]:
import pandas as pd
# Load the per-DOI metadata table from disk and preview the first five rows.
metadata = pd.read_csv(filepath_or_buffer="data/all_metadata.txt")
metadata.head(n=5)

Unnamed: 0,doi,publicationDate,publisher,subject
0,doi:10.7910/DVN/U3QJQZ,2020,harvard,Medicine; Health and Life Sciences
1,doi:10.7910/DVN/HRLHA4,2016,pan,Mathematical Sciences;Social Sciences
2,doi:10.7910/DVN/RJWU7A,2015,intertransferuschina,Social Sciences
3,doi:10.7910/DVN/FAAMAX,2018,BLS-PNAS,Social Sciences
4,doi:10.7910/DVN/PAHRCK,2019,internationalinteractions,Social Sciences


In [2]:
# Distinct dataverse identifiers found in the 'publisher' column;
# show the first ten.
publisher_column = metadata['publisher']
unique = publisher_column.unique()
unique[:10]

array(['harvard', 'pan', 'intertransferuschina', 'BLS-PNAS',
       'internationalinteractions', 'isq', 'monogan', 'govdept', 'restat',
       'palcomms'], dtype=object)

In [3]:
from string import Template
import requests as req
import json
import os

# URL template for looking up one dataverse by its identifier.
qbase = Template('https://dataverse.harvard.edu/api/dataverses/${iddv}')

# Collect one tab-separated line (identifier, type, name) per dataverse and
# join once at the end.
journal_lines = []
for iddv in unique:
    query = qbase.substitute(iddv=iddv)
    res = req.get(query)
    res_dict = json.loads(res.text)

    try:
        name = res_dict['data']['name']
        dvtype = res_dict['data']['dataverseType']

        journal_lines.append('\t'.join([iddv.strip(), dvtype, name]))
    except (KeyError, TypeError):
        # Failed lookups come back as a status/message payload without the
        # expected fields; show it and move on. Only lookup failures are
        # caught here, so interrupts and genuine bugs still surface.
        print(res_dict)

all_journals = ''.join(line + os.linesep for line in journal_lines)

{u'status': u'ERROR', u'message': u"Can't find dataverse with identifier='palcomms'"}
{u'status': u'ERROR', u'message': u"Can't find dataverse with identifier='SPSRleemann2015'"}


In [4]:
from StringIO import StringIO
# all_journals has no header row, so the column names are supplied here.
journal_buffer = StringIO(all_journals)
dj = pd.read_csv(journal_buffer, sep='\t', names=['publisher', 'type', 'name'])

dj.head(n=5)

Unnamed: 0,publisher,type,name
0,harvard,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse
1,pan,JOURNALS,Political Analysis Dataverse
2,intertransferuschina,RESEARCH_PROJECTS,Behind Parent-Child Relationship and Intergene...
3,BLS-PNAS,RESEARCH_PROJECTS,Bechtel Liesch Scheve PNAS
4,internationalinteractions,JOURNALS,International Interactions (II): Empirical and...


In [5]:
# Row count of the per-DOI metadata table, for comparison with the merge result.
len(metadata)

2002

In [6]:
# Number of dataverses whose lookup succeeded.
len(dj)

288

In [7]:
# Attach the dataverse type and name to each DOI row through the shared
# 'publisher' key (default join type).
merged = metadata.merge(dj, on='publisher')

In [8]:
# Row count after the merge; compare with len(metadata) to see how many
# DOI rows found no matching publisher entry.
len(merged)

1998

In [9]:
# Preview the first five merged rows.
merged.head(n=5)

Unnamed: 0,doi,publicationDate,publisher,subject,type,name
0,doi:10.7910/DVN/U3QJQZ,2020,harvard,Medicine; Health and Life Sciences,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse
1,doi:10.7910/DVN/6KEXM7,2020,harvard,Social Sciences,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse
2,doi:10.7910/DVN/EEUTHP,2019,harvard,Social Sciences,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse
3,doi:10.7910/DVN/WBI9RT,2015,harvard,Social Sciences,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse
4,doi:10.7910/DVN/N0PBQ9,2015,harvard,Social Sciences,ORGANIZATIONS_INSTITUTIONS,Harvard Dataverse


In [10]:
# Persist the merged table as CSV (the row index is written as well).
merged.to_csv(path_or_buf='data/metadata_merged.csv')