In [341]:
import argparse
import os.path
import time
import pandas as pd
pd.options.mode.chained_assignment = None 
import datetime
import configparser 
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
print (now)

2018-12-30 23:42


# Load Input CSV file

In [342]:
config = configparser.ConfigParser()
config.sections()
config.read('config.ini')
input_file=config['DEFAULT']['input_file']
print ('Input file:', input_file)

Input file: sample-imput/activities.csv


In [343]:
df_input_file=pd.read_csv(input_file, header=None, delimiter='|', names = ["orcid", "relation_type", "name", "source", "identifier", "file_name"])

In [344]:
df_input_file.describe()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
count,1000,1000,1000,1000,1000,1000
unique,407,2,703,3,691,1000
top,0000-0001-5738-1893,educations,Universidad Central,RINGGOLD,27979,0000-0001-5123-8893_employments_4445793
freq,9,608,7,907,7,1


In [345]:
df_input_file.head()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
0,0000-0002-1484-1893,employments,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_employments_5636083
1,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636056
2,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636066
3,0000-0002-1484-1893,educations,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_educations_5636079
4,0000-0001-5057-7893,employments,University of Oregon College of Education,RINGGOLD,143853,0000-0001-5057-7893_employments_64516


### Check the sources

In [346]:
# Raise error if there are new identifier types
array_acceptable_sources=['RINGGOLD', 'GRID','FUNDREF']
if not set(df_input_file.source.unique()).issubset(array_acceptable_sources):
    odd_item= set(df_input_file.source.unique()).difference(array_acceptable_sources)
    error_message='Unknown Source Type: {} '.format(odd_item)
    raise TypeError(error_message)

In [347]:
# Number of org IDs per source
df=df_input_file.groupby('source')['orcid'].count()
df

source
FUNDREF      32
GRID         61
RINGGOLD    907
Name: orcid, dtype: int64

In [348]:
# Number of unique org IDs per source
df=df_input_file.groupby('source')['identifier'].nunique()
df

source
FUNDREF      23
GRID         45
RINGGOLD    623
Name: identifier, dtype: int64

In [349]:
# Number of unique org name per source
df=df_input_file.groupby('source')['name'].nunique()
df

source
FUNDREF      23
GRID         45
RINGGOLD    636
Name: name, dtype: int64

# Create nodes_orcid_ringgold.csv

In [350]:
df_ringgold = df_input_file[df_input_file['source']=='RINGGOLD']
df_ringgold.head()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
0,0000-0002-1484-1893,employments,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_employments_5636083
1,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636056
2,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636066
3,0000-0002-1484-1893,educations,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_educations_5636079
4,0000-0001-5057-7893,employments,University of Oregon College of Education,RINGGOLD,143853,0000-0001-5057-7893_employments_64516


In [351]:
#Select only three columns that we need.
df_ringgold = df_ringgold[['name','source','identifier']]
df_ringgold.head()

Unnamed: 0,name,source,identifier
0,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
1,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
2,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
3,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
4,University of Oregon College of Education,RINGGOLD,143853


In [352]:
# Remove any duplicate rows except the first one.
df_ringgold=df_ringgold.drop_duplicates(keep='first')
df_ringgold.head()

Unnamed: 0,name,source,identifier
0,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
1,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
3,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
4,University of Oregon College of Education,RINGGOLD,143853
8,Nanjing University,RINGGOLD,12581


In [353]:
df_ringgold['key']='researchgraph.org/ringgold/'+df_ringgold['identifier']
df_ringgold['source']='orcid.org'
df_ringgold['local_id']=df_ringgold['identifier']
df_ringgold['ringgold']=df_ringgold['identifier']
df_ringgold['last_updated']= now
df_ringgold.head()

Unnamed: 0,name,source,identifier,key,local_id,ringgold,last_updated
0,Instituto Politécnico do Porto Escola Superior...,orcid.org,203255,researchgraph.org/ringgold/203255,203255,203255,2018-12-30 23:42
1,Universidade do Porto Faculdade de Desporto,orcid.org,224740,researchgraph.org/ringgold/224740,224740,224740,2018-12-30 23:42
3,Instituto Politécnico do Porto Escola Superior...,orcid.org,203255,researchgraph.org/ringgold/203255,203255,203255,2018-12-30 23:42
4,University of Oregon College of Education,orcid.org,143853,researchgraph.org/ringgold/143853,143853,143853,2018-12-30 23:42
8,Nanjing University,orcid.org,12581,researchgraph.org/ringgold/12581,12581,12581,2018-12-30 23:42


In [354]:
df_ringgold.to_csv('nodes_orcid_ringgold.csv', index=False, columns=['key','source','local_id','name', 'ringgold'], sep="|")

# Create relation_orcid_fundref.csv

In [355]:
df_fundref = df_input_file[df_input_file['source']=='FUNDREF']
df_fundref.head()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
17,0000-0003-0302-7893,educations,University of Twente,FUNDREF,http://dx.doi.org/10.13039/501100001834,0000-0003-0302-7893_educations_1394328
141,0000-0002-4550-9893,educations,Universita degli Studi di Bari Aldo Moro,FUNDREF,http://dx.doi.org/10.13039/501100005362,0000-0002-4550-9893_educations_5356895
167,0000-0001-8287-8893,employments,University of Minnesota,FUNDREF,http://dx.doi.org/10.13039/100007249,0000-0001-8287-8893_employments_4576991
193,0000-0002-6042-6893,employments,Universidad Autónoma de San Luis Potosí,FUNDREF,http://dx.doi.org/10.13039/501100005324,0000-0002-6042-6893_employments_4896520
276,0000-0002-9103-8893,educations,Consejo Superior de Investigaciones Científicas,FUNDREF,http://dx.doi.org/10.13039/501100003339,0000-0002-9103-8893_educations_1514980


In [356]:
df_fundref['from_key']='researchgraph.org/orcid/' + df_fundref['orcid']
#df_fundref=df_fundref.replace("http://dx.doi.org/", "")
#df_fundref=df_fundref.replace("https://dx.doi.org/", "")
df_fundref['identifier'].replace({"https://dx.doi.org/": ""}, inplace=True, reg)
df_fundref['to_uri']= 'researchgraph.org/fundref/' + df_fundref['identifier']
df_fundref['label']=df_fundref['relation_type']
df_fundref.loc[df_fundref['label']=='educations','label']='education'
df_fundref.loc[df_fundref['label']=='employments','label']='employment'
df_fundref.head()


Unnamed: 0,orcid,relation_type,name,source,identifier,file_name,from_key,to_uri,label
17,0000-0003-0302-7893,educations,University of Twente,FUNDREF,http://dx.doi.org/10.13039/501100001834,0000-0003-0302-7893_educations_1394328,researchgraph.org/orcid/0000-0003-0302-7893,researchgraph.org/fundref/http://dx.doi.org/10...,education
141,0000-0002-4550-9893,educations,Universita degli Studi di Bari Aldo Moro,FUNDREF,http://dx.doi.org/10.13039/501100005362,0000-0002-4550-9893_educations_5356895,researchgraph.org/orcid/0000-0002-4550-9893,researchgraph.org/fundref/http://dx.doi.org/10...,education
167,0000-0001-8287-8893,employments,University of Minnesota,FUNDREF,http://dx.doi.org/10.13039/100007249,0000-0001-8287-8893_employments_4576991,researchgraph.org/orcid/0000-0001-8287-8893,researchgraph.org/fundref/http://dx.doi.org/10...,employment
193,0000-0002-6042-6893,employments,Universidad Autónoma de San Luis Potosí,FUNDREF,http://dx.doi.org/10.13039/501100005324,0000-0002-6042-6893_employments_4896520,researchgraph.org/orcid/0000-0002-6042-6893,researchgraph.org/fundref/http://dx.doi.org/10...,employment
276,0000-0002-9103-8893,educations,Consejo Superior de Investigaciones Científicas,FUNDREF,http://dx.doi.org/10.13039/501100003339,0000-0002-9103-8893_educations_1514980,researchgraph.org/orcid/0000-0002-9103-8893,researchgraph.org/fundref/http://dx.doi.org/10...,education


In [357]:
df_fundref.to_csv('relation_orcid_fundref.csv', index=False, columns=['from_key','to_uri','label'], sep="|")