In [415]:
import argparse
import os.path
import time
import pandas as pd
pd.options.mode.chained_assignment = None 
import datetime
import configparser 
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
print (now)

2018-12-31 00:04


# Load Input CSV file
---

In [416]:
config = configparser.ConfigParser()
config.sections()
config.read('config.ini')
input_file=config['DEFAULT']['input_file']
print ('Input file:', input_file)

Input file: sample-imput/activities.csv


In [417]:
df_input_file=pd.read_csv(input_file, header=None, delimiter='|', names = ["orcid", "relation_type", "name", "source", "identifier", "file_name"])

In [418]:
df_input_file.describe()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
count,1000,1000,1000,1000,1000,1000
unique,407,2,703,3,691,1000
top,0000-0001-5738-1893,educations,Universidad Central,RINGGOLD,27979,0000-0001-5123-8893_employments_4445793
freq,9,608,7,907,7,1


In [419]:
df_input_file.head()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
0,0000-0002-1484-1893,employments,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_employments_5636083
1,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636056
2,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636066
3,0000-0002-1484-1893,educations,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_educations_5636079
4,0000-0001-5057-7893,employments,University of Oregon College of Education,RINGGOLD,143853,0000-0001-5057-7893_employments_64516


### Check the sources

In [420]:
# Raise error if there are new identifier types
array_acceptable_sources=['RINGGOLD', 'GRID','FUNDREF']
if not set(df_input_file.source.unique()).issubset(array_acceptable_sources):
    odd_item= set(df_input_file.source.unique()).difference(array_acceptable_sources)
    error_message='Unknown Source Type: {} '.format(odd_item)
    raise TypeError(error_message)

In [421]:
# Number of org IDs per source
df=df_input_file.groupby('source')['orcid'].count()
df

source
FUNDREF      32
GRID         61
RINGGOLD    907
Name: orcid, dtype: int64

In [422]:
# Number of unique org IDs per source
df=df_input_file.groupby('source')['identifier'].nunique()
df

source
FUNDREF      23
GRID         45
RINGGOLD    623
Name: identifier, dtype: int64

In [423]:
# Number of unique org name per source
df=df_input_file.groupby('source')['name'].nunique()
df

source
FUNDREF      23
GRID         45
RINGGOLD    636
Name: name, dtype: int64

# Create nodes_orcid_ringgold.csv
---

In [424]:
df_ringgold = df_input_file[df_input_file['source']=='RINGGOLD']
df_ringgold.head()

Unnamed: 0,orcid,relation_type,name,source,identifier,file_name
0,0000-0002-1484-1893,employments,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_employments_5636083
1,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636056
2,0000-0002-1484-1893,educations,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740,0000-0002-1484-1893_educations_5636066
3,0000-0002-1484-1893,educations,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255,0000-0002-1484-1893_educations_5636079
4,0000-0001-5057-7893,employments,University of Oregon College of Education,RINGGOLD,143853,0000-0001-5057-7893_employments_64516


In [425]:
#Select only three columns that we need.
df_ringgold = df_ringgold[['name','source','identifier']]
df_ringgold.head()

Unnamed: 0,name,source,identifier
0,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
1,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
2,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
3,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
4,University of Oregon College of Education,RINGGOLD,143853


In [426]:
# Remove any duplicate rows except the first one.
df_ringgold=df_ringgold.drop_duplicates(keep='first')
df_ringgold.head()

Unnamed: 0,name,source,identifier
0,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
1,Universidade do Porto Faculdade de Desporto,RINGGOLD,224740
3,Instituto Politécnico do Porto Escola Superior...,RINGGOLD,203255
4,University of Oregon College of Education,RINGGOLD,143853
8,Nanjing University,RINGGOLD,12581


In [427]:
df_ringgold['key']='researchgraph.org/ringgold/'+df_ringgold['identifier']
df_ringgold['source']='orcid.org'
df_ringgold['local_id']=df_ringgold['identifier']
df_ringgold['ringgold']=df_ringgold['identifier']
df_ringgold['last_updated']= now

In [428]:
df_ringgold.to_csv('nodes_orcid_ringgold.csv', index=False, columns=['key','source','local_id','name', 'ringgold'], sep="|")

# Create relation_orcid_fundref.csv
---

In [429]:
df_fundref = df_input_file[df_input_file['source']=='FUNDREF']

In [430]:
df_fundref['from_key']='researchgraph.org/orcid/' + df_fundref['orcid']
df_fundref['identifier'].replace({"http://dx.doi.org/": ""}, inplace=True, regex=True)
df_fundref['to_uri']= 'researchgraph.org/fundref/' + df_fundref['identifier']
df_fundref['label']=df_fundref['relation_type']
df_fundref.loc[df_fundref['label']=='educations','label']='education'
df_fundref.loc[df_fundref['label']=='employments','label']='employment'

In [431]:
df_fundref.to_csv('relation_orcid_fundref.csv', index=False, columns=['from_key','to_uri','label'], sep="|")

# Create relation_orcid_ringold.csv
---

In [432]:
df_ringgold = df_input_file[df_input_file['source']=='RINGGOLD']

In [433]:
df_ringgold['from_key']='researchgraph.org/orcid/' + df_ringgold['orcid']
df_ringgold['to_uri']= 'researchgraph.org/ringgold/' + df_ringgold['identifier']
df_ringgold['label']=df_ringgold['relation_type']
df_ringgold.loc[df_ringgold['label']=='educations','label']='education'
df_ringgold.loc[df_ringgold['label']=='employments','label']='employment'

In [434]:
df_ringgold.to_csv('relation_orcid_ringgold.csv', index=False, columns=['from_key','to_uri','label'], sep="|")

# Create relation_orcid_grid.csv
---

In [435]:
df_grid = df_input_file[df_input_file['source']=='GRID']

In [436]:
df_grid['identifier'].replace({"grid.": ""}, inplace=True, regex=True)
df_grid['from_key']='researchgraph.org/orcid/' + df_grid['orcid']
df_grid['to_uri']= 'researchgraph.org/grid/' + df_grid['identifier']
df_grid['label']=df_grid['relation_type']
df_grid.loc[df_grid['label']=='educations','label']='education'
df_grid.loc[df_grid['label']=='employments','label']='employment'

In [437]:
df_grid.to_csv('relation_orcid_grid.csv', index=False, columns=['from_key','to_uri','label'], sep="|")