## Gateway to Research 


Gateway to Research (GtR) is a system which provides access to information on research grants awarded by UK public bodies.

It offers a Web Interface as well as an API.

Data are updated quarterly (second week of April, July, October and January), and data quality is occasionally improved.



## Step 1 : download data from API 

In [2]:
import requests
import json
import time
import datetime
import math
import os
import pandas as pd 


In [4]:
# URL template of each retrievable entity: {} receives the page number,
# s=100 asks for 100 records per page.
## N.B: only projects, organisations , persons and funds have been tested in this iteration
_GTR_URL_TEMPLATES = {
    'projects': 'https://gtr.ukri.org/gtr/api/projects.json?p={}&s=100',
    'organisations': 'https://gtr.ukri.org/gtr/api/organisations.json?p={}&s=100',
    'persons': 'https://gtr.ukri.org/gtr/api/persons.json?p={}&s=100',
    'funds': 'https://gtr.ukri.org/gtr/api/funds.json?p={}&s=100',
    'publications': 'https://gtr.ukri.org/gtr/api/outcomes/publications.json?p={}&s=100',
    'ipr': 'https://gtr.ukri.org/gtr/api/outcomes/intellectualproperties.json?p={}&s=100',
}


def _gtr_fetch_and_save_page(xUrlBase, xPageNr, xDestFolder):
    '''Download one result page, save it as JSON in xDestFolder and return the parsed page.'''
    q1 = requests.get(xUrlBase.format(xPageNr), timeout=60)
    # fail on HTTP errors instead of trying to parse an error body
    q1.raise_for_status()
    q2 = json.loads(q1.content)

    # file name = page number reported by the API + download timestamp
    xResultPage = str(q2['page'])
    x_time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    xFileName = os.path.join(xDestFolder, xResultPage + '_' + x_time_stamp)
    with open(xFileName, 'w') as f:
        json.dump(q2, f)
    return q2


def gtr_get_data(xDestFolder, entities='projects'):
    '''
    Bulk retrieve data from Gateway to Research (GtR) and save every result
    page as one JSON file in xDestFolder.

    entities can be
    - 'projects',
    - 'organisations',
    - 'persons',
    - 'funds',
    - 'publications',
    - 'ipr'

    There are other interesting data which are not considered now,
    such as spinouts, impact summaries etc ...

    Exact URL specifications can be found here
    https://gtr.ukri.org/resources/GtR-2-API-v1.7.5.pdf

    PARAMETERS:
    xDestFolder : existing folder that receives the files; each file is
                  named '<page number>_<timestamp>'.
    entities    : which entity type to download (see list above).

    RETURN:
    None. Progress is printed every 100 pages; pages that fail are reported
    and skipped, and are listed again once the crawl is finished.

    RAISES:
    ValueError if entities is not one of the supported names.
    Any error raised while fetching or saving page 1 is propagated, because
    the total number of pages is read from that page.
    '''
    if entities not in _GTR_URL_TEMPLATES:
        raise ValueError('unknown entities {!r}; expected one of: {}'.format(
            entities, ', '.join(sorted(_GTR_URL_TEMPLATES))))
    xUrlBase = _GTR_URL_TEMPLATES[entities]

    # Page 1 : also gives the total number of pages
    q2 = _gtr_fetch_and_save_page(xUrlBase, 1, xDestFolder)
    xTotalPageNr = q2['totalPages']
    print('total number of pages to fech:', xTotalPageNr)

    # remaining pages : 2 .. xTotalPageNr, the last page included
    xFailedPages = []
    for xPageNr in range(2, xTotalPageNr + 1):
        try:
            # pause between requests so the API is not hammered
            time.sleep(2)
            _gtr_fetch_and_save_page(xUrlBase, xPageNr, xDestFolder)

            if xPageNr % 100 == 0:
                print('page_done: ', xPageNr)

        except Exception as xErr:
            # best effort: report and carry on; Exception (not a bare except)
            # so that an interrupt still stops the crawl
            xFailedPages.append(xPageNr)
            print('error on page:', xPageNr, '-', repr(xErr))

    if xFailedPages:
        print('pages that could not be fetched:', xFailedPages)

    print('GTR {} data fetched and saved !'.format(entities))

In [5]:
# Destination folders for the raw JSON pages, one folder per GtR entity
(xFld_projects,
 xFld_persons,
 xFld_organisations,
 xFld_funds) = (
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_projects/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_persons/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_organisations/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_funds/',
)

# The downloads are left commented out; uncomment the entity to (re)fetch.
#gtr_get_data(xDestFolder = xFld_projects, entities='projects')
#gtr_get_data(xDestFolder = xFld_persons, entities='persons')
#gtr_get_data(xDestFolder = xFld_organisations, entities='organisations')
#gtr_get_data(xDestFolder = xFld_funds, entities='funds')

total number of pages to fech: 1437
page_done:  100
page_done:  200
page_done:  300
page_done:  400
page_done:  500
page_done:  600
page_done:  700
page_done:  800
page_done:  900
page_done:  1000
page_done:  1100
page_done:  1200
page_done:  1300
page_done:  1400
GTR projects data fetched and saved !


In [None]:
## !!! TODO: check a possible problem with projects: the total number does not match

In [3]:
# print every multiple of 100 between 1 and 1000
for i in range(100, 1001, 100):
    print(i)

100
200
300
400
500
600
700
800
900
1000


## STEP 2 : parse the downloaded data and save into an SQL database

In [6]:
## parse project 


def _gtr_join_pairs(xItems, xKeyA, xKeyB):
    '''Render a list of dicts as "a_b;a_b;..." from the two given keys of each dict.'''
    return ';'.join([x[xKeyA] + '_' + x[xKeyB] for x in xItems])


def gtr_parse_projects(xFile):
    '''
    Parse one downloaded GtR page of projects.

    PARAMETERS:
    xFile : path of a JSON file holding one page of the projects end point.

    RETURN:
    a dictionary with three lists of dictionaries
    - 'grant_data'        : one row per project (see xFields_project);
                            list-valued fields are flattened to 'a_b;a_b' strings
                            and fields absent from the record stay None
    - 'grant_links'       : one row per project link ('id' of the project, 'href', 'rel')
    - 'grant_identifiers' : one row per project identifier, with the project 'id' added
    A page without any project, and a project without links or identifiers,
    simply contribute no rows.

    #N.B: metadata on relations such as PI, Funding Organisation or Performing Organisation
    # should be fetched separately
    #for example given their IDs

    RAISES:
    KeyError if a project record has no 'id'.
    '''

    xlst_links_res = []  # results links
    xlst_projectData = []
    xlst_identifiers = []

    with open(xFile, 'r') as ff:
        xDataJson = json.load(ff)
    # a page without the 'project' key is treated as an empty page
    xlst_projects = xDataJson.get('project') or []

    xFields_project = ['id',
                       'created',
                       'href',
                       'status',
                       'title',
                       'grantCategory',
                       'abstractText',
                       #'potentialImpact',
                       'leadFunder',
                       'identifiers',
                       'researchTopics',
                       'researchSubjects',
                       'healthCategories',
                       'researchActivities',
                       'leadOrganisationDepartment'
                       ]
    # fields copied as they are
    xFields_plain = ['created', 'href', 'status', 'title', 'grantCategory', 'abstractText']
    # fields copied only when they hold a non-empty value
    xFields_optional = ['leadOrganisationDepartment', 'leadFunder']
    # (field, key of the inner list, first key, second key) of the list-valued fields
    xFields_lists = [('identifiers', 'identifier', 'type', 'value'),
                     ('researchTopics', 'researchTopic', 'id', 'text'),
                     ('researchSubjects', 'researchSubject', 'id', 'text'),
                     ('healthCategories', 'healthCategory', 'id', 'text'),
                     ('researchActivities', 'researchActivity', 'id', 'text')]
    xFields_links = ['id', 'href', 'rel']

    for x_project in xlst_projects:
        x_id = x_project['id']

        x_data_project = dict.fromkeys(xFields_project)
        x_data_project['id'] = x_id
        for xField in xFields_plain:
            x_data_project[xField] = x_project.get(xField)

        for xField in xFields_optional:
            if x_project.get(xField):
                x_data_project[xField] = x_project[xField]

        for xField, xItemKey, xKeyA, xKeyB in xFields_lists:
            xWrapper = x_project.get(xField)
            if xWrapper:
                x_data_project[xField] = _gtr_join_pairs(xWrapper.get(xItemKey) or [], xKeyA, xKeyB)

        xlst_projectData.append(x_data_project)

        # process links
        for xDict in (x_project.get('links') or {}).get('link') or []:
            xDictLink = dict.fromkeys(xFields_links)
            xDictLink['id'] = x_id
            xDictLink['href'] = xDict['href']
            xDictLink['rel'] = xDict['rel']
            xlst_links_res.append(xDictLink)

        # process identifiers: work on a copy so the parsed JSON is not modified
        for xDict_id in (x_project.get('identifiers') or {}).get('identifier') or []:
            xlst_identifiers.append(dict(xDict_id, id=x_id))

    return {'grant_data': xlst_projectData,
            'grant_links': xlst_links_res,
            'grant_identifiers': xlst_identifiers}

In [None]:
## parse persons 

def gtr_parse_person(xFile):
    '''
    Parse one downloaded GtR page of persons.

    xFile : path of a JSON file whose 'person' entry is a list of records.

    RETURN: a list with one dictionary per person, holding
    person_id, person_name_last, person_name_first and person_name_others.
    '''
    with open(xFile, 'r') as xHandle:
        xRaw = xHandle.read()
    xPersons = json.loads(xRaw)['person']

    # output column -> key of the GtR person record
    xColumns = (('person_id', 'id'),
                ('person_name_last', 'surname'),
                ('person_name_first', 'firstName'),
                ('person_name_others', 'otherNames'))

    return [{xCol: xPerson[xKey] for xCol, xKey in xColumns}
            for xPerson in xPersons]

In [None]:
## parse organisations 

def gtr_parse_org(xFile):
    '''
    Parse one downloaded GtR page of organisations.

    xFile : path of a JSON file whose 'organisation' entry is a list of records.

    RETURN: a list with one dictionary per organisation, holding org_id and
    org_name plus county, region, postcode and first line of the address of
    type 'MAIN_ADDRESS'; address values that are missing or empty stay None.
    '''
    with open(xFile, 'r') as xHandle:
        xRaw = xHandle.read()
    xOrganisations = json.loads(xRaw)['organisation']

    # output column -> key of the GtR address record
    xAddressColumns = (('address_main_county', 'county'),
                       ('address_main_region', 'region'),
                       ('address_main_postcode', 'postCode'),
                       ('address_main_line1', 'line1'))

    xRows = []
    for xOrg in xOrganisations:
        xRow = {'org_id': xOrg['id'], 'org_name': xOrg['name']}
        for xCol, _ in xAddressColumns:
            xRow[xCol] = None

        xAddresses = (xOrg.get('addresses') or {}).get('address') or []
        for xAddress in xAddresses:
            if xAddress['type'] != 'MAIN_ADDRESS':
                continue
            for xCol, xKey in xAddressColumns:
                xValue = xAddress.get(xKey)
                if xValue:
                    xRow[xCol] = xValue

        xRows.append(xRow)

    return xRows

In [15]:
def gtr_parse_funds(xFile):
    '''
    Parse one downloaded GtR page of funds.

    xFile : path of a JSON file whose 'fund' entry is a list of records.

    RETURN: a dictionary with two lists
    - 'funds_data'  : one flat dictionary per fund; nested dictionaries are
                      flattened by pd.json_normalize with '_' between the
                      levels, and the 'links' entry is left out
    - 'funds_links' : one dictionary per fund link (without 'otherAttributes'),
                      with the id of the fund added as 'id_fund'
    '''
    with open(xFile, 'r') as xHandle:
        xRaw = xHandle.read()
    xPayload = json.loads(xRaw)

    xFundRows = []
    xLinkRows = []

    for xFund in xPayload.get('fund'):
        xFundId = xFund.get('id')

        # the fund itself: everything but the links, flattened to one level
        xFundOnly = {k: v for k, v in xFund.items() if k != 'links'}
        xFlat = pd.json_normalize(xFundOnly, sep='_').to_dict(orient='records')[0]
        xFundRows.append(xFlat)

        # the links of the fund, each tagged with the id of the fund
        xLinks = xFund.get('links')
        if not xLinks:
            continue
        for xLink in xLinks.get('link'):
            xLinkRow = {k: v for k, v in xLink.items() if k != 'otherAttributes'}
            xLinkRow['id_fund'] = xFundId
            xLinkRows.append(xLinkRow)

    return {'funds_data': xFundRows, 'funds_links': xLinkRows}
    

### Parse and Save in DB 

In [7]:
# Folders holding the downloaded JSON pages that the parsers read, one per entity
(xFld_project,
 xFld_org,
 xFld_person,
 xFld_funds) = (
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_projects/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_organisations/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_persons/',
    '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_funds/',
)

In [48]:
## Database : SQLite file that receives the parsed tables
xPath_DB = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/'
# full path of the database file, then the connection string built from it
xDB = ''.join((xPath_DB, 'grants_datasets.db'))
xDBCon = ''.join(('sqlite:///', xDB))

In [9]:
# projects : parse every downloaded page, then save the three result tables

xfiles = [xFld_project + x for x in os.listdir(xFld_project)]#[0:10]
print('files to process:', len(xfiles))

xRES_proj, xRES_links, xRES_ids = [], [], []
for xfile in xfiles:
    xdict_res = gtr_parse_projects(xFile = xfile)
    xRES_proj.extend(xdict_res.get('grant_data'))
    xRES_links.extend(xdict_res.get('grant_links'))
    xRES_ids.extend(xdict_res.get('grant_identifiers'))

t1 = pd.DataFrame(xRES_proj)
t2 = pd.DataFrame(xRES_links)
t3 = pd.DataFrame(xRES_ids)

for xlabel, xframe in (('projects:', t1),
                       ('projects_links:', t2),
                       ('projects_identifiers:', t3)):
    print(xlabel, len(xframe))

## save : each table replaces the one written by a previous run
for xtable, xframe in (('gtr_grants', t1),
                       ('gtr_grants_links', t2),
                       ('gtr_grants_identifiers', t3)):
    xframe.to_sql(name = xtable, con = xDBCon, if_exists='replace', index=False, chunksize=10000)


print('OK')


files to process: 1436
projects: 143600
projects_links: 2748303
projects_identifiers: 144227
OK


In [9]:
# show the project identifiers DataFrame (columns value, type and project id)
t3

Unnamed: 0,value,type,id
0,ES/M002470/2,RCUK,DE018461-A71F-4E29-BEA4-58A3F15853E2
1,10040413,RCUK,DE520E29-AA77-4E73-9498-56BE7D0703A8
2,2749757,RCUK,DEB26EF6-4186-4A28-A7BE-56F9EE6DBD4E
3,EP/X027392/1,RCUK,BD37896F-78AA-4952-88A5-494F659554CE
4,508746,RCUK,BD8D518D-D087-4194-8BAE-496C498C513D
...,...,...,...
95,ES/I038187/1,RCUK,8F2DA6F5-D816-47C3-8A07-53595784F1CF
96,ST/V002058/1,RCUK,8F5532E4-53F5-4342-82EA-53F85E1CEEDE
97,2875359,RCUK,2BCE2C91-159E-4785-877E-53F5F55DC9F6
98,MR/X012794/1,RCUK,2D0907C2-F497-40FE-B5D5-529BA1C09004


In [None]:
# person : parse every downloaded page of persons and save the result table

xRES_person = []
xfiles = [xFld_person + x for x in os.listdir(xFld_person)]
print(len(xfiles))
for xfile in xfiles:
    xlist_res = gtr_parse_person(xFile = xfile )
    xRES_person.extend(xlist_res )

t1 = pd.DataFrame(xRES_person)

# the rows counted here are persons (the label used to say 'projects:')
print('persons:', len(t1))

## save : replaces the table written by a previous run
t1.to_sql(name = 'gtr_persons', con = xDBCon, if_exists='replace', index=False, chunksize=10000)

print('OK')



In [None]:
# org : parse every downloaded page of organisations and save the result table

xRES_org = []
xfiles = [xFld_org  + x for x in os.listdir(xFld_org )]
print(len(xfiles))
for xfile in xfiles:
    xlist_res = gtr_parse_org(xFile = xfile )
    xRES_org.extend(xlist_res )

t1 = pd.DataFrame(xRES_org)

# the rows counted here are organisations (the label used to say 'projects:')
print('organisations:', len(t1))

## save : replaces the table written by a previous run
t1.to_sql(name = 'gtr_organisations', con = xDBCon, if_exists='replace', index=False, chunksize=10000)

print('OK')


In [16]:
## funds : parse every downloaded page of funds and save the two result tables

xRES_funds = []
xRES_funds_links = []

xfiles = [xFld_funds  + x for x in os.listdir(xFld_funds )]
print('files to process:', len(xfiles))
for xfile in xfiles:
    xdict_res = gtr_parse_funds(xFile = xfile )
    xfunds_lst = xdict_res.get('funds_data')
    xlinks_lst  = xdict_res.get('funds_links')
    xRES_funds.extend(xfunds_lst)
    xRES_funds_links.extend(xlinks_lst)


t1 = pd.DataFrame(xRES_funds)
t2 = pd.DataFrame(xRES_funds_links)

# the rows counted here are funds and fund links (the labels used to say 'projects')
print('funds:', len(t1))
print('funds_links:', len(t2))


## save : each table replaces the one written by a previous run
t1.to_sql(name = 'gtr_funds', con = xDBCon, if_exists='replace', index=False, chunksize=10000)
t2.to_sql(name = 'gtr_funds_links', con = xDBCon, if_exists='replace', index=False, chunksize=10000)


print('OK')



files to process: 1444
projects: 144400
projects_links: 288800
OK


In [None]:

-- SQL statements (not Python): run them against the SQLite database once the
-- gtr_* tables have been saved.

-- funds
create index idxgtr_funds_id  on gtr_funds (id);
create index idxgtr_funds_cat  on gtr_funds (category);

create index idxgtrfundslinks_id on gtr_funds_links (id_fund);
create index idxgtrfundslinks_rel on gtr_funds_links (rel);


-- grants
create index idxgtr_grants_id  on gtr_grants (id);

-- grant identifiers: these two indexes belong to gtr_grants_identifiers,
-- the table that holds the "type" column (gtr_grants has none)
create index idxgtr_identifiers_id  on gtr_grants_identifiers (id);
create index idxgtr_identifiers_type  on gtr_grants_identifiers ("type");

create index idxgtr_grantslinks_id  on gtr_grants_links (id);
create index idxgtr_grantslinks_rel  on gtr_grants_links (rel);

-- organisations and persons
create index idxgtr_org_id  on gtr_organisations (org_id);
create index idxgtr_pers_id  on gtr_persons (person_id);




## PUBLICATIONS 

In [None]:
# get 
xFld_pubs  = '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_results/_publications/'

#gtr_get_data(xDestFolder = xFld_pubs , entities='publications')

In [None]:
# parse 

In [55]:
def process_pubs(xfile):
    '''
    Extract publication data and links from one downloaded GtR page.

    PARAMETERS:
    xfile : path of a JSON file holding one page of the publications end point.

    RETURN: a tuple of two lists of dictionaries
    - the publications, reduced to the fields listed in xfields
    - the links of the publications, reduced to the fields listed in
      xfields_links, each with the id of its publication as 'pub_id'
    A page without any publication, and a publication without links,
    simply contribute no rows.
    '''
    # atomic fields kept for a publication (sets: membership is tested for every key)
    xfields = frozenset(['ext', 'id', 'outcomeid', 'href', 'created', 'updated', 'title', 'type',
           'abstractText', 'otherInformation', 'journalTitle', 'datePublished',
           'publicationUrl', 'pubMedId', 'isbn', 'issn', 'seriesNumber', 'seriesTitle',
           'subTitle', 'volumeTitle', 'doi', 'volumeNumber', 'issue', 'totalPages', 'edition',
           'chapterNumber', 'chapterTitle', 'pageReference', 'conferenceEvent', 'conferenceLocation',
           'conferenceNumber', 'author'])

    # fields kept for a link
    xfields_links = frozenset(['href', 'rel', 'start', 'end', 'pub_id'])

    with open(xfile, 'r') as xff:
        xdata = json.load(xff)

    xRES_lst_pubs = []
    xRES_lst_pubs_links = []

    # a page without the 'publication' key is treated as an empty page
    for xrec in xdata.get('publication') or []:
        xid = xrec.get('id')

        # record of the publication : atomic values only
        xRES_lst_pubs.append({k: v for k, v in xrec.items() if k in xfields})

        # links : a publication without links contributes none
        for xlink in (xrec.get('links') or {}).get('link') or []:
            xrec_link = {k: v for k, v in xlink.items() if k in xfields_links}
            xrec_link['pub_id'] = xid
            xRES_lst_pubs_links.append(xrec_link)

    return  xRES_lst_pubs, xRES_lst_pubs_links

In [60]:
# publications : parse every downloaded page, then save the two result tables
xFld_pubs  = '/media/mike/DATA4T/_data_repository/__others/__funding_data/1000_GTR/_results/_publications/'

xfiles = [xFld_pubs + x for x in os.listdir(xFld_pubs)]

xlst_pubs, xlst_pubs_links = [], []

for xfile in xfiles: #random.sample(xfiles, 50):
    q1, q2 = process_pubs(xfile)
    xlst_pubs += q1
    xlst_pubs_links += q2

t1 = pd.DataFrame(xlst_pubs)
t2 = pd.DataFrame(xlst_pubs_links)

# each table replaces the one written by a previous run
for xtable, xframe in (('gtr_results_publications', t1),
                       ('gtr_results_publications_links', t2)):
    xframe.to_sql(name = xtable, con = xDBCon, if_exists='replace', index=False, chunksize=10000)


print('saved, publications', len(t1))
print('saved, publications links ', len(t2))



saved, publications 984100
saved, publications links  984100


OK
