## EU Funding Data 

## STEP 1 : download the datasets 


#### project data #### 

wget https://cordis.europa.eu/data/FP1/cordis-fp1projects.xlsx

wget https://cordis.europa.eu/data/FP1/cordis-fp1organizations.xlsx

wget https://cordis.europa.eu/data/FP2/cordis-fp2projects.xlsx

wget https://cordis.europa.eu/data/FP2/cordis-fp2organizations.xlsx

wget https://cordis.europa.eu/data/FP3/cordis-fp3projects.xlsx

wget https://cordis.europa.eu/data/FP3/cordis-fp3organizations.xlsx

wget https://cordis.europa.eu/data/FP4/cordis-fp4projects.xlsx

wget https://cordis.europa.eu/data/FP4/cordis-fp4organizations.xlsx

wget https://cordis.europa.eu/data/FP5/cordis-fp5projects.xlsx

wget https://cordis.europa.eu/data/FP5/cordis-fp5organizations.xlsx

wget https://cordis.europa.eu/data/FP6/cordis-fp6projects.xlsx

wget https://cordis.europa.eu/data/FP6/cordis-fp6organizations.xlsx

wget https://cordis.europa.eu/data/cordis-fp7projects-xlsx.zip

wget https://cordis.europa.eu/data/cordis-h2020projects-xlsx.zip

wget https://cordis.europa.eu/data/cordis-HORIZONprojects-xlsx.zip


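#### programmes ####
The FP6 and FP7 programme references are provided as CSV files; the H2020 and Horizon Europe ones as zipped XLSX files.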

wget https://cordis.europa.eu/data/reference/cordisref-FP6programmes.csv

wget https://cordis.europa.eu/data/reference/cordisref-FP7programmes.csv

wget https://cordis.europa.eu/data/reference/cordisref-H2020programmes-xlsx.zip

wget https://cordis.europa.eu/data/reference/cordisref-HORIZONprogrammes-xlsx.zip


#### PI (ERC only) ####

N.B.:

FP7: the PI data is no longer provided as a stand-alone file there, so it has to be parsed from the project files in XML format.

wget https://cordis.europa.eu/data/cordis-fp7projects-xml.zip

wget https://cordis.europa.eu/data/cordis-h2020-erc-pi.xlsx

Horizon Europe (HE):

The ERC PI file is not yet provided.


#### get other reference data#### 

wget https://cordis.europa.eu/data/reference/cordisref-countries.csv


### STEP 2 : save into a database 

In [7]:
import os 
import inflection as infl  ## to deal with fields names which are camel case 
import pandas as pd 

import zipfile
import xmltodict

In [15]:
import warnings
warnings.simplefilter("ignore")

In [2]:
## Output database - SQLite, so that the pipeline is easy to replicate

# folder holding the database file (the trailing slash is relied upon below)
xPath_DB = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/'
xDB = f'{xPath_DB}grants_datasets.db'
# connection string: the 'sqlite:///' scheme followed by the database file path
xDBCon = f'sqlite:///{xDB}'


In [5]:
### Location of the downloaded datasets
xPath_downloaded_datasets = (
    '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/__downloaded'
)
# the EU project files are read from the '01_EU_FP/01_projects' sub-folder
xPath_dsets_EU = os.path.join(
    xPath_downloaded_datasets,
    '01_EU_FP/01_projects',
)

####  FP1 to FP6 are single excel files each  ### 

In [6]:
# FP1 to FP6: one projects workbook and one organizations workbook per programme.
# Order is all projects files (FP1..FP6) first, then all organizations files.
xFile_lst = [f'cordis-fp{fp_no}{kind}.xlsx'
             for kind in ('projects', 'organizations')
             for fp_no in range(1, 7)]

for xFile in xFile_lst:
    # SQLite table name: the file name with '-' turned into '_' and the extension dropped
    xtab_name = xFile.replace('-', '_').replace('.xlsx', '')

    xFile_name = os.path.join(xPath_dsets_EU, xFile)
    df = pd.read_excel(xFile_name)

    # column headers go from camelCase to under_score
    df.columns = list(map(infl.underscore, df.columns))

    # an existing table of the same name is replaced
    df.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
    print('saved:', xtab_name)

print('OK')

saved: cordis_fp1projects
saved: cordis_fp2projects
saved: cordis_fp3projects
saved: cordis_fp4projects
saved: cordis_fp5projects
saved: cordis_fp6projects
saved: cordis_fp1organizations
saved: cordis_fp2organizations
saved: cordis_fp3organizations
saved: cordis_fp4organizations
saved: cordis_fp5organizations
saved: cordis_fp6organizations
OK


#### FP7 #### 

In [16]:
xFile = 'cordis-fp7projects-xlsx.zip'
xfile_zip = os.path.join(xPath_dsets_EU, xFile)

# Table-name prefix taken from the archive file name:
# 'cordis-fp7projects-xlsx.zip' -> 'cordis_fp7projects_'
xtabname_prefix = xfile_zip.rsplit('/', 1)[-1].partition('.')[0]
xtabname_prefix = xtabname_prefix.replace('-', '_').replace('xlsx', '')

with zipfile.ZipFile(xfile_zip) as zf:
    for xfile_name in zf.namelist():
        # only the Excel members of the archive are loaded
        if not xfile_name.endswith('.xlsx'):
            continue

        # member path -> table suffix: '/' becomes '_', extension dropped
        xtab_name_suffix = xfile_name.replace('/', '_').replace('.xlsx', '')
        # snake_case the suffix, then collapse any '_xlsx_' segment
        # (presumably left by an 'xlsx/' folder inside the archive - verify)
        xtab_name = (xtabname_prefix + infl.underscore(xtab_name_suffix)).replace('_xlsx_', '_')

        with zf.open(xfile_name) as myZip:
            df = pd.read_excel(myZip, engine="openpyxl")

        # column headers go from camelCase to under_score
        df.columns = list(map(infl.underscore, df.columns))

        # an existing table of the same name is replaced
        df.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
        print('saved:', xtab_name, '---records:', len(df))

print('OK')



saved: cordis_fp7projects_organization ---records: 140008
saved: cordis_fp7projects_legal_basis ---records: 25785
saved: cordis_fp7projects_topics ---records: 26153
saved: cordis_fp7projects_euro_sci_voc ---records: 68017
saved: cordis_fp7projects_web_link ---records: 8160
saved: cordis_fp7projects_web_item ---records: 11763
saved: cordis_fp7projects_project ---records: 25785
OK


#### H2020  #### 

In [17]:

xFile = 'cordis-h2020projects-xlsx.zip'
xfile_zip = os.path.join(xPath_dsets_EU, xFile)

# Table-name prefix taken from the archive file name:
# 'cordis-h2020projects-xlsx.zip' -> 'cordis_h2020projects_'
xtabname_prefix = xfile_zip.rsplit('/', 1)[-1].partition('.')[0]
xtabname_prefix = xtabname_prefix.replace('-', '_').replace('xlsx', '')

with zipfile.ZipFile(xfile_zip) as zf:
    for xfile_name in zf.namelist():
        # only the Excel members of the archive are loaded
        if not xfile_name.endswith('.xlsx'):
            continue

        # member path -> table suffix: '/' becomes '_', extension dropped
        xtab_name_suffix = xfile_name.replace('/', '_').replace('.xlsx', '')
        # snake_case the suffix, then collapse any '_xlsx_' segment
        # (presumably left by an 'xlsx/' folder inside the archive - verify)
        xtab_name = (xtabname_prefix + infl.underscore(xtab_name_suffix)).replace('_xlsx_', '_')

        with zf.open(xfile_name) as myZip:
            df = pd.read_excel(myZip, engine="openpyxl")

        # column headers go from camelCase to under_score
        df.columns = list(map(infl.underscore, df.columns))

        # an existing table of the same name is replaced
        df.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
        print('saved:', xtab_name, '---records:', len(df))

print('OK')

saved: cordis_h2020projects_project ---records: 35385
saved: cordis_h2020projects_organization ---records: 177871
saved: cordis_h2020projects_legal_basis ---records: 65791
saved: cordis_h2020projects_topics ---records: 35385
saved: cordis_h2020projects_euro_sci_voc ---records: 120041
saved: cordis_h2020projects_web_link ---records: 178370
saved: cordis_h2020projects_web_item ---records: 9
OK


#### Horizon Europe  #### 

In [18]:
xFile = 'cordis-HORIZONprojects-xlsx.zip'
xfile_zip = os.path.join(xPath_dsets_EU, xFile)

# Table-name prefix taken from the archive file name:
# 'cordis-HORIZONprojects-xlsx.zip' -> 'cordis_HORIZONprojects_'
# (the prefix keeps its original letter case; only the member part is snake_cased)
xtabname_prefix = xfile_zip.rsplit('/', 1)[-1].partition('.')[0]
xtabname_prefix = xtabname_prefix.replace('-', '_').replace('xlsx', '')

with zipfile.ZipFile(xfile_zip) as zf:
    for xfile_name in zf.namelist():
        # only the Excel members of the archive are loaded
        if not xfile_name.endswith('.xlsx'):
            continue

        # member path -> table suffix: '/' becomes '_', extension dropped
        xtab_name_suffix = xfile_name.replace('/', '_').replace('.xlsx', '')
        # snake_case the suffix, then collapse any '_xlsx_' segment
        # (presumably left by an 'xlsx/' folder inside the archive - verify)
        xtab_name = (xtabname_prefix + infl.underscore(xtab_name_suffix)).replace('_xlsx_', '_')

        with zf.open(xfile_name) as myZip:
            df = pd.read_excel(myZip, engine="openpyxl")

        # column headers go from camelCase to under_score
        df.columns = list(map(infl.underscore, df.columns))

        # an existing table of the same name is replaced
        df.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
        print('saved:', xtab_name, '---records:', len(df))

print('OK')

saved: cordis_HORIZONprojects_project ---records: 8557
saved: cordis_HORIZONprojects_organization ---records: 53422
saved: cordis_HORIZONprojects_legal_basis ---records: 11671
saved: cordis_HORIZONprojects_topics ---records: 8557
saved: cordis_HORIZONprojects_euro_sci_voc ---records: 26779
saved: cordis_HORIZONprojects_web_link ---records: 1387
saved: cordis_HORIZONprojects_web_item ---records: 1
OK


## Principal Investigators - ERC only 

In [24]:
# Collect one record per person attached to an organisation of an FP7 project.
# Every '.xml' member of the archive is parsed as one project document.
xlst_res = []

# Columns of the resulting table, in output order. Passing them explicitly keeps
# the table schema stable even when no person record is found at all.
xcols_persons = ['proj_id', 'proj_rcn', 'proj_acronym',
                 'pers_type', 'pi_rcn', 'pi_title', 'pi_name_first', 'pi_name_last',
                 'org_id', 'org_rcn']

xFile = 'cordis-fp7projects-xml.zip'
xfile_zip = os.path.join(xPath_dsets_EU, xFile)


def _associations(xnode):
    """Return xnode['relations']['associations'], or {} when either level is missing or empty."""
    return (xnode.get('relations') or {}).get('associations') or {}


def _as_list(xnode):
    """Normalise a parsed XML value: nothing -> [], a single element (dict) -> [dict], a list -> itself."""
    if not xnode:
        return []
    return [xnode] if isinstance(xnode, dict) else xnode


with zipfile.ZipFile(xfile_zip, 'r') as zip_ref:
    for file_name in zip_ref.namelist():
        if not file_name.endswith('.xml'):
            continue

        with zip_ref.open(file_name) as file:
            xdict = xmltodict.parse(file.read()).get('project')

        # skip documents without a <project> root instead of failing on None
        if not xdict:
            continue

        # project data repeated on every person record
        xproj_id = xdict.get('id')
        xproj_rcn = xdict.get('rcn')
        xproj_acronym = xdict.get('acronym')

        # The same guarded lookup is used at project and organisation level, so a
        # missing or empty 'relations' / 'associations' simply yields no records.
        for xorg in _as_list(_associations(xdict).get('organization')):
            xorg_id = xorg.get('id')
            xorg_rcn = xorg.get('rcn')

            for xpers in _as_list(_associations(xorg).get('person')):
                xlst_res.append({
                    'proj_id': xproj_id,
                    'proj_rcn': xproj_rcn,
                    'proj_acronym': xproj_acronym,
                    'pers_type': xpers.get('@type'),  # the 'type' XML attribute of <person>
                    'pi_rcn': xpers.get('rcn'),
                    'pi_title': xpers.get('title'),
                    'pi_name_first': xpers.get('firstNames'),
                    'pi_name_last': xpers.get('lastName'),
                    'org_id': xorg_id,
                    'org_rcn': xorg_rcn,
                })


t1 = pd.DataFrame(xlst_res, columns=xcols_persons)
print(len(t1))

xtab_name = 'cordis_fp7_persons'
# an existing table of the same name is replaced
t1.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
print('saved:', xtab_name, '---records:', len(t1))


140362
saved: cordis_fp7_persons ---records: 140362


In [23]:
## H2020 - the ERC principal investigators come as a single stand-alone workbook

xtab_name = 'cordis_h2020_persons'
xFile = os.path.join(xPath_dsets_EU, 'cordis-h2020-erc-pi.xlsx')

df = pd.read_excel(xFile)
# column headers go from camelCase to under_score
df.columns = list(map(infl.underscore, df.columns))

# an existing table of the same name is replaced
df.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
print('saved:', xtab_name, '---records:', len(df))




saved: cordis_h2020_persons ---records: 8043


## Results - publications 

In [3]:
import os 
import pandas as pd 


# Root folder of the publications export
xPath = '/media/mike/DATA4T/_data_repository/__others/fp_data/__publications/__corda_20231003/'

# FP7: a single workbook
xl_fp7 = xPath + 'FP7PC_DM_PROJ_PUBLICATIONS.xlsx'

# H2020: several workbooks in one folder.
# Only '.xlsx' files are kept, because anything else in the folder would make
# pd.read_excel fail later on; '~$' names are Excel's temporary lock files.
# The listing is sorted because os.listdir returns entries in arbitrary order.
xPath_h2020 = xPath + 'cordis-h2020projectPublications-xlsx/xlsx/'
xl_h2020_lst = [xPath_h2020 + x
                for x in sorted(os.listdir(xPath_h2020))
                if x.endswith('.xlsx') and not x.startswith('~$')]

# Horizon Europe: a single workbook
xl_he = xPath + 'cordis-HORIZONprojectPublications-xlsx/xlsx/projectPublications.xlsx'



In [4]:
# Output SQLite database
# folder holding the database file (the trailing slash is relied upon below)
xPath_DB = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/'
xDB = f'{xPath_DB}grants_datasets.db'
# connection string: the 'sqlite:///' scheme followed by the database file path
xDBCon = f'sqlite:///{xDB}'

In [5]:
# FP7 publications: a single workbook stored as one table
dset_fp7 = pd.read_excel(xl_fp7)
# column headers are lower-cased before saving
dset_fp7.columns = list(map(str.lower, dset_fp7.columns))

# an existing table of the same name is replaced
dset_fp7.to_sql(name='cordis_publications_fp7', con=xDBCon, if_exists='replace', index=False)

print('saved FP7:', len(dset_fp7))
print('OK')


saved FP7: 305549
OK


In [7]:
# H2020 publications: one table per workbook in xl_h2020_lst
for xFile in xl_h2020_lst:
    # table name = fixed prefix + workbook file name without its extension
    xtab_name = 'cordis_publications_h2020_' + xFile.rsplit('/', 1)[-1].replace('.xlsx', '')

    # every column is stored as text
    # NOTE(review): astype(str) also turns missing values into the literal text 'nan' - confirm this is intended
    dset_h2020 = pd.read_excel(xFile).astype(str)
    # column headers are lower-cased before saving
    dset_h2020.columns = list(map(str.lower, dset_h2020.columns))

    # an existing table of the same name is replaced
    dset_h2020.to_sql(name=xtab_name, con=xDBCon, if_exists='replace', index=False)
    print('saved :', xtab_name, len(dset_h2020))

print('OK')
    

saved : cordis_publications_h2020_projectPublications_4 55001
saved : cordis_publications_h2020_projectPublications_5 55001
saved : cordis_publications_h2020_projectPublications_7 25704
saved : cordis_publications_h2020_projectPublications_6 55001
saved : cordis_publications_h2020_projectPublications_2 55001
saved : cordis_publications_h2020_projectPublications 55001
saved : cordis_publications_h2020_projectPublications_3 55001
OK
