In [136]:
import re

import numpy as np
import pandas as pd
from IPython.display import display

## **EU CORDIS DATA PREPROCESSING**
This notebook transforms the CSV inputs of a specified EU CORDIS folder into a preprocessed .pkl file. The final .pkl file contains only NL projects, i.e. projects for which at least one of the associated organizations has a Dutch vatNumber (a vatNumber starting with 'NL').


## **TABLE OF CONTENTS**:
- [HOW TO USE](#1)
- [SELECT DATA](#2)
- [MERGE DATA AND FILTER ON NL](#3)
- [PREPROCESSING](#4)
- [EXPORT TO PKL](#5)

## **HOW TO USE** <a class="anchor" id="1"></a>
1. Ensure the input files organization.csv and project.csv are downloaded by eu_dataloader.ipynb.
2. Select the folder name in the cell below.
3. Run the entire notebook to produce a .pkl file.

## **SELECT DATA** <a class="anchor" id="2"></a>

In [None]:
# Dataset to preprocess. It must already have been downloaded with the Cordis EU
# dataloader. Available options:
#   eu_cordis_2007_2013
#   eu_cordis_2014_2020
#   eu_cordis_2021_2027
EU_CORDIS_PROJECT_NAME = 'eu_cordis_2007_2013'

# Load the organization and project tables; both are read with ';' as field separator.
df_organization = pd.read_csv(
    f'../data/{EU_CORDIS_PROJECT_NAME}/files/csv/organization.csv',
    sep=';',
)
df_projects = pd.read_csv(
    f'../data/{EU_CORDIS_PROJECT_NAME}/files/csv/project.csv',
    sep=';',
)

In [138]:
# Preview the first two organization rows and print the table's (rows, columns) shape.
display(df_organization.head(2))
print(df_organization.shape)

Unnamed: 0,projectID,projectAcronym,organisationID,vatNumber,name,shortName,SME,activityType,street,postCode,...,contactForm,contentUpdateDate,rcn,order,role,ecContribution,netEcContribution,totalCost,endOfParticipation,active
0,217257,IMERA-PLUS,999559296.0,,EURAMET EV - EUROPEAN ASSOCIATION OF NATIONAL ...,EURAMET,,REC,Bundesallee 100,38116,...,https://ec.europa.eu/info/funding-tenders/oppo...,2023-03-10 20:03:22,2267129,1,coordinator,21000000.0,,,False,
1,217246,BONUS+,999806646.0,,BALTIC ORGANISATIONS NETWORK FOR FUNDING SCIEN...,BONUS EEIG,,OTH,PASILANRAITIO 9 B,240,...,https://ec.europa.eu/info/funding-tenders/oppo...,2022-09-03 22:47:18,2193825,1,coordinator,7266762.69,,,False,


(123419, 25)


In [139]:
# Preview the first two project rows and print the table's (rows, columns) shape.
display(df_projects.head(2))
print(df_projects.shape)

Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,ecSignatureDate,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi
0,217257,IMERA-PLUS,CLOSED,Implementing Metrology in the European Researc...,2007-06-01,2012-05-31,64865124,21000000,FP7-GA,ERANET,,FP7,,FP7-2007-ERANET-4.2.2.2,CSA-ERA-Plus,,The central nerve in the spine of our high-tec...,2023-03-10 20:03:22,86251,
1,217246,BONUS+,CLOSED,Multilateral call for research projects within...,2007-05-10,2012-05-09,2251221996,726676269,FP7-GA,ERANET,,FP7,,FP7-2007-ERANET-4.2.2.2,CSA-ERA-Plus,,BONUS EEIG - representing altogether 10 RTD or...,2022-09-03 22:47:18,86250,


(21814, 20)


## **MERGE DATA AND FILTER ON NL** <a class="anchor" id="3"></a>

In [140]:
# Keep only the projects in which at least one participating organization has a
# VAT number starting with 'NL'.

# Collapse the organization table to one row per projectID, holding the list of all
# vatNumber values recorded for that project (missing values stay in the list).
df_organization_grouped = (
    df_organization
    .groupby('projectID')['vatNumber']
    .apply(list)
    .reset_index()
)

# Left-join on the project id so projects without any organization row are kept for
# now; the join key 'projectID' duplicates 'id' and is dropped straight away.
df_merged = df_projects.merge(
    df_organization_grouped,
    how='left',
    left_on='id',
    right_on='projectID',
).drop(columns=['projectID'])

# Projects that found no match carry NaN after the join; give them an empty list.
df_merged['vatNumber'] = df_merged['vatNumber'].apply(
    lambda vat_numbers: vat_numbers if isinstance(vat_numbers, list) else []
)


def _has_dutch_vat(vat_numbers):
    """Return True when any string entry in `vat_numbers` starts with 'NL'."""
    for vat in vat_numbers:
        if isinstance(vat, str) and vat.startswith('NL'):
            return True
    return False


is_dutch_project = df_merged['vatNumber'].apply(_has_dutch_vat)

# NOTE(review): reset_index() without drop=True keeps the old row labels as an extra
# 'index' column in the result - confirm whether that column is wanted downstream.
df_projects = df_merged[is_dutch_project].reset_index()

display(df_projects.head(2))
print(df_projects.shape)

Unnamed: 0,index,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,...,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,vatNumber
0,16,601714,DIAGMAL,CLOSED,Translation of the direct-on-blood PCR-NALFIA ...,2013-09-01,2017-02-28,356574911,2652374,FP7-HEALTH,...,FP7,,FP7-HEALTH-2013-INNOVATION-2,CP-FP,,Accurate diagnosis of malaria is essential to ...,2019-08-02 11:04:44,108624,,"[NL002564476B01, FI23444528, nan, nan, NL00462..."
1,25,223226,PHARVAT,CLOSED,Platform for the Harmonization of Vaccine Adju...,2009-11-01,2011-10-31,33401816,300000,FP7-HEALTH,...,FP7,,FP7-HEALTH-2007-B,CSA-SA,,Adjuvants are critical to the quality and magn...,2017-05-29 17:42:01,92315,,"[NL803597691B01, DE815127939, nan]"


(4010, 22)


In [141]:
# Convert 'ecMaxContribution' to a numeric column.
# Values may use a decimal comma, so commas are replaced by periods before parsing.
# The column is cast to str first: if read_csv already inferred a numeric dtype
# (no decimal commas in the file) or this cell is re-run, the .str accessor would
# otherwise raise AttributeError. Missing values become the text 'nan', which
# pd.to_numeric parses back to NaN.
# errors='coerce' turns any value that still cannot be parsed into NaN.
df_projects['ecMaxContribution'] = pd.to_numeric(
    df_projects['ecMaxContribution'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
)

In [142]:
# Reduce startDate to year precision: take the first run of four digits and pin the
# date to 1 January of that year. Rows where no four-digit run is found stay missing.
df_projects['startDate'] = (
    df_projects['startDate'].str.extract(r'(\d{4})', expand=False) + '-01-01'
)

display(df_projects.head(2))
print(df_projects.shape)

Unnamed: 0,index,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,...,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,vatNumber
0,16,601714,DIAGMAL,CLOSED,Translation of the direct-on-blood PCR-NALFIA ...,2013-01-01,2017-02-28,356574911,2652374.0,FP7-HEALTH,...,FP7,,FP7-HEALTH-2013-INNOVATION-2,CP-FP,,Accurate diagnosis of malaria is essential to ...,2019-08-02 11:04:44,108624,,"[NL002564476B01, FI23444528, nan, nan, NL00462..."
1,25,223226,PHARVAT,CLOSED,Platform for the Harmonization of Vaccine Adju...,2009-01-01,2011-10-31,33401816,300000.0,FP7-HEALTH,...,FP7,,FP7-HEALTH-2007-B,CSA-SA,,Adjuvants are critical to the quality and magn...,2017-05-29 17:42:01,92315,,"[NL803597691B01, DE815127939, nan]"


(4010, 22)


## **PREPROCESSING** <a class="anchor" id="4"></a>

In [143]:
# Build the 'text' column as "<title> <objective>".
# Missing parts are replaced by an empty string before concatenating: with a plain
# `+`, a NaN title or objective makes the whole result NaN, so the row would lose
# its remaining text and end up as the literal string 'nan' once cast to str.
# astype(str) guarantees every element is a str for the text preprocessing below.
df_projects['text'] = (
    df_projects['title'].fillna('').astype(str)
    + ' '
    + df_projects['objective'].fillna('').astype(str)
)

# Function for preprocessing textual data (for BERT)
def preprocess_text(text):
    """Normalise a string to lowercase a-z words separated by single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The lowercased text with every character other than a-z and whitespace
        removed, and all whitespace runs collapsed to one space. Characters are
        deleted, not replaced by a space, so 'direct-on-blood' becomes
        'directonblood'. Returns '' when no letters remain.
    """
    # Lowercase first so the a-z character classes below also cover upper-case input.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # Collect the remaining words; re-joining them collapses whitespace runs and
    # trims leading/trailing whitespace.
    words = re.findall(r'\b[a-z]+\b', letters_only)
    return ' '.join(words)

# Clean the 'text' column element by element with preprocess_text.
df_projects['text'] = df_projects['text'].map(preprocess_text)

In [144]:
# Preview the first two rows, now including the 'text' column, and print the shape.
display(df_projects.head(2))
print(df_projects.shape)

Unnamed: 0,index,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,...,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi,vatNumber,text
0,16,601714,DIAGMAL,CLOSED,Translation of the direct-on-blood PCR-NALFIA ...,2013-01-01,2017-02-28,356574911,2652374.0,FP7-HEALTH,...,,FP7-HEALTH-2013-INNOVATION-2,CP-FP,,Accurate diagnosis of malaria is essential to ...,2019-08-02 11:04:44,108624,,"[NL002564476B01, FI23444528, nan, nan, NL00462...",translation of the directonblood pcrnalfia sys...
1,25,223226,PHARVAT,CLOSED,Platform for the Harmonization of Vaccine Adju...,2009-01-01,2011-10-31,33401816,300000.0,FP7-HEALTH,...,,FP7-HEALTH-2007-B,CSA-SA,,Adjuvants are critical to the quality and magn...,2017-05-29 17:42:01,92315,,"[NL803597691B01, DE815127939, nan]",platform for the harmonization of vaccine adju...


(4010, 23)


## **EXPORT TO PKL** <a class="anchor" id="5"></a>

In [145]:
# Write the preprocessed projects to a pickle file named after the selected dataset.
output_path = f"../data/{EU_CORDIS_PROJECT_NAME}_preprocessed.pkl"
df_projects.to_pickle(output_path)