In [None]:
# Import packages
from google.colab import drive
import pandas as pd
import csv
import json


In [None]:
# Mount Google Drive at /content/drive; the data files read below live under /content/drive/MyDrive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Convert decimal-comma text columns to numeric columns, in place
def type_to_float(df, float_variables):
    """Convert the listed columns of ``df`` to a numeric dtype, in place.

    Text values are expected to use a comma as the decimal separator
    (e.g. ``'1433840,5'``); the comma is swapped for a dot before parsing.
    Columns that are already numeric are left untouched, so the function can
    be called again on the same frame (for example when a notebook cell is
    re-run) without raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    float_variables : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    ValueError
        If a value cannot be parsed as a number (raised by ``pd.to_numeric``).
    """
    for var in float_variables:
        column = df[var]
        # The .str accessor only works on text data; a column that pandas
        # already parsed as numeric (or that was converted by an earlier call)
        # needs no work and would otherwise raise AttributeError.
        if pd.api.types.is_numeric_dtype(column):
            continue
        # Literal (non-regex) replacement of the decimal comma, then parse
        df[var] = pd.to_numeric(column.str.replace(',', '.', regex=False))

# Columns passed to type_to_float for every projects table (FP7, Horizon 2020, Horizon Europe)
float_variables = ['ecmaxcontribution', 'totalcost']

# Fill missing values with 0 in the selected columns, in place
def nas_to_zero(df,replace_variables):
    """Replace missing values with 0 in the ``replace_variables`` columns of ``df``.

    The frame is modified in place and nothing is returned.
    """
    selected = df[replace_variables]
    df[replace_variables] = selected.fillna(0)

## FP7

In [None]:
# Load the FP7 publications export (semicolon-separated CSV stored on Drive)
FP7_publications = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/FP7_publications.csv', sep=';')

In [None]:
# Load the FP7 projects export (semicolon-separated CSV stored on Drive)
FP7_projects = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/FP7_projects.csv', sep=';')

In [None]:
# Lowercase every column label, then give the project-level columns the names
# used in the merged dataset (so they do not clash with publication columns)
fp7_project_renames = {'id':'project_id', 'title':'title_projects', 'contentupdatedate':'contentupdatedate_projects'}
FP7_projects = FP7_projects.rename(columns=str.lower).rename(columns=fp7_project_renames)
# Convert the columns listed in float_variables to numeric
type_to_float(FP7_projects, float_variables)

In [None]:
# Lowercase every column label, then rename the publication-level columns to
# the names used in the merged dataset
fp7_publication_renames = {'title':'title_publications', 'author':'authors'}
FP7_publications = FP7_publications.rename(columns=str.lower).rename(columns=fp7_publication_renames)

In [None]:
# Attach each publication to its project. The outer join keeps rows without a
# match on either side; indicator=True adds a _merge column recording the source.
FP7_merged = FP7_projects.merge(FP7_publications, on="project_id", how='outer', indicator=True)

In [None]:
# Drop the columns that are not present in the datasets for the other calls
del_columns = ['qa_processed_doi', 'publication_type', 'pages', 'publisher','grantdoi', 'volume', 'record_id', 'repository_url','rcn']
FP7_merged = FP7_merged.drop(columns=del_columns)

# Add the columns the other datasets carry:
# - funding_body labels the call these rows come from
# - publication_date / projectacronym hold a placeholder because their values
#   are unknown for FP7 but the columns are relevant in the other datasets
FP7_merged = FP7_merged.assign(
    funding_body='FP7',
    publication_date='unknown',
    projectacronym='unknown',
)

In [None]:
# Sanity check: preview the first rows, then list each column's dtype and non-null count
display(FP7_merged.head())
FP7_merged.info()

Unnamed: 0,project_id,acronym,status,title_projects,startdate,enddate,totalcost,ecmaxcontribution,legalbasis,topics,...,objective,contentupdatedate_projects,title_publications,authors,doi,journal_title,_merge,funding_body,publication_date,projectacronym
0,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Membrane-Assisted Growth of DNA Origami Nanost...,"Samet Kocabey , Susanne Kempter , Jonathan Lis...",10.1021/acsnano.5b00161,ACS Nano,both,FP7,unknown,unknown
1,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Alignment and Graphene-Assisted Decoration of ...,"Kevin Martens , Timon Funck , Susanne Kempter ...",10.1002/smll.201503382,Small,both,FP7,unknown,unknown
2,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,DNA-Assembled Nanoparticle Rings Exhibit Elect...,"Eva-Maria Roller , Larousse Khosravi Khorashad...",10.1021/nl5046473,Nano Letters,both,FP7,unknown,unknown
3,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Cellular Uptake of Tile-Assembled DNA Nanotubes,"Samet Kocabey 1, Hanna Meinl 2, Iain S. MacPhe...",10.3390/nano5010047,Nanomaterials,both,FP7,unknown,unknown
4,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Molecular force spectroscopy with a DNA origam...,"Philipp C. Nickels , Bettina Wünsch , Phil Hol...",10.1126/science.aah5974,Science,both,FP7,unknown,unknown


<class 'pandas.core.frame.DataFrame'>
Int64Index: 315780 entries, 0 to 315779
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   project_id                  315780 non-null  int64   
 1   acronym                     315713 non-null  object  
 2   status                      315702 non-null  object  
 3   title_projects              315713 non-null  object  
 4   startdate                   315591 non-null  object  
 5   enddate                     315591 non-null  object  
 6   totalcost                   315713 non-null  float64 
 7   ecmaxcontribution           315713 non-null  float64 
 8   legalbasis                  315713 non-null  object  
 9   topics                      315709 non-null  object  
 10  ecsignaturedate             0 non-null       float64 
 11  frameworkprogramme          315713 non-null  object  
 12  mastercall                  0 non-null       float64 
 13 

## Horizon 2020

In [None]:
# Load the Horizon 2020 publications export (semicolon-separated CSV stored on Drive)
H2020_publications = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/Horizon2020_publications.csv', sep=';')

In [None]:
# Load the Horizon 2020 projects export (semicolon-separated CSV stored on Drive)
H2020_projects = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/Horizon2020_projects.csv', sep=';')

In [None]:
# Normalise the Horizon 2020 projects table: lowercase labels, the shared
# project-level column names, an object-typed project_id and no 'rcn' column
h2020_project_renames = {'id':'project_id', 'title':'title_projects', 'contentupdatedate':'contentupdatedate_projects'}
H2020_projects = (
    H2020_projects
    .rename(columns=str.lower)
    .rename(columns=h2020_project_renames)
    .astype({'project_id': 'object'})
    .drop(columns='rcn')
)

# Convert the columns listed in float_variables to numeric
type_to_float(H2020_projects, float_variables)

In [None]:
# Normalise the Horizon 2020 publications table: lowercase labels, the shared
# publication-level column names, no 'contentupdatedate'/'rcn' columns and an
# object-typed project_id (same dtype as the projects table's merge key)
h2020_publication_renames = {'projectid':'project_id', 'title':'title_publications','id':'project_id_full', 'journaltitle':'journal_title', 'publishedyear':'publication_date'}
H2020_publications = (
    H2020_publications
    .rename(columns=str.lower)
    .rename(columns=h2020_publication_renames)
    .drop(columns=['contentupdatedate', 'rcn'])
    .astype({'project_id': 'object'})
)

In [None]:
# Columns deemed unnecessary for the combined dataset
colums_del_2020 = ['isbn', 'issn', 'journalnumber', 'project_id_full', 'publishedpages', 'ispublishedas', 'collection', 'grantdoi']

# Join publications to projects (outer join, with a _merge indicator column),
# drop the unneeded columns and label the rows with their framework programme
H2020_merged = (
    H2020_projects
    .merge(H2020_publications, on="project_id", how='outer', indicator=True)
    .drop(columns=colums_del_2020)
    .assign(funding_body='H2020')
)

In [None]:
# Sanity check: preview the first rows, then list each column's dtype and non-null count
display(H2020_merged.head())
H2020_merged.info()

Unnamed: 0,project_id,acronym,status,title_projects,startdate,enddate,totalcost,ecmaxcontribution,legalbasis,topics,...,objective,contentupdatedate_projects,title_publications,authors,journal_title,publication_date,doi,projectacronym,_merge,funding_body
0,810630,VALORTECH,SIGNED,ERA Chair for Food (By-) Products Valorisation...,2018-07-01,2023-12-31,2498625.0,2498625.0,H2020-EU.4.c.,WIDESPREAD-03-2017,...,"Advanced food processing technologies, minimum...",2023-05-13 18:49:37,Comparison of chemical composition of Hypericu...,"Rusalepp, Linda; Raal, Ain; Püssa, Tõnu; Mäeor...",Biochemical Systematics and Ecology,2017.0,10.1016/j.bse.2017.06.004,VALORTECH,both,H2020
1,810630,VALORTECH,SIGNED,ERA Chair for Food (By-) Products Valorisation...,2018-07-01,2023-12-31,2498625.0,2498625.0,H2020-EU.4.c.,WIDESPREAD-03-2017,...,"Advanced food processing technologies, minimum...",2023-05-13 18:49:37,Changes in rheological properties of Edam-type...,"Henno, Merike; Jõudu, Ivi; Kaart, Tanel; Veski...",Agricultural and Food Science,2017.0,10.23986/afsci.63132,VALORTECH,both,H2020
2,810630,VALORTECH,SIGNED,ERA Chair for Food (By-) Products Valorisation...,2018-07-01,2023-12-31,2498625.0,2498625.0,H2020-EU.4.c.,WIDESPREAD-03-2017,...,"Advanced food processing technologies, minimum...",2023-05-13 18:49:37,Fractionation of sea buckthorn pomace and seed...,"V. Kitrytė, D. Povilaitis, V. Kraujalienė, V. ...",LWT – Food Science and Technology,2017.0,10.1016/j.lwt.2017.02.041,VALORTECH,both,H2020
3,810630,VALORTECH,SIGNED,ERA Chair for Food (By-) Products Valorisation...,2018-07-01,2023-12-31,2498625.0,2498625.0,H2020-EU.4.c.,WIDESPREAD-03-2017,...,"Advanced food processing technologies, minimum...",2023-05-13 18:49:37,Effect of post-harvest flame-defoliation on st...,"Reelika RÄTSEP, Ulvi MOOR, Ele VOOL, Kadri KARP",Zemdirbyste-Agriculture,2015.0,10.13080/z-a.2015.102.051,VALORTECH,both,H2020
4,810630,VALORTECH,SIGNED,ERA Chair for Food (By-) Products Valorisation...,2018-07-01,2023-12-31,2498625.0,2498625.0,H2020-EU.4.c.,WIDESPREAD-03-2017,...,"Advanced food processing technologies, minimum...",2023-05-13 18:49:37,Phytochemical characterization and antimicrobi...,"Radenkovs, Vitalijs; Püssa, Tõnu; Juhnevica-Ra...",Food Bioscience,2018.0,10.1016/j.fbio.2018.05.010,VALORTECH,both,H2020


<class 'pandas.core.frame.DataFrame'>
Int64Index: 381255 entries, 0 to 381254
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   project_id                  381255 non-null  object  
 1   acronym                     381255 non-null  object  
 2   status                      381255 non-null  object  
 3   title_projects              381255 non-null  object  
 4   startdate                   381244 non-null  object  
 5   enddate                     381244 non-null  object  
 6   totalcost                   381255 non-null  float64 
 7   ecmaxcontribution           381255 non-null  float64 
 8   legalbasis                  377220 non-null  object  
 9   topics                      381255 non-null  object  
 10  ecsignaturedate             381255 non-null  object  
 11  frameworkprogramme          381255 non-null  object  
 12  mastercall                  381255 non-null  object  
 13 

## Horizon Europe

In [None]:
# Load the Horizon Europe projects export (semicolon-separated CSV stored on Drive)
HE_projects = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/HorizonE_projects.csv', sep=';')

In [None]:
# Normalise the Horizon Europe projects table the same way as the earlier
# ones: lowercase labels, the shared project-level column names, no 'rcn' column
he_project_renames = {'id':'project_id', 'title':'title_projects', 'contentupdatedate':'contentupdatedate_projects'}
HE_projects = (
    HE_projects
    .rename(columns=str.lower)
    .rename(columns=he_project_renames)
    .drop(columns='rcn')
)

# Convert the columns listed in float_variables to numeric
type_to_float(HE_projects, float_variables)

In [None]:
# Load the Horizon Europe publications export (semicolon-separated CSV stored on Drive)
HE_publications = pd.read_csv('/content/drive/MyDrive/data/RawCORDISdata/HorizonE_publications.csv', sep=';')

In [None]:
# Lowercase the column labels so the renames below do not depend on the export's casing
HE_publications.columns = HE_publications.columns.str.lower()

# Rename the columns to match the previous dataframes
HE_publications = HE_publications.rename(columns={'projectid':'project_id', 'title':'title_publications','id':'project_id_full','journaltitle':'journal_title', 'publishedyear':'publication_date'})

# Drop the columns that are not kept in the combined dataset
del HE_publications['contentupdatedate']
del HE_publications['rcn']
# Cast the merge key itself (not the column labels) to object, mirroring the
# Horizon 2020 publications cell; casting `.columns` leaves project_id untouched.
HE_publications['project_id'] = HE_publications['project_id'].astype('object')

In [None]:
# Columns deemed unnecessary for the combined dataset
colums_del = ['isbn', 'issn', 'journalnumber', 'project_id_full', 'publishedpages', 'ispublishedas', 'collection', 'grantdoi']

# Join publications to projects (outer join, with a _merge indicator column),
# label the rows with their framework programme and drop the unneeded columns
HE_merged = (
    HE_projects
    .merge(HE_publications, on="project_id", how='outer', indicator=True)
    .assign(funding_body='HE')
    .drop(columns=colums_del)
)

In [None]:
# Optional sanity check (not needed by the pipeline): list the column names
# that appear in only one of the H2020 and FP7 merged tables
h2020_columns = set(H2020_merged.columns)
fp7_columns = set(FP7_merged.columns)

# Symmetric difference = labels present in exactly one of the two tables
# (the order of the resulting list is arbitrary, as with any set)
different_columns = list(h2020_columns ^ fp7_columns)

print("Different column names:")
print(different_columns)

Different column names:
[]


In [None]:
# Stack the three framework tables (FP7, H2020, Horizon Europe) row-wise;
# ignore_index=True gives the combined frame a fresh 0..n-1 index
cordis_projects_publications = pd.concat([FP7_merged, H2020_merged, HE_merged], ignore_index=True)

In [61]:
# Print each table's shape to verify the column counts agree, then the row
# total the concatenated frame is expected to have
merged_frames = (FP7_merged, H2020_merged, HE_merged)
for frame in merged_frames:
    print(frame.shape)

expected_rows = sum(len(frame) for frame in merged_frames)
print('The full merged df should be', expected_rows,' entries long')

(315780, 26)
(381255, 26)
(9796, 26)
The full merged df should be 706831  entries long


In [63]:
# Preview the combined table and print its (rows, columns) shape for comparison
# with the expected total printed above
display(cordis_projects_publications.head())
print(cordis_projects_publications.shape)

Unnamed: 0,project_id,acronym,status,title_projects,startdate,enddate,totalcost,ecmaxcontribution,legalbasis,topics,...,objective,contentupdatedate_projects,title_publications,authors,doi,journal_title,_merge,funding_body,publication_date,projectacronym
0,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Membrane-Assisted Growth of DNA Origami Nanost...,"Samet Kocabey , Susanne Kempter , Jonathan Lis...",10.1021/acsnano.5b00161,ACS Nano,both,FP7,unknown,unknown
1,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Alignment and Graphene-Assisted Decoration of ...,"Kevin Martens , Timon Funck , Susanne Kempter ...",10.1002/smll.201503382,Small,both,FP7,unknown,unknown
2,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,DNA-Assembled Nanoparticle Rings Exhibit Elect...,"Eva-Maria Roller , Larousse Khosravi Khorashad...",10.1021/nl5046473,Nano Letters,both,FP7,unknown,unknown
3,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Cellular Uptake of Tile-Assembled DNA Nanotubes,"Samet Kocabey 1, Hanna Meinl 2, Iain S. MacPhe...",10.3390/nano5010047,Nanomaterials,both,FP7,unknown,unknown
4,336440,ORCA,CLO,Optical Responses Controlled by DNA Assembly,2013-12-01,2018-11-30,1433840.0,1433840.0,FP7-IDEAS-ERC,ERC-SG-PE5,...,Artificially constructed materials can be desi...,2016-12-19 10:56:33,Molecular force spectroscopy with a DNA origam...,"Philipp C. Nickels , Bettina Wünsch , Phil Hol...",10.1126/science.aah5974,Science,both,FP7,unknown,unknown


(706831, 26)


In [None]:
# Save the combined table to Drive as a semicolon-separated CSV
# (to_csv defaults apply, so the row index is written as the first column)
cordis_projects_publications.to_csv('/content/drive/MyDrive/data/cordis_projects_publications.csv', sep=';')

# COUNTS DATA

In [None]:
# Path of the counts data JSON file on Drive (read by load_json below)
counts_data = "/content/drive/MyDrive/data/results_for_analysis.json"

In [None]:
# Read and parse a JSON file
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly: without an encoding argument open() uses the
    # platform's locale encoding, which can garble non-ASCII text.
    with open(file_path, encoding="utf-8") as json_file:
        return json.load(json_file)

# Parse the counts JSON file and build a DataFrame from the parsed content
# (despite its name, counts_data_csv is an in-memory DataFrame, not a CSV)
loadedfile = load_json(counts_data)
counts_data_csv = pd.DataFrame(loadedfile)

In [None]:
# Show the counts table (the whole frame is passed to display, not just .head())
display(counts_data_csv)

Unnamed: 0,pmcid,agg_sentence_index,agg_n_fem,agg_n_male,agg_perc_fem,agg_perc_male,agg_sample,clean_n_fem,clean_n_male,clean_perc_fem,...,article_categories,article_title,authors,copyright_info,funding,publisher_id,doi,journal_title,keywords,publication_date
0,PMC9683380,"[3, 5, 6, 7, 8]","[null, null, ""\""352\"""", null, null]","[null, null, ""\""328\"""", null, null]","[null, null, null, null, null]","[null, null, null, null, null]","[""\""##3\"""", ""\""##3\"""", ""\""70\"""", ""\""##6\"""", ""\...",[352],[328],[],...,"[""Endocrinology""]",Differential diagnostic value of plain CT scan...,"[""Zhijiang Han"", ""Mengwei Wu"", ""Peiying Wei"", ...","Copyright © 2022 Han, Wu, Wei, Zhu, Zhang, Din...",[],,10.3389/fendo.2022.1007870,Frontiers in Endocrinology,"[""adrenal gland neoplasms"", ""adrenal adenoma"",...",2022-11-09 00:00:00
1,PMC5137654,"[1, 3, 4]","[""\""30\"""", null, null]","[""\""30\"""", null, null]","[null, null, null]","[null, null, null]","[""\""60\"""", ""\""16\"""", ""\""46\""""]",[30],[30],[],...,"[""Spine""]",Posterior hemivertebra resection and monosegme...,"[""X Zhu"", ""X Wei"", ""J Chen"", ""C Li"", ""M Li"", ""...",Copyright © 2013 Royal College of Surgeons,[],650173,10.1308/003588414X13824511650173,Annals of The Royal College of Surgeons of Eng...,"[""Hemivertebra resection"", ""Monosegmental fusi...",
2,PMC7906844,[1],[null],[null],[null],[null],"[""\""thirty\""""]",[],[],[],...,"[""Research Article""]",Can silver diamine fluoride or silver nanopart...,"[""Jaqueline Costa Favaro"", ""Yana Cosendey Tole...",Copyright © 2021. The Korean Academy of Conser...,"[{""institution"": ""Coordena\u00e7\u00e3o de Ape...",2021460211,10.5395/rde.2021.46.e7,Restorative Dentistry & Endodontics,"[""Cariostatic agents"", ""Dental caries"", ""Nanop...",
3,PMC3387267,[12],[null],[null],[null],[null],"[""\""12\""""]",[],[],[],...,"[""Research Article"", ""Biology"", ""Medicine""]",NOTCH1 Signaling Promotes Human T-Cell Acute L...,"[""Wenxue Ma"", ""Alejandro Gutierrez"", ""Daniel J...",Ma et al.,[],PONE-D-12-13887,10.1371/journal.pone.0039725,PLoS ONE,[],
4,PMC4221596,[28],[null],[null],[null],[null],"[""\""90\""""]",[],[],[],...,"[""Original Article""]",A Cancer Cell-Activatable Aptamer-Reporter Sys...,"[""Zihua Zeng"", ""Ching-Hsuan Tung"", ""Youli Zu""]",Copyright © 2014 American Society of Gene & Ce...,[],,10.1038/mtna.2014.36,Molecular Therapy. Nucleic Acids,"[""aptamer-reporter"", ""cell-activatable"", ""circ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166006,PMC5432274,"[0, 2, 3, 4]","[null, null, ""\""99\"""", null]","[null, null, ""\""181\"""", null]","[null, null, null, null]","[null, null, null, null]","[""\""280\"""", ""\""280\"""", null, ""\""48\""""]",[99],[181],[],...,"[""Research Paper""]",Expression and functional regulation of stemne...,"[""Zhuan Lv"", ""Jane J. Yu"", ""Wei-Jie Zhang"", ""L...",Copyright: © 2017 Lv et al.,[],15624,10.18632/oncotarget.15624,Oncotarget,"[""Lgr5"", ""esophageal squamous cell carcinoma"",...",2017-04-18 00:00:00
166007,PMC6317672,"[0, 1]","[null, null]","[null, null]","[null, null]","[null, null]","[""\""20\"""", ""\""six\""""]",[],[],[],...,"[""Articles""]",Role of the long non-coding RNA-Annexin A2 pse...,"[""Yeletai Nuerzhati"", ""Rui Dong"", ""Zai Song"", ...",Copyright: © Nuerzhati et al.,[],ijmm-43-02-0739,10.3892/ijmm.2018.4023,International Journal of Molecular Medicine,"[""Annexin A2 pseudogene 3"", ""Annexin A2"", ""bil...",
166008,PMC3155515,[28],[null],[null],[null],[null],"[""\""76\""""]",[],[],[],...,"[""Research Article"", ""Biology""]",Transcriptome Sequencing of the Blind Subterra...,"[""Assaf Malik"", ""Abraham Korol"", ""Sariel H\u00...",Malik et al.,[],PONE-D-11-07297,10.1371/journal.pone.0021227,PLoS ONE,[],
166009,PMC61449,"[13, 31]","[null, null]","[null, null]","[null, null]","[null, null]","[""\""five\"""", ""\""five\""""]",[],[],[],...,"[""Research Article""]",Absence of rapid selection for acyclovir or pe...,"[""Robert T Sarisky"", ""H Ron Bartus"", ""Shelley ...",Copyright © 2001 Sarisky et al; licensee BioMe...,[],1471-2334-1-24,10.1186/1471-2334-1-24,BMC Infectious Diseases,[],


In [None]:
# Write a CSV copy of the counts table to the working directory, then show its
# column names (the merge further down uses the in-memory DataFrame, not this file)
counts_data_csv.to_csv('counts_data.csv')
counts_data_csv.columns

Index(['pmcid', 'agg_sentence_index', 'agg_n_fem', 'agg_n_male',
       'agg_perc_fem', 'agg_perc_male', 'agg_sample', 'clean_n_fem',
       'clean_n_male', 'clean_perc_fem', 'clean_perc_male', 'clean_sample',
       'max_n_fem', 'max_n_male', 'max_perc_fem', 'max_perc_male',
       'max_sample', 'article_categories', 'article_title', 'authors',
       'copyright_info', 'funding', 'publisher_id', 'doi', 'journal_title',
       'keywords', 'publication_date'],
      dtype='object')

In [64]:
# Collect the DOIs found in the counts data; they are used to filter the
# combined CORDIS table down to rows with a matching DOI
list_doi = counts_data_csv['doi'].tolist()

In [None]:
# Keep only the CORDIS rows whose DOI also appears in the counts data
doi_matches = cordis_projects_publications['doi'].isin(list_doi)
cordis_subset = cordis_projects_publications[doi_matches]

# Tell the project-level acronym apart from the publication-level one
cordis_subset = cordis_subset.rename(columns={'acronym':'acronym_projects', 'projectacronym':'acronym_publications'})

# Suffix that marks a column as coming from CORDIS
suffix_cordis = '_cordis'

# Append the suffix to every column except 'funding_body' and the merge key 'doi',
# using a single rename instead of one rename per column
unsuffixed = ('funding_body', 'doi')
cordis_subset = cordis_subset.rename(
    columns={
        column: f"{column}{suffix_cordis}"
        for column in cordis_subset.columns
        if column not in unsuffixed
    }
)

# Display the subset DataFrame
display(cordis_subset.head())

       project_id_cordis acronym_projects_cordis status_cordis  \
41                281666     RC3H1/2-SPECIFICITY           CLO   
537               322605             META-GROWTH           CLO   
574               322605             META-GROWTH           CLO   
578               322605             META-GROWTH           CLO   
1007              261349                  UNITAS           CLO   
...                  ...                     ...           ...   
695853         101016834               HosmartAI        SIGNED   
696278            826404                   CUREX        CLOSED   
699495         101046846                MAPWORMS        SIGNED   
701225         101046041                 CoVICIS        SIGNED   
702787         101039672                   TEMPO        SIGNED   

                                    title_projects_cordis startdate_cordis  \
41      Specificity of Rc3h1/2 proteins in post-transc...       2011-11-01   
537     Metabolic regulation of growth and body com

In [65]:
# List each column's dtype and non-null count to check the renamed subset
cordis_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3436 entries, 41 to 702787
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   project_id_cordis                  3436 non-null   object  
 1   acronym_projects_cordis            3435 non-null   object  
 2   status_cordis                      3435 non-null   object  
 3   title_projects_cordis              3435 non-null   object  
 4   startdate_cordis                   3435 non-null   object  
 5   enddate_cordis                     3435 non-null   object  
 6   totalcost_cordis                   3435 non-null   float64 
 7   ecmaxcontribution_cordis           3435 non-null   float64 
 8   legalbasis_cordis                  3395 non-null   object  
 9   topics_cordis                      3435 non-null   object  
 10  ecsignaturedate_cordis             1731 non-null   object  
 11  frameworkprogramme_cordis          3435 

In [None]:
# Join the counts data with the CORDIS subset on DOI; the inner join keeps
# only DOIs present in both tables
merged_data = pd.merge(counts_data_csv, cordis_subset, on="doi", how='inner')

# Columns whose missing values are replaced with 0
replace_variables = ['totalcost_cordis', 'ecmaxcontribution_cordis', 'clean_n_male', 'clean_n_fem', 'clean_perc_male', 'clean_perc_fem', 'max_n_male', 'max_n_fem', 'max_perc_male', 'max_perc_fem' ]

# Replace the missing values in those columns with zeros (modifies merged_data in place)
nas_to_zero(merged_data, replace_variables)

In [67]:
# Preview the merged dataframe, then list each column's dtype and non-null count.
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
display(merged_data.head())
merged_data.info()

Unnamed: 0,pmcid,agg_sentence_index,agg_n_fem,agg_n_male,agg_perc_fem,agg_perc_male,agg_sample,clean_n_fem,clean_n_male,clean_perc_fem,...,nature_cordis,objective_cordis,contentupdatedate_projects_cordis,title_publications_cordis,authors_cordis,journal_title_cordis,_merge_cordis,funding_body,publication_date_cordis,acronym_publications_cordis
0,PMC4242463,"[1, 4, 6, 10, 13, 24]","[null, null, null, ""\""##7\"""", null, null]","[null, null, null, null, null, null]","[null, null, null, null, null, null]","[null, null, null, null, null, null]","[""\""36\"""", ""\""##7\"""", ""\""##4\"""", ""\""56\"""", ""\""...",[7],[],[],...,,"""Genome-wide association studies of complex tr...",2015-03-10 19:10:33,Genetic characterization of Greek population i...,"Panoutsopoulou, K., Hatzikotoulas, K., Xifara,...",Nature Communications,both,FP7,unknown,unknown
1,PMC5512773,"[1, 3]","[null, null]","[null, null]","[null, null]","[null, null]","[""\""seven\"""", ""\""eleven\""""]",[],[],[],...,,Next generation sequencing (NGS) is revolution...,2022-02-09 10:28:40,Single-molecule quantification of 5-hydroxymet...,"Noa Gilat , Tzlil Tabachnik , Amit Shwartz , T...",Clinical epigenetics,both,FP7,unknown,unknown
2,PMC5512773,"[1, 3]","[null, null]","[null, null]","[null, null]","[null, null]","[""\""seven\"""", ""\""eleven\""""]",[],[],[],...,,Cytogenetic diagnostic approaches provide info...,2022-08-15 14:25:56,Single-molecule quantification of 5-hydroxymet...,"Noa Gilat, Tzlil Tabachnik, Amit Shwartz, Tama...",Clinical Epigenetics,both,H2020,2017.0,BeyondSeq
3,PMC4492817,[32],[null],[null],[null],[null],"[""\""14\""""]",[],[],[],...,,Tetralogy of Fallot (TOF) is the most common c...,2017-05-26 01:37:18,Antisense-mediated exon skipping: a therapeuti...,"M. Gramlich , L. S. Pane , Q. Zhou , Z. Chen ,...",EMBO Molecular Medicine,both,FP7,unknown,unknown
4,PMC4492817,[32],[null],[null],[null],[null],"[""\""14\""""]",[],[],[],...,,Tetralogy of Fallot (TOF) is the most common c...,2017-05-26 01:37:18,Antisense-mediated exon skipping: a therapeuti...,"M. Gramlich , L. S. Pane , Q. Zhou , Z. Chen ,...",EMBO Molecular Medicine,both,FP7,unknown,unknown


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3436 entries, 0 to 3435
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   pmcid                              3436 non-null   object  
 1   agg_sentence_index                 3436 non-null   object  
 2   agg_n_fem                          3436 non-null   object  
 3   agg_n_male                         3436 non-null   object  
 4   agg_perc_fem                       3436 non-null   object  
 5   agg_perc_male                      3436 non-null   object  
 6   agg_sample                         3436 non-null   object  
 7   clean_n_fem                        3436 non-null   object  
 8   clean_n_male                       3436 non-null   object  
 9   clean_perc_fem                     3436 non-null   object  
 10  clean_perc_male                    3436 non-null   object  
 11  clean_sample                       3436 non

In [None]:
# Save the final merged table to Drive as a semicolon-separated CSV
merged_data.to_csv('/content/drive/MyDrive/data/final_subset.csv',sep=';')

In [66]:
# Split the merged data by framework programme and print the shape of each
# part, i.e. how many matched rows each programme contributes
funding_body_column = merged_data['funding_body']
filtered_fp7 = merged_data[funding_body_column == 'FP7']
filtered_h2020 = merged_data[funding_body_column == 'H2020']
filtered_he = merged_data[funding_body_column == 'HE']

shape_reports = [
    ('The shape of the FP7 filtered is:', filtered_fp7),
    ('The shape of the H2020 filtered is:', filtered_h2020),
    ('The shape of the HORIZON E filtered is:', filtered_he),
]
for position, (label, frame) in enumerate(shape_reports):
    # Separator line between consecutive reports (not before the first one)
    if position:
        print(' ---- ')
    print(label, frame.shape)

The shape of the FP7 filtered is: (1705, 52)
 ---- 
The shape of the H2020 filtered is: (1728, 52)
 ---- 
The shape of the HORIZON E filtered is: (3, 52)
