# Merging the datasets of publications related to COVID-19

In [1]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from string import punctuation

## 1. Defining the required functions

In [2]:
# Defining the function "clean_title".
def clean_title(title):
    if len(title) == 1 and title in punctuation:
        return None
    if title[0] in punctuation:
        title = title[1:]
    if title[-1] in punctuation:
        title = title[:-1]
    return re.sub(r"\s+", " ", title).lower()

## 2. Getting and preprocessing the datasets

### 2.1. arXiv

In [3]:
# Getting the data.
df_arxiv = pd.read_csv("../../data/prepared/arxiv_covid_19.csv", header=0, dtype={"id": "str"})

In [4]:
# Checking the dataframe.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2009.11008,"('Image and Video Processing', 'Computer Visio...",Attention with Multiple Sources Knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23
1,2009.10931,"('Quantitative Methods', 'Machine Learning')",Drug Repurposing for COVID-19 using Graph Neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23
2,2009.10808,"('Machine Learning', 'Applications')",Using Machine Learning to Develop a Novel COVI...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22
3,2009.10648,"('Social and Information Networks', 'Physics a...",Google COVID-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17
4,2009.10608,"('Image and Video Processing', 'Computer Visio...",Dual Encoder Fusion U-Net (DEFU-Net) for Cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11


In [5]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2335 non-null   object
 1   subject_areas     2335 non-null   object
 2   title             2335 non-null   object
 3   authors           2335 non-null   object
 4   abstract          2335 non-null   object
 5   publication_date  2335 non-null   object
dtypes: object(6)
memory usage: 109.6+ KB


In [6]:
# Defining the "None" value for the "NaN" values.
df_arxiv.replace({np.nan: None}, inplace=True)

In [7]:
# Changing the type of features.
df_arxiv.loc[:, ["subject_areas", "authors"]] = df_arxiv.loc[:, ["subject_areas", "authors"]].apply(
    lambda x: x.apply(eval))
df_arxiv.publication_date = pd.to_datetime(df_arxiv.publication_date)

In [8]:
# Defining the feature "source".
df_arxiv["source"] = "arXiv"

In [9]:
# Normalizing the feature "title".
df_arxiv.title = df_arxiv.title.apply(clean_title)

In [10]:
# Checking the result.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,source
0,2009.11008,"(Image and Video Processing, Computer Vision a...",attention with multiple sources knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23,arXiv
1,2009.10931,"(Quantitative Methods, Machine Learning)",drug repurposing for covid-19 using graph neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23,arXiv
2,2009.10808,"(Machine Learning, Applications)",using machine learning to develop a novel covi...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22,arXiv
3,2009.10648,"(Social and Information Networks, Physics and ...",google covid-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17,arXiv
4,2009.10608,"(Image and Video Processing, Computer Vision a...",dual encoder fusion u-net (defu-net) for cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11,arXiv


In [11]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                2335 non-null   object        
 1   subject_areas     2335 non-null   object        
 2   title             2335 non-null   object        
 3   authors           2335 non-null   object        
 4   abstract          2335 non-null   object        
 5   publication_date  2335 non-null   datetime64[ns]
 6   source            2335 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 127.8+ KB


### 2.2. bioRxiv

In [12]:
# Getting the data.
df_biorxiv = pd.read_csv("../../data/prepared/biorxiv_covid_19.csv", header=0, dtype={"id": "str"})

In [13]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
0,10.1101/2020.08.25.20181545,"""I walk around like my hands are covered in mu...",2020-08-31,medRxiv,2020.08.25.20181545,Objectives: To investigate how and why Canadia...,"({'id': '-4223852819809795845', 'name': 'Robyn...","('public and global health',)"
1,10.1101/2020.05.28.120709,"""Monoclonal-type"" plastic antibodies for SARS-...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '4304627819863036756', 'name': 'France...","('synthetic biology',)"
2,10.1101/2020.04.16.20067884,"""No test is better than a bad test"": Impact of...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-1190320489506440084', 'name': 'Nicho...","('epidemiology',)"
3,10.1101/2020.06.04.20122812,'Drawing on Wisdom to Cope with Adversity:' A ...,2020-06-07,medRxiv,2020.06.04.20122812,Background: Mental health has become one of th...,"({'id': '6026351697434995649', 'name': 'Jose M...","('psychiatry and clinical psychology',)"
4,10.1101/2020.07.11.20151308,'Trained immunity' from Mycobacterium spp. exp...,2020-07-14,medRxiv,2020.07.11.20151308,Protective variables for COVID-19 are unknown....,"({'id': '-655212003565356990', 'name': 'Samer ...","('infectious diseases',)"


In [14]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9027 entries, 0 to 9026
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   doi               9027 non-null   object
 1   title             9027 non-null   object
 2   publication_date  9027 non-null   object
 3   source            9027 non-null   object
 4   id                9027 non-null   object
 5   abstract          9023 non-null   object
 6   author_affil      9027 non-null   object
 7   subject_areas     9027 non-null   object
dtypes: object(8)
memory usage: 564.3+ KB


In [15]:
# Defining the "None" value for the "NaN" values.
df_biorxiv.replace({np.nan: None}, inplace=True)

In [16]:
# Changing the type of features.
df_biorxiv.author_affil.loc[df_biorxiv.author_affil.notnull()] = df_biorxiv.author_affil.loc[
    df_biorxiv.author_affil.notnull()].apply(eval)
df_biorxiv.subject_areas.loc[df_biorxiv.subject_areas.notnull()] = df_biorxiv.subject_areas.loc[
    df_biorxiv.subject_areas.notnull()].apply(eval)
df_biorxiv.publication_date = pd.to_datetime(df_biorxiv.publication_date)

In [17]:
# Normalizing the feature "title".
df_biorxiv.title = df_biorxiv.title.apply(clean_title)

In [18]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
0,10.1101/2020.08.25.20181545,i walk around like my hands are covered in mud...,2020-08-31,medRxiv,2020.08.25.20181545,Objectives: To investigate how and why Canadia...,"({'id': '-4223852819809795845', 'name': 'Robyn...","(public and global health,)"
1,10.1101/2020.05.28.120709,"monoclonal-type"" plastic antibodies for sars-c...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '4304627819863036756', 'name': 'France...","(synthetic biology,)"
2,10.1101/2020.04.16.20067884,"no test is better than a bad test"": impact of ...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-1190320489506440084', 'name': 'Nicho...","(epidemiology,)"
3,10.1101/2020.06.04.20122812,drawing on wisdom to cope with adversity:' a s...,2020-06-07,medRxiv,2020.06.04.20122812,Background: Mental health has become one of th...,"({'id': '6026351697434995649', 'name': 'Jose M...","(psychiatry and clinical psychology,)"
4,10.1101/2020.07.11.20151308,trained immunity' from mycobacterium spp. expo...,2020-07-14,medRxiv,2020.07.11.20151308,Protective variables for COVID-19 are unknown....,"({'id': '-655212003565356990', 'name': 'Samer ...","(infectious diseases,)"


In [19]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9027 entries, 0 to 9026
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   doi               9027 non-null   object        
 1   title             9027 non-null   object        
 2   publication_date  9027 non-null   datetime64[ns]
 3   source            9027 non-null   object        
 4   id                9027 non-null   object        
 5   abstract          9023 non-null   object        
 6   author_affil      9027 non-null   object        
 7   subject_areas     9027 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 564.3+ KB


### 2.3. PubMed

In [20]:
# Getting the data.
df_pubmed = pd.read_csv("../../data/prepared/pubmed_covid_19.csv", header=0, dtype={"pubmed_id": "str"})

In [21]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,32966253,Post-COVID-19 management guidelines for orthod...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'Jae Hyun Park', 'id': '491700415152...",
1,32966252,Orthodontics in the COVID-19 Era: The way forw...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'M Srirengalakshmi', 'id': '26238586...",
2,32964105,Unintended consequences of COVID-19: Opportuni...,,"('coronavirus infections', 'mechanical ventila...",Canadian journal of respiratory therapy : CJRT...,2020-09-24,"({'name': 'Patricia McClurg', 'id': '-28749460...",
3,32965930,StatPearls,Amidst the coronavirus 2019-nCoV (COVID-19) pa...,,,2020,"({'name': 'Onyinyechukwu Okorji', 'id': '65138...",
4,32963099,,The membrane-anchored spike (S) protein of sev...,"('S2 fusion peptide-containing domain', 'coron...",mSystems,2020-09-24,"({'name': 'Nishant Shekhar', 'id': '7707225331...",10.1128/mSystems.00382-20


In [22]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55503 entries, 0 to 55502
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pubmed_id         55503 non-null  object
 1   title             55407 non-null  object
 2   abstract          32029 non-null  object
 3   auth_keywords     28215 non-null  object
 4   vehicle_name      55485 non-null  object
 5   publication_date  55503 non-null  object
 6   author_affil      54983 non-null  object
 7   doi               54480 non-null  object
dtypes: object(8)
memory usage: 3.4+ MB


In [23]:
# Defining the "None" value for the "NaN" values.
df_pubmed.replace({np.nan: None}, inplace=True)

In [24]:
# Changing the type of features.
df_pubmed.auth_keywords.loc[df_pubmed.auth_keywords.notnull()] = df_pubmed.auth_keywords.loc[
    df_pubmed.auth_keywords.notnull()].apply(eval)
df_pubmed.author_affil.loc[df_pubmed.author_affil.notnull()] = df_pubmed.author_affil.loc[
    df_pubmed.author_affil.notnull()].apply(eval)
df_pubmed.publication_date = pd.to_datetime(df_pubmed.publication_date)

In [25]:
# Defining the feature "source".
df_pubmed["source"] = "PubMed"

In [26]:
# Normalizing the feature "title".
df_pubmed.title.loc[df_pubmed.title.notnull()] = df_pubmed.title.loc[
    df_pubmed.title.notnull()].apply(clean_title)

In [27]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi,source
0,32966253,post-covid-19 management guidelines for orthod...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'Jae Hyun Park', 'id': '491700415152...",,PubMed
1,32966252,orthodontics in the covid-19 era: the way forw...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'M Srirengalakshmi', 'id': '26238586...",,PubMed
2,32964105,unintended consequences of covid-19: opportuni...,,"(coronavirus infections, mechanical ventilator...",Canadian journal of respiratory therapy : CJRT...,2020-09-24,"({'name': 'Patricia McClurg', 'id': '-28749460...",,PubMed
3,32965930,statpearls,Amidst the coronavirus 2019-nCoV (COVID-19) pa...,,,2020-01-01,"({'name': 'Onyinyechukwu Okorji', 'id': '65138...",,PubMed
4,32963099,,The membrane-anchored spike (S) protein of sev...,"(S2 fusion peptide-containing domain, coronavi...",mSystems,2020-09-24,"({'name': 'Nishant Shekhar', 'id': '7707225331...",10.1128/mSystems.00382-20,PubMed


In [28]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55503 entries, 0 to 55502
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pubmed_id         55503 non-null  object        
 1   title             55405 non-null  object        
 2   abstract          32029 non-null  object        
 3   auth_keywords     28215 non-null  object        
 4   vehicle_name      55485 non-null  object        
 5   publication_date  55503 non-null  datetime64[ns]
 6   author_affil      54983 non-null  object        
 7   doi               54480 non-null  object        
 8   source            55503 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 3.8+ MB


### 2.4. Scopus

In [29]:
# Getting the data.
df_scopus = pd.read_csv("../../data/prepared/scopus_covid_19.csv", header=0,
    dtype={"id": "str", "eid": "str", "pii": "str", "pubmed_id": "str"})

In [30]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,index_terms,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references
0,85090092938,,,12th International Conference on Intelligent N...,The proceedings contain 48 papers. The special...,2021-01-01,0,eng,Book Series,k,...,,21945365 21945357,12th International Conference on Intelligent N...,Springer,,"('Control and Systems Engineering', 'Computer ...",,,0,
1,85090051630,,,23rd International Conference on Network-Based...,The proceedings contain 61 papers. The special...,2021-01-01,0,eng,Book Series,k,...,,21945365 21945357,23rd International Conference on Network-Based...,Springer,,"('Control and Systems Engineering', 'Computer ...",,,0,
2,85090760353,,,Open educational resources in Canada 2020,"© 2020, Canadian Network for Innovation in Edu...",2020-12-01,0,eng,Journal,j,...,,14996685,Canadian Journal of Learning and Technology,Canadian Network for Innovation in Education,"({'id': '60018510', 'affiliation': 'Athabasca ...","('Education', 'Computer Science Applications',...","({'id': '6507436970', 'name': 'Rory McGreal'},)","({'id': '6507436970', 'name': 'Rory McGreal', ...",25,"({'id': '85090794360', 'title': None, 'doi': N..."
3,85082677914,,,Industry reacts to coronavirus outbreak: Nonwo...,,2020-12-01,0,eng,Trade Journal,d,...,,01634429,Nonwovens Industry,Rodman Publications Inc.,,"('Business and International Management', 'Mat...","({'id': '57193877330', 'name': 'Tara Olivo'},)","({'id': '57193877330', 'name': 'Tara Olivo', '...",0,
4,85090891903,,,Proceedings of the 6th International Conferenc...,The proceedings contain 105 papers. The topics...,2020-09-14,0,eng,Conference Proceeding,p,...,,,6th International Conference on Engineering an...,Association for Computing Machinery,,"('Computer Networks and Communications', 'Comp...",,,0,


In [31]:
# Visualizing the information of dataset.
df_scopus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53266 entries, 0 to 53265
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                53266 non-null  object
 1   doi               51734 non-null  object
 2   pubmed_id         35006 non-null  object
 3   title             53266 non-null  object
 4   abstract          30189 non-null  object
 5   publication_date  53266 non-null  object
 6   citation_num      53266 non-null  int64 
 7   language          52982 non-null  object
 8   production_type   53266 non-null  object
 9   source_type       53266 non-null  object
 10  auth_keywords     28011 non-null  object
 11  index_terms       23975 non-null  object
 12  issn              53176 non-null  object
 13  vehicle_name      53266 non-null  object
 14  publisher         53262 non-null  object
 15  affiliations      49380 non-null  object
 16  subject_areas     53203 non-null  object
 17  authors     

In [32]:
# Defining the "None" value for the "NaN" values.
df_scopus.replace({np.nan: None}, inplace=True)

In [33]:
# Changing the type of features.
df_scopus.auth_keywords.loc[df_scopus.auth_keywords.notnull()] = df_scopus.auth_keywords.loc[
    df_scopus.auth_keywords.notnull()].apply(eval)
df_scopus.index_terms.loc[df_scopus.index_terms.notnull()] = df_scopus.index_terms.loc[
    df_scopus.index_terms.notnull()].apply(eval)
df_scopus.affiliations.loc[df_scopus.affiliations.notnull()] = df_scopus.affiliations.loc[
    df_scopus.affiliations.notnull()].apply(eval)
df_scopus.subject_areas.loc[df_scopus.subject_areas.notnull()] = df_scopus.subject_areas.loc[
    df_scopus.subject_areas.notnull()].apply(eval)
df_scopus.authors.loc[df_scopus.authors.notnull()] = df_scopus.authors.loc[
    df_scopus.authors.notnull()].apply(eval)
df_scopus.author_affil.loc[df_scopus.author_affil.notnull()] = df_scopus.author_affil.loc[
    df_scopus.author_affil.notnull()].apply(eval)
df_scopus.references.loc[df_scopus.references.notnull()] = df_scopus.references.loc[
    df_scopus.references.notnull()].apply(eval)
df_scopus.publication_date = pd.to_datetime(df_scopus.publication_date)

In [34]:
# Defining the feature "source".
df_scopus["source"] = "Scopus"

In [35]:
# Normalizing the feature "title".
df_scopus.title = df_scopus.title.apply(clean_title)

In [36]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references,source
0,85090092938,,,12th international conference on intelligent n...,The proceedings contain 48 papers. The special...,2021-01-01,0,eng,Book Series,k,...,21945365 21945357,12th International Conference on Intelligent N...,Springer,,"(Control and Systems Engineering, Computer Sci...",,,0,,Scopus
1,85090051630,,,23rd international conference on network-based...,The proceedings contain 61 papers. The special...,2021-01-01,0,eng,Book Series,k,...,21945365 21945357,23rd International Conference on Network-Based...,Springer,,"(Control and Systems Engineering, Computer Sci...",,,0,,Scopus
2,85090760353,,,open educational resources in canada 2020,"© 2020, Canadian Network for Innovation in Edu...",2020-12-01,0,eng,Journal,j,...,14996685,Canadian Journal of Learning and Technology,Canadian Network for Innovation in Education,"({'id': '60018510', 'affiliation': 'Athabasca ...","(Education, Computer Science Applications, Man...","({'id': '6507436970', 'name': 'Rory McGreal'},)","({'id': '6507436970', 'name': 'Rory McGreal', ...",25,"({'id': '85090794360', 'title': None, 'doi': N...",Scopus
3,85082677914,,,industry reacts to coronavirus outbreak: nonwo...,,2020-12-01,0,eng,Trade Journal,d,...,01634429,Nonwovens Industry,Rodman Publications Inc.,,"(Business and International Management, Materi...","({'id': '57193877330', 'name': 'Tara Olivo'},)","({'id': '57193877330', 'name': 'Tara Olivo', '...",0,,Scopus
4,85090891903,,,proceedings of the 6th international conferenc...,The proceedings contain 105 papers. The topics...,2020-09-14,0,eng,Conference Proceeding,p,...,,6th International Conference on Engineering an...,Association for Computing Machinery,,"(Computer Networks and Communications, Compute...",,,0,,Scopus


In [37]:
# Visualizing the information of dataset.
df_scopus.info()     

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53266 entries, 0 to 53265
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                53266 non-null  object        
 1   doi               51734 non-null  object        
 2   pubmed_id         35006 non-null  object        
 3   title             53266 non-null  object        
 4   abstract          30189 non-null  object        
 5   publication_date  53266 non-null  datetime64[ns]
 6   citation_num      53266 non-null  int64         
 7   language          52982 non-null  object        
 8   production_type   53266 non-null  object        
 9   source_type       53266 non-null  object        
 10  auth_keywords     28011 non-null  object        
 11  index_terms       23975 non-null  object        
 12  issn              53176 non-null  object        
 13  vehicle_name      53266 non-null  object        
 14  publisher         5326

## 3. Merging/Joining the datasets

In [38]:
# Checking the duplicated records between arXiv and bioRxiv.
df_arxiv.id[df_arxiv.title.isin(df_biorxiv.title.values)].size

138

In [39]:
# Checking the duplicated records between arXiv and PubMed.
df_arxiv.id[df_arxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)].size

206

In [40]:
# Checking the duplicated records between arXiv and Scopus.
df_arxiv.id[df_arxiv.title.isin(df_scopus.title.values)].size

187

In [41]:
# Checking the duplicated records between bioRxiv and PubMed.
df_biorxiv.id[df_biorxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) &
    df_biorxiv.doi.isin(df_pubmed.doi[df_pubmed.doi.notnull()].values)].size

787

In [42]:
# Checking the duplicated records between bioRxiv and Scopus.
df_biorxiv.id[df_biorxiv.title.isin(df_scopus.title.values) &
    df_biorxiv.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].size

0

In [43]:
# Checking the duplicated records between PubMed and Scopus by only the feature "pubmed_id".
df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[df_scopus.pubmed_id.notnull()].values)].size

32741

In [44]:
# Checking the duplicated records between PubMed and Scopus by the features "title" and "doi".
df_pubmed.pubmed_id[~df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[df_scopus.pubmed_id.notnull()].values) &
    df_pubmed.title.isin(df_scopus.title.values) &
    df_pubmed.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].size

9068

In [45]:
# Filling the missing values of PubMed's features "title" and "doi" with data from Scopus.
df_pubmed.loc[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.title.isnull(), "title"] = \
    df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.title.isnull()].apply(
        lambda x: df_scopus.title[df_scopus.pubmed_id == x].item())
df_pubmed.loc[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.doi.isnull(), "doi"] = \
    df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.doi.isnull()].apply(
        lambda x: np.reshape(df_scopus.doi[df_scopus.pubmed_id == x].values, -1)[0] \
            if df_scopus.doi[df_scopus.pubmed_id == x].size > 0 else None)
df_pubmed.loc[df_pubmed.doi[df_pubmed.doi.notnull()].isin(df_scopus.doi[df_scopus.doi.notnull()].values) &
    df_pubmed.title.isnull(), "title"] = df_pubmed.doi[df_pubmed.doi[df_pubmed.doi.notnull()].isin(
        df_scopus.doi[df_scopus.doi.notnull()].values) & df_pubmed.title.isnull()].apply(
            lambda x: df_scopus.title[df_scopus.doi == x].item())

In [46]:
# Filling the missing values of PubMed's features "title", "abstract", "subject_areas" and "doi" with data from bioRxiv.
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.title.isnull(), "title"] = \
    df_pubmed.doi[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.title.isnull()].apply(
        lambda x: df_biorxiv.title[df_biorxiv.doi == x].item())
df_pubmed.loc[df_pubmed.title.isin(df_biorxiv.title.values) & df_pubmed.doi.isnull(), "doi"] = \
    df_pubmed.loc[df_pubmed.title.isin(df_biorxiv.title.values) & df_pubmed.doi.isnull(), ["doi", "title"]].apply(
        lambda x: df_biorxiv.doi[df_biorxiv.title == x.title].item() if not x.doi else x.doi, axis=1)
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.abstract.isnull(), "abstract"] = \
    df_pubmed.doi[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.abstract.isnull()].apply(
        lambda x: df_biorxiv.abstract[df_biorxiv.doi == x].item())
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values), "subject_areas"] = df_pubmed.doi[
    df_pubmed.doi.isin(df_biorxiv.doi.values)].apply(lambda x: df_biorxiv.subject_areas[
        df_biorxiv.doi == x].item())

In [47]:
# Filling the missing values of PubMed's features "abstract" and "subject_areas" with data from arXiv.
df_pubmed.loc[df_pubmed.title.isin(df_arxiv.title.values) & df_pubmed.abstract.isnull(), "abstract"] = \
    df_pubmed.title[df_pubmed.title.isin(df_arxiv.title.values) & df_pubmed.abstract.isnull()].apply(
        lambda x: df_arxiv.abstract[df_arxiv.title == x].item())
df_pubmed.loc[df_pubmed.title.isin(df_arxiv.title.values), "subject_areas"] = df_pubmed.title[
    df_pubmed.title.isin(df_arxiv.title.values)].apply(
        lambda x: df_arxiv.subject_areas[df_arxiv.title == x].item())

In [48]:
# Filling the missing values of Scopus' features "abstract" and "subject_areas" with data from arXiv.
df_scopus.loc[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.abstract.isnull(), "abstract"] = \
    df_scopus.title[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.abstract.isnull()].apply(
        lambda x: df_arxiv.abstract[df_arxiv.title == x].item())
df_scopus.loc[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.subject_areas.isnull(),
    "subject_areas"] = df_scopus.title[df_scopus.title.isin(df_arxiv.title.values) &
        df_scopus.subject_areas.isnull()].apply(lambda x: df_arxiv.subject_areas[df_arxiv.title == x].item())

In [49]:
# Filling the missing values of Scopus' features "doi" and "pubmed_id" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.doi.isnull(), "doi"] = \
    df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.doi.isnull(),
        ["doi", "pubmed_id"]].apply(lambda x: df_pubmed.doi[df_pubmed.pubmed_id == x.pubmed_id].item() \
            if not x.doi else x.doi, axis=1)
df_scopus.loc[df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)
    & df_scopus.pubmed_id.isnull() & df_scopus.doi[
        df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "pubmed_id"] = \
df_scopus.loc[df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)
    & df_scopus.pubmed_id.isnull() & df_scopus.doi[
        df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["pubmed_id", "title", "doi"]].apply(lambda x: x.pubmed_id if x.pubmed_id else np.reshape(
        df_pubmed.pubmed_id[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.pubmed_id[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [50]:
# Filling the missing values of Scopus' feature "abstract" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.abstract.isnull(), "abstract"] = \
    df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.abstract.isnull(),
        ["abstract", "pubmed_id"]].apply(lambda x: df_pubmed.abstract[
            df_pubmed.pubmed_id == x.pubmed_id].item() if not x.abstract else x.abstract, axis=1)
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.abstract.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "abstract"] = \
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.abstract.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["abstract", "title", "doi"]].apply(lambda x: x.abstract if not x.abstract else np.reshape(
        df_pubmed.abstract[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.abstract[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [51]:
# Filling the missing values of Scopus' feature "auth_keywords" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.auth_keywords.isnull(),
    "auth_keywords"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.auth_keywords.isnull(), ["auth_keywords", "pubmed_id"]].apply(
            lambda x: df_pubmed.auth_keywords[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.auth_keywords else x.auth_keywords, axis=1)
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.auth_keywords.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "auth_keywords"] = \
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.auth_keywords.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["auth_keywords", "title", "doi"]].apply(lambda x: x.auth_keywords if x.auth_keywords else np.reshape(
        df_pubmed.auth_keywords[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.auth_keywords[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [52]:
# Filling the missing values of Scopus' features "author_affil" and "subject_areas" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.author_affil.isnull(),
    "author_affil"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.author_affil.isnull(), ["author_affil", "pubmed_id"]].apply(
            lambda x: df_pubmed.author_affil[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.author_affil else x.author_affil, axis=1)
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.subject_areas.isnull(),
    "subject_areas"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.subject_areas.isnull(), ["subject_areas", "pubmed_id"]].apply(
            lambda x: df_pubmed.subject_areas[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.subject_areas else x.subject_areas, axis=1)

In [53]:
# Removing the duplicated records between arXiv and bioRxiv.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_biorxiv.title.values)]

In [54]:
# Removing the duplicated records between arXiv and PubMed.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)]

In [55]:
# Removing the duplicated records between arXiv and Scopus.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_scopus.title.values)]

In [56]:
# Removing the duplicated records between bioRxiv and PubMed.
df_biorxiv = df_biorxiv[~(df_biorxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) &
    df_biorxiv.doi.isin(df_pubmed.doi[df_pubmed.doi.notnull()].values))]

In [57]:
# Removing the duplicated records between bioRxiv and Scopus.
df_biorxiv = df_biorxiv[~(df_biorxiv.title.isin(df_scopus.title.values) &
    df_biorxiv.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values))]

In [58]:
# Removing the duplicated records between PubMed and Scopus.
idx_removed = df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[
    df_scopus.pubmed_id.notnull()].values)].index.to_list()
idx_removed += df_pubmed.pubmed_id[~df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[
        df_scopus.pubmed_id.notnull()].values) &
    df_pubmed.title.isin(df_scopus.title.values) &
    df_pubmed.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].index.to_list()
df_pubmed = df_pubmed[~df_pubmed.index.isin(list(set(idx_removed)))]

In [59]:
# Visualizing the final number of records for each dataset.
print("arXiv:", df_arxiv.id.size)
print("bioRxiv:", df_biorxiv.id.size)
print("PubMed:", df_pubmed.pubmed_id.size)
print("Scopus:", df_scopus.id.size)
print("Expected total number of records for the final dataset:",
      (df_arxiv.id.size + df_biorxiv.id.size + df_pubmed.pubmed_id.size + df_scopus.id.size))

arXiv: 1963
bioRxiv: 8232
PubMed: 13668
Scopus: 53266
Expected total number of records for the final dataset: 77129


In [60]:
# Merging/Joining the datasets.
df_final = pd.concat([df_arxiv, df_biorxiv, df_pubmed, df_scopus], ignore_index=True)

In [61]:
# Defining the "None" value for the "NaN" values.
df_final.replace({np.nan: None}, inplace=True)

In [62]:
# Renaming the feature "source".
df_final.rename(columns={"source": "data_source"}, inplace=True)

In [63]:
# Checking the dataframe.
df_final.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2009.11008,"(Image and Video Processing, Computer Vision a...",attention with multiple sources knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23,arXiv,,,,...,,,,,,,,,,
1,2009.10931,"(Quantitative Methods, Machine Learning)",drug repurposing for covid-19 using graph neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23,arXiv,,,,...,,,,,,,,,,
2,2009.10808,"(Machine Learning, Applications)",using machine learning to develop a novel covi...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22,arXiv,,,,...,,,,,,,,,,
3,2009.10648,"(Social and Information Networks, Physics and ...",google covid-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17,arXiv,,,,...,,,,,,,,,,
4,2009.10608,"(Image and Video Processing, Computer Vision a...",dual encoder fusion u-net (defu-net) for cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11,arXiv,,,,...,,,,,,,,,,


In [64]:
# Visualizing the information of dataset.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77129 entries, 0 to 77128
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                63461 non-null  object        
 1   subject_areas     64357 non-null  object        
 2   title             77110 non-null  object        
 3   authors           54375 non-null  object        
 4   abstract          50038 non-null  object        
 5   publication_date  77129 non-null  datetime64[ns]
 6   data_source       77129 non-null  object        
 7   doi               73284 non-null  object        
 8   author_affil      74274 non-null  object        
 9   pubmed_id         57587 non-null  object        
 10  auth_keywords     36085 non-null  object        
 11  vehicle_name      66916 non-null  object        
 12  citation_num      53266 non-null  object        
 13  language          52982 non-null  object        
 14  production_type   5326

In [65]:
# Exporting the final dataset to CSV file.
df_final.to_csv("../../data/raw/final_raw.csv", index=False, quoting=csv.QUOTE_ALL)