# Merging the datasets of publications related to COVID-19

In [1]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from string import punctuation

## 1. Defining the required functions

In [2]:
# Defining the function "clean_title".
def clean_title(title):
    if len(title) == 1 and title in punctuation:
        return None
    if title[0] in punctuation:
        title = title[1:]
    if title[-1] in punctuation:
        title = title[:-1]
    return re.sub(r"\s+", " ", title).lower()

## 2. Getting and preprocessing the datasets

### 2.1. arXiv

In [3]:
# Getting the data.
df_arxiv = pd.read_csv("../../data/prepared/arxiv_covid_19.csv", header=0, dtype={"id": "str"})

In [4]:
# Checking the dataframe.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2103.03219,"('General Finance',)",The Impact of COVID-19 on Stock Market Volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11
1,2103.03055,"('Image and Video Processing', 'Computer Visio...",Self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04
2,2103.03038,"('Computer Vision and Pattern Recognition',)",Mobile Touchless Fingerprint Recognition: Impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04
3,2103.02961,"('Image and Video Processing', 'Computer Visio...",Probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04
4,2103.02917,"('Computers and Society', 'Computation and Lan...",MP Twitter Engagement and Abuse Post-first COV...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04


In [5]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3640 entries, 0 to 3639
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3640 non-null   object
 1   subject_areas     3640 non-null   object
 2   title             3640 non-null   object
 3   authors           3640 non-null   object
 4   abstract          3640 non-null   object
 5   publication_date  3640 non-null   object
dtypes: object(6)
memory usage: 170.8+ KB


In [6]:
# Defining the "None" value for the "NaN" values.
df_arxiv.replace({np.nan: None}, inplace=True)

In [7]:
# Changing the type of features.
df_arxiv.loc[:, ["subject_areas", "authors"]] = df_arxiv.loc[:, ["subject_areas", "authors"]].apply(
    lambda x: x.apply(eval))
df_arxiv.publication_date = pd.to_datetime(df_arxiv.publication_date)

In [8]:
# Defining the feature "source".
df_arxiv["source"] = "arXiv"

In [9]:
# Normalizing the feature "title".
df_arxiv.title = df_arxiv.title.apply(clean_title)

In [10]:
# Checking the result.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,source
0,2103.03219,"(General Finance,)",the impact of covid-19 on stock market volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11,arXiv
1,2103.03055,"(Image and Video Processing, Computer Vision a...",self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04,arXiv
2,2103.03038,"(Computer Vision and Pattern Recognition,)",mobile touchless fingerprint recognition: impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04,arXiv
3,2103.02961,"(Image and Video Processing, Computer Vision a...",probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04,arXiv
4,2103.02917,"(Computers and Society, Computation and Language)",mp twitter engagement and abuse post-first cov...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04,arXiv


In [11]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3640 entries, 0 to 3639
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                3640 non-null   object        
 1   subject_areas     3640 non-null   object        
 2   title             3640 non-null   object        
 3   authors           3640 non-null   object        
 4   abstract          3640 non-null   object        
 5   publication_date  3640 non-null   datetime64[ns]
 6   source            3640 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 199.2+ KB


### 2.2. bioRxiv

In [12]:
# Getting the data.
df_biorxiv = pd.read_csv("../../data/prepared/biorxiv_covid_19.csv", header=0, dtype={"id": "str"})

In [13]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
0,10.1101/2020.08.25.20181545,"""I walk around like my hands are covered in mu...",2020-08-31,medRxiv,2020.08.25.20181545,ObjectivesTo investigate how and why Canadians...,"({'id': '8616496335208757239', 'name': 'Robyn ...","('public and global health',)"
1,10.1101/2020.05.28.120709,"""Monoclonal-type"" plastic antibodies for SARS-...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '2070372118427817817', 'name': 'France...","('synthetic biology',)"
2,10.1101/2020.04.16.20067884,"""No test is better than a bad test"": Impact of...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-7499246138313714402', 'name': 'Nicho...","('epidemiology',)"
3,10.1101/2020.12.22.20248719,"""There's No Place Like Home for The Holidays:""...",2020-12-24,medRxiv,2020.12.22.20248719,"In the US, public health officials discouraged...","({'id': '-1353686671842246904', 'name': 'Shrut...","('epidemiology',)"
4,10.1101/2021.01.29.21250626,"""This is really like waiting for war and this ...",2021-02-01,medRxiv,2021.01.29.21250626,Healthcare professionals (HCPs) are facing rem...,"({'id': '-8861698337242705120', 'name': 'Madle...","('intensive care and critical care medicine',)"


In [14]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13789 entries, 0 to 13788
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   doi               13789 non-null  object
 1   title             13789 non-null  object
 2   publication_date  13789 non-null  object
 3   source            13789 non-null  object
 4   id                13789 non-null  object
 5   abstract          13788 non-null  object
 6   author_affil      13702 non-null  object
 7   subject_areas     13789 non-null  object
dtypes: object(8)
memory usage: 861.9+ KB


In [15]:
# Defining the "None" value for the "NaN" values.
df_biorxiv.replace({np.nan: None}, inplace=True)

In [16]:
# Changing the type of features.
df_biorxiv.author_affil.loc[df_biorxiv.author_affil.notnull()] = df_biorxiv.author_affil.loc[
    df_biorxiv.author_affil.notnull()].apply(eval)
df_biorxiv.subject_areas.loc[df_biorxiv.subject_areas.notnull()] = df_biorxiv.subject_areas.loc[
    df_biorxiv.subject_areas.notnull()].apply(eval)
df_biorxiv.publication_date = pd.to_datetime(df_biorxiv.publication_date)

In [17]:
# Normalizing the feature "title".
df_biorxiv.title = df_biorxiv.title.apply(clean_title)

In [18]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
0,10.1101/2020.08.25.20181545,i walk around like my hands are covered in mud...,2020-08-31,medRxiv,2020.08.25.20181545,ObjectivesTo investigate how and why Canadians...,"({'id': '8616496335208757239', 'name': 'Robyn ...","(public and global health,)"
1,10.1101/2020.05.28.120709,"monoclonal-type"" plastic antibodies for sars-c...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '2070372118427817817', 'name': 'France...","(synthetic biology,)"
2,10.1101/2020.04.16.20067884,"no test is better than a bad test"": impact of ...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-7499246138313714402', 'name': 'Nicho...","(epidemiology,)"
3,10.1101/2020.12.22.20248719,"there's no place like home for the holidays:"" ...",2020-12-24,medRxiv,2020.12.22.20248719,"In the US, public health officials discouraged...","({'id': '-1353686671842246904', 'name': 'Shrut...","(epidemiology,)"
4,10.1101/2021.01.29.21250626,this is really like waiting for war and this i...,2021-02-01,medRxiv,2021.01.29.21250626,Healthcare professionals (HCPs) are facing rem...,"({'id': '-8861698337242705120', 'name': 'Madle...","(intensive care and critical care medicine,)"


In [19]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13789 entries, 0 to 13788
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   doi               13789 non-null  object        
 1   title             13789 non-null  object        
 2   publication_date  13789 non-null  datetime64[ns]
 3   source            13789 non-null  object        
 4   id                13789 non-null  object        
 5   abstract          13788 non-null  object        
 6   author_affil      13702 non-null  object        
 7   subject_areas     13789 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 861.9+ KB


### 2.3. PubMed

In [20]:
# Getting the data.
df_pubmed = pd.read_csv("../../data/prepared/pubmed_covid_19.csv", header=0, dtype={"pubmed_id": "str"})

In [21]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,33666930,Deployment of the 1st Area Medical Laboratory ...,"In December 2019, an outbreak of pneumonia cau...",,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'William Washington', 'id': '3069391...",
1,33666929,The COVID-19 Army Rapid Assessment Tool (CARAT...,The COVID-19 pandemic poses unique challenges ...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Michael J Walters', 'id': '25963685...",
2,33666928,1ST Cavalry Division Forward's Defender Europe...,The 1st Cavalry Division Forward (1CD FWD) alo...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Chi L Truong', 'id': '4333851133578...",
3,33666927,A Comprehensive Overview of the US Army Dentis...,The historic outbreak of the novel coronavirus...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Shani O Thompson Burkes', 'id': '-9...",
4,33666926,"Nutrition, Immune Function, and Infectious Dis...",Consuming a diet meeting energy demands and pr...,"('COVID-19', 'energy intake', 'immune function...","Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Tracey J Smith', 'id': '-1746242993...",


In [22]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105413 entries, 0 to 105412
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   pubmed_id         105413 non-null  object
 1   title             105219 non-null  object
 2   abstract          67849 non-null   object
 3   auth_keywords     58334 non-null   object
 4   vehicle_name      105340 non-null  object
 5   publication_date  105413 non-null  object
 6   author_affil      104429 non-null  object
 7   doi               103371 non-null  object
dtypes: object(8)
memory usage: 6.4+ MB


In [23]:
# Defining the "None" value for the "NaN" values.
df_pubmed.replace({np.nan: None}, inplace=True)

In [24]:
# Changing the type of features.
df_pubmed.auth_keywords.loc[df_pubmed.auth_keywords.notnull()] = df_pubmed.auth_keywords.loc[
    df_pubmed.auth_keywords.notnull()].apply(eval)
df_pubmed.author_affil.loc[df_pubmed.author_affil.notnull()] = df_pubmed.author_affil.loc[
    df_pubmed.author_affil.notnull()].apply(eval)
df_pubmed.publication_date = pd.to_datetime(df_pubmed.publication_date)

In [25]:
# Defining the feature "source".
df_pubmed["source"] = "PubMed"

In [26]:
# Normalizing the feature "title".
df_pubmed.title.loc[df_pubmed.title.notnull()] = df_pubmed.title.loc[
    df_pubmed.title.notnull()].apply(clean_title)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [27]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi,source
0,33666930,deployment of the 1st area medical laboratory ...,"In December 2019, an outbreak of pneumonia cau...",,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'William Washington', 'id': '3069391...",,PubMed
1,33666929,the covid-19 army rapid assessment tool (carat...,The COVID-19 pandemic poses unique challenges ...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Michael J Walters', 'id': '25963685...",,PubMed
2,33666928,1st cavalry division forward's defender europe...,The 1st Cavalry Division Forward (1CD FWD) alo...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Chi L Truong', 'id': '4333851133578...",,PubMed
3,33666927,a comprehensive overview of the us army dentis...,The historic outbreak of the novel coronavirus...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Shani O Thompson Burkes', 'id': '-9...",,PubMed
4,33666926,"nutrition, immune function, and infectious dis...",Consuming a diet meeting energy demands and pr...,"(COVID-19, energy intake, immune function, mic...","Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Tracey J Smith', 'id': '-1746242993...",,PubMed


In [28]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105413 entries, 0 to 105412
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   pubmed_id         105413 non-null  object        
 1   title             105215 non-null  object        
 2   abstract          67849 non-null   object        
 3   auth_keywords     58334 non-null   object        
 4   vehicle_name      105340 non-null  object        
 5   publication_date  105413 non-null  datetime64[ns]
 6   author_affil      104429 non-null  object        
 7   doi               103371 non-null  object        
 8   source            105413 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 7.2+ MB


### 2.4. Scopus

In [29]:
# Getting the data.
df_scopus = pd.read_csv("../../data/prepared/scopus_covid_19.csv", header=0, dtype=object)

In [30]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references,period
0,85086071498,,,"Apping and visualisation of health data, le co...","© 2019 University of L'Aquila, Department of C...",2019-12-01,1.0,eng,Journal,j,...,18285961.0,DISEGNARECON,"University of L'Aquila, Department of Civil Co...","({'id': '60010110', 'affiliation': 'Università...","('Architecture', 'Visual Arts and Performing A...","({'id': '57218914310', 'name': 'Enrico Cicald'...","({'id': '57215860041', 'name': 'Michele Valent...",27.0,"({'id': '77949657266', 'title': 'Health resear...",12-2019
1,85098881043,,,CODS-COMAD 2021 - Proceedings of the 3rd ACM I...,The proceedings contain 93 papers. The topics ...,2020-01-02,0.0,eng,Conference Proceeding,p,...,,3rd ACM India Joint International Conference o...,Association for Computing Machinery,,"('Human-Computer Interaction', 'Computer Netwo...",,,0.0,,01-2020
2,85082342162,,32200398.0,The Novel Coronavirus (SARS-CoV-2) Epidemic,,2020-01-01,14.0,eng,Journal,j,...,3044602.0,"Annals of the Academy of Medicine, Singapore",NLM (Medline),"({'id': '60017161', 'affiliation': 'National U...","('Medicine (all)',)","({'id': '8161583900', 'name': 'Li Yang Hsu'}, ...","({'id': '57215908259', 'name': 'Jeremy Fy Lim'...",0.0,,01-2020
3,85083405993,,32291373.0,Gastrointestinal Presentation in COVID-19 in I...,Severe acute respiratory syndrome coronavirus ...,2020-01-01,7.0,eng,Journal,j,...,1259326.0,Acta medica Indonesiana,NLM (Medline),"({'id': '60069377', 'affiliation': 'Universita...","('Medicine (all)',)","({'id': '57202798959', 'name': 'Muhammad Khifz...","({'id': '14325991900', 'name': 'Siti Setiati',...",0.0,,01-2020
4,85083410524,,32291376.0,Clinical Progression of COVID-19 Patient with ...,"Coronavirus Disease 2019 (COVID-19), previousl...",2020-01-01,6.0,eng,Journal,j,...,1259326.0,Acta medica Indonesiana,NLM (Medline),"({'id': '60196806', 'affiliation': 'RSUP Persa...","('Medicine (all)',)","({'id': '36058554600', 'name': 'Erlina Burhan'...","({'id': '57216405941', 'name': 'Tina Reisa', '...",0.0,,01-2020


In [31]:
# Visualizing the information of dataset.
df_scopus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84526 entries, 0 to 84525
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                84526 non-null  object
 1   doi               81799 non-null  object
 2   pubmed_id         58269 non-null  object
 3   title             84526 non-null  object
 4   abstract          52326 non-null  object
 5   publication_date  84526 non-null  object
 6   citation_num      84526 non-null  object
 7   language          84077 non-null  object
 8   production_type   84526 non-null  object
 9   source_type       84526 non-null  object
 10  auth_keywords     46182 non-null  object
 11  index_terms       50712 non-null  object
 12  issn              82964 non-null  object
 13  vehicle_name      84526 non-null  object
 14  publisher         84522 non-null  object
 15  affiliations      78409 non-null  object
 16  subject_areas     84481 non-null  object
 17  authors     

In [32]:
# Defining the "None" value for the "NaN" values.
df_scopus.replace({np.nan: None}, inplace=True)

In [33]:
# Changing the type of features.
df_scopus.auth_keywords.loc[df_scopus.auth_keywords.notnull()] = df_scopus.auth_keywords.loc[
    df_scopus.auth_keywords.notnull()].apply(eval)
df_scopus.index_terms.loc[df_scopus.index_terms.notnull()] = df_scopus.index_terms.loc[
    df_scopus.index_terms.notnull()].apply(eval)
df_scopus.affiliations.loc[df_scopus.affiliations.notnull()] = df_scopus.affiliations.loc[
    df_scopus.affiliations.notnull()].apply(eval)
df_scopus.subject_areas.loc[df_scopus.subject_areas.notnull()] = df_scopus.subject_areas.loc[
    df_scopus.subject_areas.notnull()].apply(eval)
df_scopus.authors.loc[df_scopus.authors.notnull()] = df_scopus.authors.loc[
    df_scopus.authors.notnull()].apply(eval)
df_scopus.author_affil.loc[df_scopus.author_affil.notnull()] = df_scopus.author_affil.loc[
    df_scopus.author_affil.notnull()].apply(eval)
df_scopus.references.loc[df_scopus.references.notnull()] = df_scopus.references.loc[
    df_scopus.references.notnull()].apply(eval)
df_scopus.publication_date = pd.to_datetime(df_scopus.publication_date)

In [34]:
# Defining the feature "source".
df_scopus["source"] = "Scopus"

In [35]:
# Normalizing the feature "title".
df_scopus.title = df_scopus.title.apply(clean_title)

In [36]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references,period,source
0,85086071498,,,"apping and visualisation of health data, le co...","© 2019 University of L'Aquila, Department of C...",2019-12-01,1.0,eng,Journal,j,...,DISEGNARECON,"University of L'Aquila, Department of Civil Co...","({'id': '60010110', 'affiliation': 'Università...","(Architecture, Visual Arts and Performing Arts...","({'id': '57218914310', 'name': 'Enrico Cicald'...","({'id': '57215860041', 'name': 'Michele Valent...",27.0,"({'id': '77949657266', 'title': 'Health resear...",12-2019,Scopus
1,85098881043,,,cods-comad 2021 - proceedings of the 3rd acm i...,The proceedings contain 93 papers. The topics ...,2020-01-02,0.0,eng,Conference Proceeding,p,...,3rd ACM India Joint International Conference o...,Association for Computing Machinery,,"(Human-Computer Interaction, Computer Networks...",,,0.0,,01-2020,Scopus
2,85082342162,,32200398.0,the novel coronavirus (sars-cov-2) epidemic,,2020-01-01,14.0,eng,Journal,j,...,"Annals of the Academy of Medicine, Singapore",NLM (Medline),"({'id': '60017161', 'affiliation': 'National U...","(Medicine (all),)","({'id': '8161583900', 'name': 'Li Yang Hsu'}, ...","({'id': '57215908259', 'name': 'Jeremy Fy Lim'...",0.0,,01-2020,Scopus
3,85083405993,,32291373.0,gastrointestinal presentation in covid-19 in i...,Severe acute respiratory syndrome coronavirus ...,2020-01-01,7.0,eng,Journal,j,...,Acta medica Indonesiana,NLM (Medline),"({'id': '60069377', 'affiliation': 'Universita...","(Medicine (all),)","({'id': '57202798959', 'name': 'Muhammad Khifz...","({'id': '14325991900', 'name': 'Siti Setiati',...",0.0,,01-2020,Scopus
4,85083410524,,32291376.0,clinical progression of covid-19 patient with ...,"Coronavirus Disease 2019 (COVID-19), previousl...",2020-01-01,6.0,eng,Journal,j,...,Acta medica Indonesiana,NLM (Medline),"({'id': '60196806', 'affiliation': 'RSUP Persa...","(Medicine (all),)","({'id': '36058554600', 'name': 'Erlina Burhan'...","({'id': '57216405941', 'name': 'Tina Reisa', '...",0.0,,01-2020,Scopus


In [37]:
# Visualizing the information of dataset.
df_scopus.info()     

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84526 entries, 0 to 84525
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                84526 non-null  object        
 1   doi               81799 non-null  object        
 2   pubmed_id         58269 non-null  object        
 3   title             84526 non-null  object        
 4   abstract          52326 non-null  object        
 5   publication_date  84526 non-null  datetime64[ns]
 6   citation_num      84526 non-null  object        
 7   language          84077 non-null  object        
 8   production_type   84526 non-null  object        
 9   source_type       84526 non-null  object        
 10  auth_keywords     46182 non-null  object        
 11  index_terms       50712 non-null  object        
 12  issn              82964 non-null  object        
 13  vehicle_name      84526 non-null  object        
 14  publisher         8452

## 3. Merging/Joining the datasets

In [38]:
# Checking the duplicated records between arXiv and bioRxiv.
df_arxiv.id[df_arxiv.title.isin(df_biorxiv.title.values)].size

157

In [39]:
# Checking the duplicated records between arXiv and PubMed.
df_arxiv.id[df_arxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)].size

429

In [40]:
# Checking the duplicated records between arXiv and Scopus.
df_arxiv.id[df_arxiv.title.isin(df_scopus.title.values)].size

403

In [41]:
# Checking the duplicated records between bioRxiv and PubMed.
df_biorxiv.id[df_biorxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) &
    df_biorxiv.doi.isin(df_pubmed.doi[df_pubmed.doi.notnull()].values)].size

1384

In [42]:
# Checking the duplicated records between bioRxiv and Scopus.
df_biorxiv.id[df_biorxiv.title.isin(df_scopus.title.values) &
    df_biorxiv.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].size

0

In [43]:
# Checking the duplicated records between PubMed and Scopus by only the feature "pubmed_id".
df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[df_scopus.pubmed_id.notnull()].values)].size

54211

In [44]:
# Checking the duplicated records between PubMed and Scopus by the features "title" and "doi".
df_pubmed.pubmed_id[~df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[df_scopus.pubmed_id.notnull()].values) &
    df_pubmed.title.isin(df_scopus.title.values) &
    df_pubmed.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].size

10655

In [45]:
# Filling the missing values of PubMed's features "title" and "doi" with data from Scopus.
df_pubmed.loc[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.title.isnull(), "title"] = \
    df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.title.isnull()].apply(
        lambda x: df_scopus.title[df_scopus.pubmed_id == x].iloc[0])
df_pubmed.loc[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.doi.isnull(), "doi"] = \
    df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id.values) & df_pubmed.doi.isnull()].apply(
        lambda x: np.reshape(df_scopus.doi[df_scopus.pubmed_id == x].values, -1)[0] \
            if df_scopus.doi[df_scopus.pubmed_id == x].size > 0 else None)
df_pubmed.loc[df_pubmed.doi[df_pubmed.doi.notnull()].isin(df_scopus.doi[df_scopus.doi.notnull()].values) &
    df_pubmed.title.isnull(), "title"] = df_pubmed.doi[df_pubmed.doi[df_pubmed.doi.notnull()].isin(
        df_scopus.doi[df_scopus.doi.notnull()].values) & df_pubmed.title.isnull()].apply(
            lambda x: df_scopus.title[df_scopus.doi == x].item())

In [46]:
# Filling the missing values of PubMed's features "title", "abstract", "subject_areas" and "doi" with data from bioRxiv.
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.title.isnull(), "title"] = \
    df_pubmed.doi[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.title.isnull()].apply(
        lambda x: df_biorxiv.title[df_biorxiv.doi == x].item())
df_pubmed.loc[df_pubmed.title.isin(df_biorxiv.title.values) & df_pubmed.doi.isnull(), "doi"] = \
    df_pubmed.loc[df_pubmed.title.isin(df_biorxiv.title.values) & df_pubmed.doi.isnull(), ["doi", "title"]].apply(
        lambda x: df_biorxiv.doi[df_biorxiv.title == x.title].item() if not x.doi else x.doi, axis=1)
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.abstract.isnull(), "abstract"] = \
    df_pubmed.doi[df_pubmed.doi.isin(df_biorxiv.doi.values) & df_pubmed.abstract.isnull()].apply(
        lambda x: df_biorxiv.abstract[df_biorxiv.doi == x].item())
df_pubmed.loc[df_pubmed.doi.isin(df_biorxiv.doi.values), "subject_areas"] = df_pubmed.doi[
    df_pubmed.doi.isin(df_biorxiv.doi.values)].apply(lambda x: df_biorxiv.subject_areas[
        df_biorxiv.doi == x].item())

In [47]:
# Filling the missing values of PubMed's features "abstract" and "subject_areas" with data from arXiv.
df_pubmed.loc[df_pubmed.title.isin(df_arxiv.title.values) & df_pubmed.abstract.isnull(), "abstract"] = \
    df_pubmed.title[df_pubmed.title.isin(df_arxiv.title.values) & df_pubmed.abstract.isnull()].apply(
        lambda x: df_arxiv.abstract[df_arxiv.title == x].item())
df_pubmed.loc[df_pubmed.title.isin(df_arxiv.title.values), "subject_areas"] = df_pubmed.title[
    df_pubmed.title.isin(df_arxiv.title.values)].apply(
        lambda x: df_arxiv.subject_areas[df_arxiv.title == x].item())

In [48]:
# Filling the missing values of Scopus' features "abstract" and "subject_areas" with data from arXiv.
df_scopus.loc[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.abstract.isnull(), "abstract"] = \
    df_scopus.title[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.abstract.isnull()].apply(
        lambda x: df_arxiv.abstract[df_arxiv.title == x].item())
df_scopus.loc[df_scopus.title.isin(df_arxiv.title.values) & df_scopus.subject_areas.isnull(),
    "subject_areas"] = df_scopus.title[df_scopus.title.isin(df_arxiv.title.values) &
        df_scopus.subject_areas.isnull()].apply(lambda x: df_arxiv.subject_areas[df_arxiv.title == x].item())

In [49]:
# Filling the missing values of Scopus' features "doi" and "pubmed_id" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.doi.isnull(), "doi"] = \
    df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.doi.isnull(),
        ["doi", "pubmed_id"]].apply(lambda x: df_pubmed.doi[df_pubmed.pubmed_id == x.pubmed_id].item() \
            if not x.doi else x.doi, axis=1)
df_scopus.loc[df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)
    & df_scopus.pubmed_id.isnull() & df_scopus.doi[
        df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "pubmed_id"] = \
df_scopus.loc[df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)
    & df_scopus.pubmed_id.isnull() & df_scopus.doi[
        df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["pubmed_id", "title", "doi"]].apply(lambda x: x.pubmed_id if x.pubmed_id else np.reshape(
        df_pubmed.pubmed_id[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.pubmed_id[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [50]:
# Filling the missing values of Scopus' feature "abstract" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.abstract.isnull(), "abstract"] = \
    df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.abstract.isnull(),
        ["abstract", "pubmed_id"]].apply(lambda x: df_pubmed.abstract[
            df_pubmed.pubmed_id == x.pubmed_id].item() if not x.abstract else x.abstract, axis=1)
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.abstract.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "abstract"] = \
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.abstract.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["abstract", "title", "doi"]].apply(lambda x: x.abstract if not x.abstract else np.reshape(
        df_pubmed.abstract[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.abstract[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [51]:
# Filling the missing values of Scopus' feature "auth_keywords" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.auth_keywords.isnull(),
    "auth_keywords"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.auth_keywords.isnull(), ["auth_keywords", "pubmed_id"]].apply(
            lambda x: df_pubmed.auth_keywords[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.auth_keywords else x.auth_keywords, axis=1)
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.auth_keywords.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values), "auth_keywords"] = \
df_scopus.loc[~df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
    df_scopus.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) & df_scopus.auth_keywords.isnull() &
    df_scopus.doi[df_scopus.doi.notnull()].isin(df_pubmed.doi[df_pubmed.doi.notnull()].values),
    ["auth_keywords", "title", "doi"]].apply(lambda x: x.auth_keywords if x.auth_keywords else np.reshape(
        df_pubmed.auth_keywords[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].values, -1)[0] \
            if df_pubmed.auth_keywords[(df_pubmed.title == x.title) & (df_pubmed.doi == x.doi)].size > 0 \
                else None, axis=1)

In [52]:
# Filling the missing values of Scopus' features "author_affil" and "subject_areas" with data from PubMed.
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.author_affil.isnull(),
    "author_affil"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.author_affil.isnull(), ["author_affil", "pubmed_id"]].apply(
            lambda x: df_pubmed.author_affil[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.author_affil else x.author_affil, axis=1)
df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) & df_scopus.subject_areas.isnull(),
    "subject_areas"] = df_scopus.loc[df_scopus.pubmed_id.isin(df_pubmed.pubmed_id.values) &
        df_scopus.subject_areas.isnull(), ["subject_areas", "pubmed_id"]].apply(
            lambda x: df_pubmed.subject_areas[df_pubmed.pubmed_id == x.pubmed_id].item() \
                if not x.subject_areas else x.subject_areas, axis=1)

In [53]:
# Removing the duplicated records between arXiv and bioRxiv.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_biorxiv.title.values)]

In [54]:
# Removing the duplicated records between arXiv and PubMed.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values)]

In [55]:
# Removing the duplicated records between arXiv and Scopus.
df_arxiv = df_arxiv[~df_arxiv.title.isin(df_scopus.title.values)]

In [56]:
# Removing the duplicated records between bioRxiv and PubMed.
df_biorxiv = df_biorxiv[~(df_biorxiv.title.isin(df_pubmed.title[df_pubmed.title.notnull()].values) &
    df_biorxiv.doi.isin(df_pubmed.doi[df_pubmed.doi.notnull()].values))]

In [57]:
# Removing the duplicated records between bioRxiv and Scopus.
df_biorxiv = df_biorxiv[~(df_biorxiv.title.isin(df_scopus.title.values) &
    df_biorxiv.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values))]

In [58]:
# Removing the duplicated records between PubMed and Scopus.
idx_removed = df_pubmed.pubmed_id[df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[
    df_scopus.pubmed_id.notnull()].values)].index.to_list()
idx_removed += df_pubmed.pubmed_id[~df_pubmed.pubmed_id.isin(df_scopus.pubmed_id[
        df_scopus.pubmed_id.notnull()].values) &
    df_pubmed.title.isin(df_scopus.title.values) &
    df_pubmed.doi.isin(df_scopus.doi[df_scopus.doi.notnull()].values)].index.to_list()
df_pubmed = df_pubmed[~df_pubmed.index.isin(list(set(idx_removed)))]

In [59]:
# Visualizing the final number of records for each dataset.
print("arXiv:", df_arxiv.id.size)
print("bioRxiv:", df_biorxiv.id.size)
print("PubMed:", df_pubmed.pubmed_id.size)
print("Scopus:", df_scopus.id.size)
print("Expected total number of records for the final dataset:",
      (df_arxiv.id.size + df_biorxiv.id.size + df_pubmed.pubmed_id.size + df_scopus.id.size))

arXiv: 2972
bioRxiv: 12389
PubMed: 40512
Scopus: 84526
Expected total number of records for the final dataset: 140399


In [60]:
# Merging/Joining the datasets.
df_final = pd.concat([df_arxiv, df_biorxiv, df_pubmed, df_scopus], ignore_index=True)

In [61]:
# Defining the "None" value for the "NaN" values.
df_final.replace({np.nan: None}, inplace=True)

In [62]:
# Renaming the feature "source".
df_final.rename(columns={"source": "data_source"}, inplace=True)

In [63]:
# Checking the dataframe.
df_final.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references,period
0,2103.03219,"(General Finance,)",the impact of covid-19 on stock market volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11,arXiv,,,,...,,,,,,,,,,
1,2103.03055,"(Image and Video Processing, Computer Vision a...",self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04,arXiv,,,,...,,,,,,,,,,
2,2103.03038,"(Computer Vision and Pattern Recognition,)",mobile touchless fingerprint recognition: impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04,arXiv,,,,...,,,,,,,,,,
3,2103.02961,"(Image and Video Processing, Computer Vision a...",probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04,arXiv,,,,...,,,,,,,,,,
4,2103.02917,"(Computers and Society, Computation and Language)",mp twitter engagement and abuse post-first cov...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04,arXiv,,,,...,,,,,,,,,,


In [64]:
# Visualizing the information of dataset.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140399 entries, 0 to 140398
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                99887 non-null   object        
 1   subject_areas     101557 non-null  object        
 2   title             140317 non-null  object        
 3   authors           86143 non-null   object        
 4   abstract          97706 non-null   object        
 5   publication_date  140399 non-null  datetime64[ns]
 6   data_source       140399 non-null  object        
 7   doi               133707 non-null  object        
 8   author_affil      135778 non-null  object        
 9   pubmed_id         109227 non-null  object        
 10  auth_keywords     71295 non-null   object        
 11  vehicle_name      124965 non-null  object        
 12  citation_num      84526 non-null   object        
 13  language          84077 non-null   object        
 14  prod

In [65]:
# Exporting the final dataset to CSV file.
df_final.to_csv("../../data/raw/final_raw.csv", index=False, quoting=csv.QUOTE_ALL)