# Merging the datasets of publications related to COVID-19

In [1]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from string import punctuation

## 1. Defining the required functions

In [2]:
# Defining the function "clean_title".
def clean_title(title):
    """Normalize a publication title so that titles can be compared across sources.

    At most one punctuation character is removed from each end of the title,
    every run of whitespace is collapsed into a single space, and the result
    is lower-cased.

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    str or None
        The normalized title, or None when ``title`` is not a string, is empty,
        or has nothing left once the edge punctuation is removed
        (e.g. ``"."`` or ``".."``).
    """
    # Missing values (NaN/None) and empty strings cannot be indexed below,
    # so they are reported as "no title" instead of raising.
    if not isinstance(title, str) or not title:
        return None
    if title[0] in punctuation:
        title = title[1:]
    # The title is already empty at this point when it was a single punctuation mark.
    if title and title[-1] in punctuation:
        title = title[:-1]
    if not title:
        return None
    return re.sub(r"\s+", " ", title).lower()

## 2. Getting and preprocessing the datasets

### 2.1. arXiv

In [3]:
# Loading the prepared arXiv records; the "id" column is read as text.
arxiv_csv_path = "../../data/prepared/arxiv_covid_19.csv"
df_arxiv = pd.read_csv(arxiv_csv_path, dtype={"id": "str"}, header=0)

In [4]:
# Checking the dataframe.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",Unveiling the molecular mechanism of SARS-CoV-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",Emotion-robust EEG Classification for Motor Im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",Estimates of the proportion of SARS-CoV-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",A mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",On the Detection of Disinformation Campaign Ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27


In [5]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                1000 non-null   object
 1   subject_areas     1000 non-null   object
 2   title             1000 non-null   object
 3   authors           1000 non-null   object
 4   abstract          1000 non-null   object
 5   publication_date  1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


In [6]:
# Parsing the stringified tuples and the publication dates into Python/pandas objects.
# NOTE(review): eval() executes whatever expression the CSV contains; this is safe only for
# trusted files (ast.literal_eval would be the stricter alternative).
for column in ("subject_areas", "authors"):
    df_arxiv[column] = df_arxiv[column].apply(eval)
df_arxiv["publication_date"] = pd.to_datetime(df_arxiv["publication_date"])

In [7]:
# Tagging every arXiv record with the name of its source.
df_arxiv.loc[:, "source"] = "arXiv"

In [8]:
# Normalizing the arXiv titles with clean_title so they can be matched against other sources.
arxiv_titles = df_arxiv["title"].apply(clean_title)
df_arxiv["title"] = arxiv_titles

In [9]:
# Checking the result.
df_arxiv.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,source
0,2005.13653,"(Biomolecules, Quantitative Methods)",unveiling the molecular mechanism of sars-cov-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27,arXiv
1,2005.13523,"(Signal Processing, Human-Computer Interaction...",emotion-robust eeg classification for motor im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23,arXiv
2,2005.13519,"(Populations and Evolution, Physics and Society)",estimates of the proportion of sars-cov-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25,arXiv
3,2005.13516,"(Populations and Evolution, Quantitative Methods)",a mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24,arXiv
4,2005.13466,"(Social and Information Networks, Cryptography...",on the detection of disinformation campaign ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27,arXiv


In [10]:
# Visualizing the information of dataset.
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                1000 non-null   object        
 1   subject_areas     1000 non-null   object        
 2   title             1000 non-null   object        
 3   authors           1000 non-null   object        
 4   abstract          1000 non-null   object        
 5   publication_date  1000 non-null   datetime64[ns]
 6   source            1000 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 54.8+ KB


### 2.2. bioRxiv

In [11]:
# Loading the prepared bioRxiv records; the "id" column is read as text.
biorxiv_csv_path = "../../data/prepared/biorxiv_covid_19.csv"
df_biorxiv = pd.read_csv(biorxiv_csv_path, dtype={"id": "str"}, header=0)

In [12]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,title,doi,id,abstract,author_affil,publication_date,source
0,"Characteristics, outcome and predictors of in-...",10.1101/2020.06.30.20143701,2020.06.30.20143701,"Since December 2019, coronavirus disease 2019 ...","({'name': 'ENRICO MARIA TRECARICHI', 'affiliat...",2020-07-02,medrxiv
1,A network-informed analysis of SARS-CoV-2 and ...,10.1101/2020.07.01.20144121,2020.07.01.20144121,Abnormal coagulation and an increased risk of ...,"({'name': 'Jun Ding', 'affiliation': 'Computat...",2020-07-02,medrxiv
2,ROX Index Predicts Intubation in Patients with...,10.1101/2020.06.30.20143867,2020.06.30.20143867,Introduction Use of high flow nasal therapy (H...,"({'name': 'Maulin Patel', 'affiliation': 'Temp...",2020-07-02,medrxiv
3,COVID-MATCH65 - A prospectively derived clinic...,10.1101/2020.06.30.20143818,2020.06.30.20143818,Due to the ongoing COVID-19 pandemic and incre...,"({'name': 'Jason A Trubiano', 'affiliation': '...",2020-07-02,medrxiv
4,Relative COVID-19 viral persistence and antibo...,10.1101/2020.07.01.20143917,2020.07.01.20143917,Importance: The COVID-19 antibody response is ...,"({'name': 'Chung-Guei Huang', 'affiliation': '...",2020-07-02,medrxiv


In [13]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6020 entries, 0 to 6019
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             6020 non-null   object
 1   doi               6020 non-null   object
 2   id                6020 non-null   object
 3   abstract          6016 non-null   object
 4   author_affil      5997 non-null   object
 5   publication_date  6020 non-null   object
 6   source            6020 non-null   object
dtypes: object(7)
memory usage: 329.3+ KB


In [14]:
# Parsing the stringified author/affiliation tuples and the publication dates.
# The column is reassigned as a whole: the chained form `df.column.loc[mask] = ...` writes
# through an intermediate Series, which raises SettingWithCopyWarning and does not update
# the dataframe when pandas Copy-on-Write is enabled.
# Only strings are evaluated, so missing values and already-parsed values pass through
# unchanged and the cell can be re-run.
# NOTE(review): eval() executes whatever expression the CSV contains; this is safe only for
# trusted files (ast.literal_eval would be the stricter alternative).
df_biorxiv["author_affil"] = df_biorxiv["author_affil"].apply(
    lambda value: eval(value) if isinstance(value, str) else value)
df_biorxiv["publication_date"] = pd.to_datetime(df_biorxiv["publication_date"])

In [15]:
# Normalizing the bioRxiv titles with clean_title so they can be matched against other sources.
biorxiv_titles = df_biorxiv["title"].apply(clean_title)
df_biorxiv["title"] = biorxiv_titles

In [16]:
# Checking the dataframe.
df_biorxiv.head()

Unnamed: 0,title,doi,id,abstract,author_affil,publication_date,source
0,"characteristics, outcome and predictors of in-...",10.1101/2020.06.30.20143701,2020.06.30.20143701,"Since December 2019, coronavirus disease 2019 ...","({'name': 'ENRICO MARIA TRECARICHI', 'affiliat...",2020-07-02,medrxiv
1,a network-informed analysis of sars-cov-2 and ...,10.1101/2020.07.01.20144121,2020.07.01.20144121,Abnormal coagulation and an increased risk of ...,"({'name': 'Jun Ding', 'affiliation': 'Computat...",2020-07-02,medrxiv
2,rox index predicts intubation in patients with...,10.1101/2020.06.30.20143867,2020.06.30.20143867,Introduction Use of high flow nasal therapy (H...,"({'name': 'Maulin Patel', 'affiliation': 'Temp...",2020-07-02,medrxiv
3,covid-match65 - a prospectively derived clinic...,10.1101/2020.06.30.20143818,2020.06.30.20143818,Due to the ongoing COVID-19 pandemic and incre...,"({'name': 'Jason A Trubiano', 'affiliation': '...",2020-07-02,medrxiv
4,relative covid-19 viral persistence and antibo...,10.1101/2020.07.01.20143917,2020.07.01.20143917,Importance: The COVID-19 antibody response is ...,"({'name': 'Chung-Guei Huang', 'affiliation': '...",2020-07-02,medrxiv


In [17]:
# Visualizing the information of dataset.
df_biorxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6020 entries, 0 to 6019
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   title             6020 non-null   object        
 1   doi               6020 non-null   object        
 2   id                6020 non-null   object        
 3   abstract          6016 non-null   object        
 4   author_affil      5997 non-null   object        
 5   publication_date  6020 non-null   datetime64[ns]
 6   source            6020 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 329.3+ KB


### 2.3. PubMed

In [18]:
# Loading the prepared PubMed records; the "pubmed_id" column is read as text.
pubmed_csv_path = "../../data/prepared/pubmed_covid_19.csv"
df_pubmed = pd.read_csv(pubmed_csv_path, dtype={"pubmed_id": "str"}, header=0)

In [19]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,32610371,Is BMI higher in younger patients with COVID-1...,Obesity has been found to be a risk factor for...,"('Age', 'COVID-19', 'Diabetes', 'Hypertension'...","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Ajay Bhasin', 'affiliation': 'Depar...",10.1002/oby.22947
1,32610364,"The association between obesity, type 2 diabet...","To explore the association between obesity, ty...","('COVID-19', 'Hypertension', 'Mexican populati...","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Edgar Denova-Gutiérrez', 'affiliati...",10.1002/oby.22946
2,32610350,Rapid Implementation of an Inpatient Telehealt...,Relaxation of laws and regulations around pri...,,Applied clinical informatics,2020-07-02,"({'name': 'Jonathan D Hron', 'affiliation': ""D...",10.1055/s-0040-1713635
3,32610334,COVID-19 Infection and Neurological Complicati...,"The present outbreak caused by SARS-CoV-2, an ...","('Coronavirus', 'Infection', 'Neurological dis...",Neuroepidemiology,2020-07-02,"({'name': 'Ettore Beghi', 'affiliation': 'Depa...",10.1159/000508991
4,32610281,Adversity as a Catalyst for Change.,,"('COVID-19', 'practice management')",The Journal of invasive cardiology,2020-07-02,"({'name': 'Mayank Dalakoti', 'affiliation': 'N...",


In [20]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28818 entries, 0 to 28817
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pubmed_id         28818 non-null  object
 1   title             28773 non-null  object
 2   abstract          14799 non-null  object
 3   auth_keywords     13369 non-null  object
 4   vehicle_name      28812 non-null  object
 5   publication_date  28818 non-null  object
 6   author_affil      28488 non-null  object
 7   doi               28322 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


In [21]:
# Parsing the stringified tuples and the publication dates into Python/pandas objects.
# Each column is reassigned as a whole: the chained form `df.column.loc[mask] = ...` writes
# through an intermediate Series, which raises SettingWithCopyWarning and does not update
# the dataframe when pandas Copy-on-Write is enabled.
# Only strings are evaluated, so missing values and already-parsed values pass through
# unchanged and the cell can be re-run.
# NOTE(review): eval() executes whatever expression the CSV contains; this is safe only for
# trusted files (ast.literal_eval would be the stricter alternative).
for column in ("auth_keywords", "author_affil"):
    df_pubmed[column] = df_pubmed[column].apply(
        lambda value: eval(value) if isinstance(value, str) else value)
df_pubmed["publication_date"] = pd.to_datetime(df_pubmed["publication_date"])

In [22]:
# Tagging every PubMed record with the name of its source.
df_pubmed.loc[:, "source"] = "PubMed"

In [23]:
# Normalizing the PubMed titles with clean_title; records without a title are left as they are.
# The column is reassigned as a whole: the chained form `df.title.loc[mask] = ...` writes
# through an intermediate Series, which raises SettingWithCopyWarning and does not update
# the dataframe when pandas Copy-on-Write is enabled.
df_pubmed["title"] = df_pubmed["title"].apply(
    lambda title: clean_title(title) if isinstance(title, str) else title)

In [24]:
# Checking the dataframe.
df_pubmed.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi,source
0,32610371,is bmi higher in younger patients with covid-1...,Obesity has been found to be a risk factor for...,"(Age, COVID-19, Diabetes, Hypertension, Obesity)","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Ajay Bhasin', 'affiliation': 'Depar...",10.1002/oby.22947,PubMed
1,32610364,"the association between obesity, type 2 diabet...","To explore the association between obesity, ty...","(COVID-19, Hypertension, Mexican population, O...","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Edgar Denova-Gutiérrez', 'affiliati...",10.1002/oby.22946,PubMed
2,32610350,rapid implementation of an inpatient telehealt...,Relaxation of laws and regulations around pri...,,Applied clinical informatics,2020-07-02,"({'name': 'Jonathan D Hron', 'affiliation': 'D...",10.1055/s-0040-1713635,PubMed
3,32610334,covid-19 infection and neurological complicati...,"The present outbreak caused by SARS-CoV-2, an ...","(Coronavirus, Infection, Neurological disorder...",Neuroepidemiology,2020-07-02,"({'name': 'Ettore Beghi', 'affiliation': 'Depa...",10.1159/000508991,PubMed
4,32610281,adversity as a catalyst for change,,"(COVID-19, practice management)",The Journal of invasive cardiology,2020-07-02,"({'name': 'Mayank Dalakoti', 'affiliation': 'N...",,PubMed


In [25]:
# Visualizing the information of dataset.
df_pubmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28818 entries, 0 to 28817
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pubmed_id         28818 non-null  object        
 1   title             28772 non-null  object        
 2   abstract          14799 non-null  object        
 3   auth_keywords     13369 non-null  object        
 4   vehicle_name      28812 non-null  object        
 5   publication_date  28818 non-null  datetime64[ns]
 6   author_affil      28488 non-null  object        
 7   doi               28322 non-null  object        
 8   source            28818 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 2.0+ MB


### 2.4. Scopus

In [26]:
# Loading the prepared Scopus records; the identifier columns are read as text.
scopus_id_columns = ("id", "eid", "pii", "pubmed_id")
df_scopus = pd.read_csv("../../data/prepared/scopus_covid_19.csv", header=0,
                        dtype={column: "str" for column in scopus_id_columns})

In [27]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,index_terms,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references
0,85084625311,10.1080/16549716.2020.1760490,32404043.0,Eco-epidemiological assessment of the COVID-19...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,"('Betacoronavirus', 'China', 'Coronavirus', 'C...",16549880,Global Health Action,Taylor and Francis Ltd.,"({'id': '60031040', 'affiliation': 'Umeå Unive...","('Health Policy', 'Public Health, Environmenta...","({'id': '7006527895', 'name': 'Peter Byass'},)","({'id': '7006527895', 'name': 'Peter Byass', '...",19,"({'id': '85056407012', 'title': 'The calendar ..."
1,85084466053,10.1080/20008198.2020.1752504,,Screening for consequences of trauma–an update...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,,20008066 20008198,European Journal of Psychotraumatology,Taylor and Francis Ltd.,"({'id': '60030118', 'affiliation': ""The Childr...","('Psychiatry and Mental Health',)","({'id': '56230641000', 'name': 'Miranda Olff'}...","({'id': '56230641000', 'name': 'Miranda Olff',...",31,"({'id': '85075168775', 'title': 'Trauma-relate..."
2,85084248154,10.1080/13814788.2020.1757312,32349550.0,Family medicine in times of ‘COVID-19’: A gene...,,2020-12-16,0,eng,Journal,j,...,"('Adaptation, Psychological', 'Coronavirus Inf...",17511402 13814788,European Journal of General Practice,Taylor and Francis Ltd,"({'id': '60018869', 'affiliation': 'Maastricht...","('Family Practice',)","({'id': '6507333381', 'name': 'An de Sutter'},...","({'id': '6507333381', 'name': 'An de Sutter', ...",0,
3,85082549986,10.1080/20964129.2020.1741325,,Regulating wildlife conservation and food safe...,"© 2020, © 2020 The Author(s). Published by Tay...",2020-12-16,4,eng,Journal,j,...,,23328878 20964129,Ecosystem Health and Sustainability,Taylor and Francis Ltd.,"({'id': '60027363', 'affiliation': 'University...","('Ecology, Evolution, Behavior and Systematics...","({'id': '57216083373', 'name': 'Jingjing Yuan'...","({'id': '57216083373', 'name': 'Jingjing Yuan'...",17,"({'id': '85074624653', 'title': 'A New Twenty-..."
4,85086878414,10.1016/j.ces.2020.115918,,Analogies between SARS-CoV-2 infection dynamic...,© 2020 Elsevier LtdThe pandemic infection of S...,2020-12-14,0,eng,Journal,j,...,,00092509,Chemical Engineering Science,Elsevier Ltd,"({'id': '60027245', 'affiliation': 'Universite...","('Chemistry (all)', 'Chemical Engineering (all...","({'id': '23470382500', 'name': 'F. Manenti'}, ...","({'id': '23470382500', 'name': 'F. Manenti', '...",36,"({'id': '85071770137', 'title': 'A dynamic neu..."


In [28]:
# Visualizing the information of dataset.
df_scopus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25029 entries, 0 to 25028
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                25029 non-null  object
 1   doi               24462 non-null  object
 2   pubmed_id         17416 non-null  object
 3   title             25029 non-null  object
 4   abstract          12810 non-null  object
 5   publication_date  25029 non-null  object
 6   citation_num      25029 non-null  int64 
 7   language          24886 non-null  object
 8   production_type   25029 non-null  object
 9   source_type       25029 non-null  object
 10  auth_keywords     11935 non-null  object
 11  index_terms       7548 non-null   object
 12  issn              25006 non-null  object
 13  vehicle_name      25029 non-null  object
 14  publisher         25029 non-null  object
 15  affiliations      22857 non-null  object
 16  subject_areas     24975 non-null  object
 17  authors     

In [29]:
# Parsing the stringified tuples and the publication dates into Python/pandas objects.
# Each column is reassigned as a whole: the chained form `df.column.loc[mask] = ...` writes
# through an intermediate Series, which raises SettingWithCopyWarning and does not update
# the dataframe when pandas Copy-on-Write is enabled.
# Only strings are evaluated, so missing values and already-parsed values pass through
# unchanged and the cell can be re-run.
# NOTE(review): eval() executes whatever expression the CSV contains; this is safe only for
# trusted files (ast.literal_eval would be the stricter alternative).
stringified_columns = ("auth_keywords", "index_terms", "affiliations", "subject_areas",
                       "authors", "author_affil", "references")
for column in stringified_columns:
    df_scopus[column] = df_scopus[column].apply(
        lambda value: eval(value) if isinstance(value, str) else value)
df_scopus["publication_date"] = pd.to_datetime(df_scopus["publication_date"])

In [30]:
# Tagging every Scopus record with the name of its source.
df_scopus.loc[:, "source"] = "Scopus"

In [31]:
# Normalizing the Scopus titles with clean_title so they can be matched against other sources.
scopus_titles = df_scopus["title"].apply(clean_title)
df_scopus["title"] = scopus_titles

In [32]:
# Checking the dataframe.
df_scopus.head()

Unnamed: 0,id,doi,pubmed_id,title,abstract,publication_date,citation_num,language,production_type,source_type,...,issn,vehicle_name,publisher,affiliations,subject_areas,authors,author_affil,ref_count,references,source
0,85084625311,10.1080/16549716.2020.1760490,32404043.0,eco-epidemiological assessment of the covid-19...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,16549880,Global Health Action,Taylor and Francis Ltd.,"({'id': '60031040', 'affiliation': 'Umeå Unive...","(Health Policy, Public Health, Environmental a...","({'id': '7006527895', 'name': 'Peter Byass'},)","({'id': '7006527895', 'name': 'Peter Byass', '...",19,"({'id': '85056407012', 'title': 'The calendar ...",Scopus
1,85084466053,10.1080/20008198.2020.1752504,,screening for consequences of trauma–an update...,"© 2020, © 2020 The Author(s). Published by Inf...",2020-12-31,0,eng,Journal,j,...,20008066 20008198,European Journal of Psychotraumatology,Taylor and Francis Ltd.,"({'id': '60030118', 'affiliation': 'The Childr...","(Psychiatry and Mental Health,)","({'id': '56230641000', 'name': 'Miranda Olff'}...","({'id': '56230641000', 'name': 'Miranda Olff',...",31,"({'id': '85075168775', 'title': 'Trauma-relate...",Scopus
2,85084248154,10.1080/13814788.2020.1757312,32349550.0,family medicine in times of ‘covid-19’: a gene...,,2020-12-16,0,eng,Journal,j,...,17511402 13814788,European Journal of General Practice,Taylor and Francis Ltd,"({'id': '60018869', 'affiliation': 'Maastricht...","(Family Practice,)","({'id': '6507333381', 'name': 'An de Sutter'},...","({'id': '6507333381', 'name': 'An de Sutter', ...",0,,Scopus
3,85082549986,10.1080/20964129.2020.1741325,,regulating wildlife conservation and food safe...,"© 2020, © 2020 The Author(s). Published by Tay...",2020-12-16,4,eng,Journal,j,...,23328878 20964129,Ecosystem Health and Sustainability,Taylor and Francis Ltd.,"({'id': '60027363', 'affiliation': 'University...","(Ecology, Evolution, Behavior and Systematics,...","({'id': '57216083373', 'name': 'Jingjing Yuan'...","({'id': '57216083373', 'name': 'Jingjing Yuan'...",17,"({'id': '85074624653', 'title': 'A New Twenty-...",Scopus
4,85086878414,10.1016/j.ces.2020.115918,,analogies between sars-cov-2 infection dynamic...,© 2020 Elsevier LtdThe pandemic infection of S...,2020-12-14,0,eng,Journal,j,...,00092509,Chemical Engineering Science,Elsevier Ltd,"({'id': '60027245', 'affiliation': 'Universite...","(Chemistry (all), Chemical Engineering (all), ...","({'id': '23470382500', 'name': 'F. Manenti'}, ...","({'id': '23470382500', 'name': 'F. Manenti', '...",36,"({'id': '85071770137', 'title': 'A dynamic neu...",Scopus


In [33]:
# Visualizing the information of dataset.
df_scopus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25029 entries, 0 to 25028
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                25029 non-null  object        
 1   doi               24462 non-null  object        
 2   pubmed_id         17416 non-null  object        
 3   title             25029 non-null  object        
 4   abstract          12810 non-null  object        
 5   publication_date  25029 non-null  datetime64[ns]
 6   citation_num      25029 non-null  int64         
 7   language          24886 non-null  object        
 8   production_type   25029 non-null  object        
 9   source_type       25029 non-null  object        
 10  auth_keywords     11935 non-null  object        
 11  index_terms       7548 non-null   object        
 12  issn              25006 non-null  object        
 13  vehicle_name      25029 non-null  object        
 14  publisher         2502

## 3. Merging/Joining the datasets

In [34]:
# Counting the Scopus records whose normalized title also appears in arXiv.
title_in_arxiv = df_scopus["title"].isin(df_arxiv["title"])
int(title_in_arxiv.sum())

26

In [35]:
# Counting the Scopus records whose normalized title also appears in bioRxiv.
title_in_biorxiv = df_scopus["title"].isin(df_biorxiv["title"])
int(title_in_biorxiv.sum())

206

In [36]:
# Counting the Scopus records that match PubMed on both "pubmed_id" and "title".
same_pubmed_id = df_scopus["pubmed_id"].isin(df_pubmed["pubmed_id"])
same_title = df_scopus["title"].isin(df_pubmed["title"])
int((same_pubmed_id & same_title).sum())

14411

In [37]:
# Counting the Scopus records that match PubMed on "pubmed_id" but not on "title".
same_pubmed_id = df_scopus["pubmed_id"].isin(df_pubmed["pubmed_id"])
same_title = df_scopus["title"].isin(df_pubmed["title"])
int((same_pubmed_id & ~same_title).sum())

2074

In [38]:
# Counting the Scopus records that match PubMed on "title" but not on "pubmed_id".
same_pubmed_id = df_scopus["pubmed_id"].isin(df_pubmed["pubmed_id"])
same_title = df_scopus["title"].isin(df_pubmed["title"])
int((~same_pubmed_id & same_title).sum())

3937

In [39]:
# Dropping the arXiv records whose normalized title is already present in Scopus.
arxiv_in_scopus = df_arxiv["title"].isin(df_scopus["title"])
df_arxiv = df_arxiv.loc[~arxiv_in_scopus]

In [40]:
# Dropping the bioRxiv records whose normalized title is already present in Scopus.
biorxiv_in_scopus = df_biorxiv["title"].isin(df_scopus["title"])
df_biorxiv = df_biorxiv.loc[~biorxiv_in_scopus]

In [41]:
# Removing the PubMed records already present in Scopus, i.e. those sharing the PubMed id,
# the normalized title, or both.
# A record is a duplicate when either condition holds, so one boolean OR mask covers the
# "id and title", "title only" and "id only" cases without scanning the columns three times
# or going through a list of index labels.
same_pubmed_id = df_pubmed["pubmed_id"].isin(df_scopus["pubmed_id"])
same_title = df_pubmed["title"].isin(df_scopus["title"])
is_duplicate = same_pubmed_id | same_title
# Index labels of the dropped rows, kept under the same name for later inspection.
idx_removed = df_pubmed.index[is_duplicate].to_list()
df_pubmed = df_pubmed.loc[~is_duplicate]

In [42]:
# Reporting how many records each dataset holds after deduplication, and their sum.
source_counts = [
    ("arXiv:", len(df_arxiv)),
    ("bioRxiv:", len(df_biorxiv)),
    ("PubMed:", len(df_pubmed)),
    ("Scopus:", len(df_scopus)),
]
for label, count in source_counts:
    print(label, count)
print("Expected total number of records for the final dataset:",
      sum(count for _, count in source_counts))

arXiv: 974
bioRxiv: 5815
PubMed: 8400
Scopus: 25029
Expected total number of records for the final dataset: 40218


In [43]:
# Stacking the four deduplicated datasets into one dataframe with a fresh 0..n-1 index.
source_frames = [df_arxiv, df_biorxiv, df_pubmed, df_scopus]
df_final = pd.concat(source_frames, ignore_index=True)

In [44]:
# Replacing every NaN with None in the merged dataframe.
nan_to_none = {np.nan: None}
df_final = df_final.replace(nan_to_none)

In [45]:
# Renaming the column "source" to "data_source".
column_renames = {"source": "data_source"}
df_final = df_final.rename(columns=column_renames)

In [46]:
# Checking the dataframe.
df_final.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2005.13653,"(Biomolecules, Quantitative Methods)",unveiling the molecular mechanism of sars-cov-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27,arXiv,,,,...,,,,,,,,,,
1,2005.13523,"(Signal Processing, Human-Computer Interaction...",emotion-robust eeg classification for motor im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23,arXiv,,,,...,,,,,,,,,,
2,2005.13519,"(Populations and Evolution, Physics and Society)",estimates of the proportion of sars-cov-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25,arXiv,,,,...,,,,,,,,,,
3,2005.13516,"(Populations and Evolution, Quantitative Methods)",a mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24,arXiv,,,,...,,,,,,,,,,
4,2005.13466,"(Social and Information Networks, Cryptography...",on the detection of disinformation campaign ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27,arXiv,,,,...,,,,,,,,,,


In [47]:
# Visualizing the information of dataset.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40218 entries, 0 to 40217
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                31818 non-null  object        
 1   subject_areas     25949 non-null  object        
 2   title             40193 non-null  object        
 3   authors           25610 non-null  object        
 4   abstract          24447 non-null  object        
 5   publication_date  40218 non-null  datetime64[ns]
 6   data_source       40218 non-null  object        
 7   doi               38454 non-null  object        
 8   author_affil      38770 non-null  object        
 9   pubmed_id         25816 non-null  object        
 10  auth_keywords     15908 non-null  object        
 11  vehicle_name      33423 non-null  object        
 12  citation_num      25029 non-null  object        
 13  language          24886 non-null  object        
 14  production_type   2502

In [48]:
# Writing the merged dataset to a CSV file, without the index and with every field quoted.
final_csv_path = "../../data/raw/final_raw.csv"
df_final.to_csv(final_csv_path, quoting=csv.QUOTE_ALL, index=False)