# Cleaning and Preprocessing the final dataset of publications related to COVID-19

In [1]:
# Importing the required libraries.
import ast
import csv

import numpy as np
import pandas as pd

## 1. Generating the dataframe from the raw data

In [2]:
# Loading the raw CSV into a dataframe; "id" and "pubmed_id" are read as strings.
df_data = pd.read_csv(
    "../../data/raw/final_raw.csv",
    header=0,
    dtype=dict.fromkeys(("id", "pubmed_id"), "str"),
)

In [3]:
# Previewing the first five rows of the raw dataframe.
df_data.head(n=5)

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2005.13653,"('Biomolecules', 'Quantitative Methods')",unveiling the molecular mechanism of sars-cov-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27,arXiv,,,,...,,,,,,,,,,
1,2005.13523,"('Signal Processing', 'Human-Computer Interact...",emotion-robust eeg classification for motor im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23,arXiv,,,,...,,,,,,,,,,
2,2005.13519,"('Populations and Evolution', 'Physics and Soc...",estimates of the proportion of sars-cov-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25,arXiv,,,,...,,,,,,,,,,
3,2005.13516,"('Populations and Evolution', 'Quantitative Me...",a mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24,arXiv,,,,...,,,,,,,,,,
4,2005.13466,"('Social and Information Networks', 'Cryptogra...",on the detection of disinformation campaign ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27,arXiv,,,,...,,,,,,,,,,


In [4]:
# Summarizing the raw dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40218 entries, 0 to 40217
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                31818 non-null  object 
 1   subject_areas     25949 non-null  object 
 2   title             40193 non-null  object 
 3   authors           25610 non-null  object 
 4   abstract          24447 non-null  object 
 5   publication_date  40218 non-null  object 
 6   data_source       40218 non-null  object 
 7   doi               38454 non-null  object 
 8   author_affil      38770 non-null  object 
 9   pubmed_id         25816 non-null  object 
 10  auth_keywords     15908 non-null  object 
 11  vehicle_name      33423 non-null  object 
 12  citation_num      25029 non-null  float64
 13  language          24886 non-null  object 
 14  production_type   25029 non-null  object 
 15  source_type       25029 non-null  object 
 16  index_terms       7548 non-null   object

## 2. Cleaning and preprocessing the dataframe

In [5]:
# Replacing every NaN with None, so that missing cells are falsy
# when the stringified columns are parsed in the next step.
df_data = df_data.replace({np.nan: None})

In [6]:
# Changing the type of features.
# These columns hold Python literals (tuples / dicts) serialized as strings in the CSV.
# ast.literal_eval is used instead of eval: it only accepts literal syntax, so a
# corrupt or malicious cell in the CSV cannot execute arbitrary code.
literal_columns = ["auth_keywords", "index_terms", "affiliations", "subject_areas",
                   "authors", "author_affil", "references"]
df_data.loc[:, literal_columns] = df_data.loc[:, literal_columns].apply(
    # Missing cells (None) and empty strings are falsy and are mapped to None.
    lambda column: column.apply(lambda cell: ast.literal_eval(cell) if cell else None))

# Converting the publication date to a datetime column.
df_data.publication_date = pd.to_datetime(df_data.publication_date)

In [7]:
# Defining the "zero" value for the articles without numbers of citation and references.
# The assignment goes through df_data.loc[rows, column] in a single step. The previous
# form (df_data.column.loc[rows] = 0) was a chained assignment, which may write to a
# temporary object and leave df_data unchanged (always the case under Copy-on-Write).
df_data.loc[df_data.citation_num.isnull(), "citation_num"] = 0
df_data.loc[df_data.ref_count.isnull(), "ref_count"] = 0

In [8]:
# Extracting the missing authors from the feature "author_affil".
# The row mask is computed once, so the rows read and the rows written are the same.
missing_authors = df_data.authors.isnull() & df_data.author_affil.notnull()

# For each selected row, keep one {"name": ...} dict per entry with a non-empty name.
# The result is a Series indexed like the selected rows, so the assignment below is
# index-aligned, and it goes through df_data.loc[rows, column] to avoid chained
# assignment (which may write to a temporary object instead of df_data).
df_data.loc[missing_authors, "authors"] = df_data.loc[missing_authors, "author_affil"].apply(
    lambda entries: tuple({"name": entry["name"]} for entry in entries if entry["name"]))

In [9]:
# Removing the empty lists of authors.
# Each cell is compared with the empty tuple individually, and the assignment goes
# through df_data.loc[rows, column] to avoid chained assignment (which may write to
# a temporary object instead of df_data).
empty_authors = df_data.authors.apply(lambda names: names == ())
df_data.loc[empty_authors, "authors"] = None

In [10]:
# Extracting the missing affiliations from the feature "author_affil".
# The row mask is computed once, so the rows read and the rows written are the same.
missing_affiliations = df_data.affiliations.isnull() & df_data.author_affil.notnull()

# For each selected row, keep one {"affiliation": ...} dict per entry with a non-empty
# affiliation. The result is a Series indexed like the selected rows, so the assignment
# below is index-aligned, and it goes through df_data.loc[rows, column] to avoid chained
# assignment (which may write to a temporary object instead of df_data).
df_data.loc[missing_affiliations, "affiliations"] = df_data.loc[missing_affiliations, "author_affil"].apply(
    lambda entries: tuple({"affiliation": entry["affiliation"]} for entry in entries if entry["affiliation"]))

In [11]:
# Removing the empty lists of affiliations.
# Each cell is compared with the empty tuple individually, and the assignment goes
# through df_data.loc[rows, column] to avoid chained assignment (which may write to
# a temporary object instead of df_data).
empty_affiliations = df_data.affiliations.apply(lambda affils: affils == ())
df_data.loc[empty_affiliations, "affiliations"] = None

In [12]:
# Replacing any NaN left in the dataframe with None once more, after the cleaning steps.
df_data = df_data.replace({np.nan: None})

In [13]:
# Previewing the first five rows of the cleaned dataframe.
df_data.head(n=5)

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2005.13653,"(Biomolecules, Quantitative Methods)",unveiling the molecular mechanism of sars-cov-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27,arXiv,,,,...,0.0,,,,,,,,0.0,
1,2005.13523,"(Signal Processing, Human-Computer Interaction...",emotion-robust eeg classification for motor im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23,arXiv,,,,...,0.0,,,,,,,,0.0,
2,2005.13519,"(Populations and Evolution, Physics and Society)",estimates of the proportion of sars-cov-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25,arXiv,,,,...,0.0,,,,,,,,0.0,
3,2005.13516,"(Populations and Evolution, Quantitative Methods)",a mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24,arXiv,,,,...,0.0,,,,,,,,0.0,
4,2005.13466,"(Social and Information Networks, Cryptography...",on the detection of disinformation campaign ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27,arXiv,,,,...,0.0,,,,,,,,0.0,


In [14]:
# Summarizing the cleaned dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40218 entries, 0 to 40217
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                31818 non-null  object        
 1   subject_areas     25949 non-null  object        
 2   title             40193 non-null  object        
 3   authors           39695 non-null  object        
 4   abstract          24447 non-null  object        
 5   publication_date  40218 non-null  datetime64[ns]
 6   data_source       40218 non-null  object        
 7   doi               38454 non-null  object        
 8   author_affil      38770 non-null  object        
 9   pubmed_id         25816 non-null  object        
 10  auth_keywords     15908 non-null  object        
 11  vehicle_name      33423 non-null  object        
 12  citation_num      40218 non-null  float64       
 13  language          24886 non-null  object        
 14  production_type   2502

## 3. Saving the dataframe

In [15]:
# Writing the prepared dataframe to a CSV file, quoting every field and omitting the index.
df_data.to_csv(
    path_or_buf="../../data/prepared/final_covid_19.csv",
    quoting=csv.QUOTE_ALL,
    index=False,
)