# Cleaning and Preprocessing the final dataset of publications related to COVID-19

In [1]:
# Importing the required libraries.
import csv, pandas as pd, numpy as np

## 1. Generating the dataframe from the raw data

In [2]:
# Load the raw publications file; both identifier columns are read as strings.
df_data = pd.read_csv(
    "../../data/raw/final_raw.csv",
    header=0,
    dtype={"id": "str", "pubmed_id": "str"},
)

In [3]:
# Preview the first rows of the freshly loaded dataframe.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2009.11008,"('Image and Video Processing', 'Computer Visio...",attention with multiple sources knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23,arXiv,,,,...,,,,,,,,,,
1,2009.10931,"('Quantitative Methods', 'Machine Learning')",drug repurposing for covid-19 using graph neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23,arXiv,,,,...,,,,,,,,,,
2,2009.10808,"('Machine Learning', 'Applications')",using machine learning to develop a novel covi...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22,arXiv,,,,...,,,,,,,,,,
3,2009.10648,"('Social and Information Networks', 'Physics a...",google covid-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17,arXiv,,,,...,,,,,,,,,,
4,2009.10608,"('Image and Video Processing', 'Computer Visio...",dual encoder fusion u-net (defu-net) for cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11,arXiv,,,,...,,,,,,,,,,


In [4]:
# Summarize the raw dataframe: number of entries, columns, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77129 entries, 0 to 77128
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                63461 non-null  object 
 1   subject_areas     64357 non-null  object 
 2   title             77110 non-null  object 
 3   authors           54375 non-null  object 
 4   abstract          50038 non-null  object 
 5   publication_date  77129 non-null  object 
 6   data_source       77129 non-null  object 
 7   doi               73284 non-null  object 
 8   author_affil      74274 non-null  object 
 9   pubmed_id         57587 non-null  object 
 10  auth_keywords     36085 non-null  object 
 11  vehicle_name      66916 non-null  object 
 12  citation_num      53266 non-null  float64
 13  language          52982 non-null  object 
 14  production_type   53266 non-null  object 
 15  source_type       53266 non-null  object 
 16  index_terms       23975 non-null  object

## 2. Cleaning and preprocessing the dataframe

In [5]:
# Helper that turns an empty compound value into a missing value.
def normalize(tuple_obj):
    """Return ``tuple_obj`` unchanged, or ``None`` when it contains no elements."""
    return tuple_obj if len(tuple_obj) else None

In [6]:
# Use None instead of NaN as the missing-value marker in the whole dataframe.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [7]:
# Changing the type of features.
# The compound features arrive from the CSV as the text of Python literals (see the
# quoted tuples in the preview of the raw data), so each non-empty cell is evaluated
# back into an object; empty/None cells stay None.
# NOTE(review): eval() executes whatever expression a cell contains, which is only
# acceptable while the raw file is trusted. ast.literal_eval looks like a safer
# drop-in if every value is a plain literal -- TODO confirm before switching.
compound_features = ["auth_keywords", "index_terms", "affiliations", "subject_areas",
                     "authors", "author_affil", "references"]
df_data.loc[:, compound_features] = df_data.loc[:, compound_features].apply(
    lambda column: column.apply(lambda value: eval(value) if value else None))

# Parse the publication dates so they can be filtered by year later on.
df_data.publication_date = pd.to_datetime(df_data.publication_date)

In [8]:
# Keep only the articles published in 2019 or later.
# The explicit copy makes the filtered frame independent of the original one, so the
# in-place assignments in the following cells act on df_data itself and not on a
# slice (which triggers SettingWithCopyWarning and has ambiguous semantics).
df_data = df_data[pd.DatetimeIndex(df_data.publication_date).year >= 2019].copy()

In [9]:
# Articles without a number of citations or references are given the value zero.
# A single .loc[rows, column] assignment writes to df_data directly; the chained form
# (df_data.column.loc[rows] = ...) assigns through an intermediate Series and is not
# guaranteed to update the dataframe.
df_data.loc[df_data.citation_num.isnull(), "citation_num"] = 0
df_data.loc[df_data.ref_count.isnull(), "ref_count"] = 0

In [10]:
# Rebuild the author tuple (id and name only) from "author_affil" for the rows that
# have no "authors" value; entries with an empty name are skipped.
missing_authors = df_data.authors.isnull() & df_data.author_affil.notnull()
df_data.loc[missing_authors, "authors"] = df_data.loc[missing_authors, "author_affil"].apply(
    lambda entries: normalize(tuple(
        {"id": entry["id"], "name": entry["name"]}
        for entry in entries if entry["name"])))

In [11]:
# Count the rows whose "authors" value is an empty tuple (expected: none).
df_data.loc[df_data["authors"] == (), "authors"].size

0

In [12]:
# Rebuild the affiliation tuple from "author_affil" for the rows that have no
# "affiliations" value; entries lacking both affiliation and country are skipped.
missing_affiliations = df_data.affiliations.isnull() & df_data.author_affil.notnull()
df_data.loc[missing_affiliations, "affiliations"] = df_data.loc[missing_affiliations, "author_affil"].apply(
    lambda entries: normalize(tuple(
        {"id": entry["affil_id"], "affiliation": entry["affiliation"], "country": entry["country"]}
        for entry in entries if entry["affiliation"] or entry["country"])))

In [13]:
# Count the rows whose "affiliations" value is an empty tuple (expected: none).
df_data.loc[df_data["affiliations"] == (), "affiliations"].size

0

In [14]:
# Rebuild "author_affil" from "authors" for the rows where it is missing: every named
# author is kept and the affiliation fields are set to None.
missing_author_affil = df_data.author_affil.isnull() & df_data.authors.notnull()
df_data.loc[missing_author_affil, "author_affil"] = df_data.loc[missing_author_affil, "authors"].apply(
    lambda entries: normalize(tuple(
        {**entry, "affil_id": None, "affiliation": None, "country": None}
        for entry in entries if entry["name"])))

In [15]:
# Count the rows whose "author_affil" value is an empty tuple (expected: none).
df_data.loc[df_data["author_affil"] == (), "author_affil"].size

0

In [16]:
# Use the PubMed identifier as the record "id" wherever "id" is missing.
id_from_pubmed = df_data.pubmed_id.notnull() & df_data.id.isnull()
df_data.loc[id_from_pubmed, "id"] = df_data.loc[id_from_pubmed, "pubmed_id"]

In [17]:
# Replace NaN with None once more after the cleaning steps
# (presumably because the assignments above can reintroduce NaN -- TODO confirm).
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [18]:
# Preview the first rows of the dataframe after cleaning and preprocessing.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,citation_num,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references
0,2009.11008,"(Image and Video Processing, Computer Vision a...",attention with multiple sources knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23,arXiv,,"({'id': '-3656862960144035448', 'name': 'Duy M...",,...,0.0,,,,,,,,0.0,
1,2009.10931,"(Quantitative Methods, Machine Learning)",drug repurposing for covid-19 using graph neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23,arXiv,,"({'id': '-4571039949857585287', 'name': 'Kangl...",,...,0.0,,,,,,,,0.0,
2,2009.10808,"(Machine Learning, Applications)",using machine learning to develop a novel covi...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22,arXiv,,"({'id': '-1643832521739170778', 'name': 'Anuj ...",,...,0.0,,,,,,,,0.0,
3,2009.10648,"(Social and Information Networks, Physics and ...",google covid-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17,arXiv,,"({'id': '1593276023866582611', 'name': 'Gabrie...",,...,0.0,,,,,,,,0.0,
4,2009.10608,"(Image and Video Processing, Computer Vision a...",dual encoder fusion u-net (defu-net) for cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11,arXiv,,"({'id': '-4095306500263987581', 'name': 'Lipei...",,...,0.0,,,,,,,,0.0,


In [19]:
# Summarize the cleaned dataframe: number of entries, columns, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77119 entries, 0 to 77128
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                77119 non-null  object        
 1   subject_areas     64355 non-null  object        
 2   title             77100 non-null  object        
 3   authors           76191 non-null  object        
 4   abstract          50028 non-null  object        
 5   publication_date  77119 non-null  datetime64[ns]
 6   data_source       77119 non-null  object        
 7   doi               73276 non-null  object        
 8   author_affil      76227 non-null  object        
 9   pubmed_id         57579 non-null  object        
 10  auth_keywords     36078 non-null  object        
 11  vehicle_name      66908 non-null  object        
 12  citation_num      77119 non-null  float64       
 13  language          52982 non-null  object        
 14  production_type   5326

## 3. Saving the dataframe

In [20]:
# Write the prepared dataset to a CSV file, without the index and with every field quoted.
df_data.to_csv(
    "../../data/prepared/final_covid_19.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)