# Cleaning and Preprocessing the arXiv publications related to COVID-19

The publications' data were collected from [arXiv webpage](https://arxiv.org/covid19search) related to COVID-19.

In [1]:
# Uncomment to install the library.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from pylatexenc.latex2text import LatexNodes2Text
from datetime import datetime

## 1. Generating the dataframe from the raw data

In [3]:
# Creating a dataframe from the raw data.
df_data = pd.read_csv("../../data/raw/arxiv_raw.csv", header=0)

In [4]:
# Checking the dataframe.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,date
0,arXiv:2009.11008,"['Image and Video Processing', 'Computer Visio...",Attention with Multiple Sources Knowledges for...,"['Duy M. H. Nguyen', 'Duy M. Nguyen', 'Huong V...","Until now, Coronavirus SARS-CoV-2 has caused m...","23 September, 2020; September 2020."
1,arXiv:2009.10931,"['Quantitative Methods', 'Machine Learning']",Drug Repurposing for COVID-19 using Graph Neur...,"['Kanglin Hsieh', 'Yinyin Wang', 'Luyao Chen',...",Amid the pandemic of 2019 novel coronavirus di...,"23 September, 2020; September 2020."
2,arXiv:2009.10808,"['Machine Learning', 'Applications']",Using Machine Learning to Develop a Novel COVI...,"['Anuj Tiwari', 'Arya V. Dadhania', 'Vijay Avi...",COVID19 is now one of the most leading causes ...,"22 September, 2020; September 2020."
3,arXiv:2009.10648,"['Social and Information Networks', 'Physics a...",Google COVID-19 community mobility reports: in...,"['Gabriela Cavalcante da Silvaa', 'Sabrina Oli...",Social distancing (SD) has been critical in th...,"17 September, 2020; September 2020."
4,arXiv:2009.10608,"['Image and Video Processing', 'Computer Visio...",Dual Encoder Fusion U-Net (DEFU-Net) for Cross...,"['Lipei Zhang', 'Aozhi Liu', 'Jing Xiao', 'Pau...",A number of methods based on the deep learning...,"11 September, 2020; September 2020."


In [5]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             2335 non-null   object
 1   subject_areas  2335 non-null   object
 2   title          2335 non-null   object
 3   authors        2335 non-null   object
 4   abstract       2335 non-null   object
 5   date           2335 non-null   object
dtypes: object(6)
memory usage: 109.6+ KB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None}, inplace=True)

In [7]:
# Normalizing the feature "id".
df_data.id = df_data.id.apply(lambda x: x.replace("arXiv:", "").strip())

In [8]:
# Normalizing the feature "subject_areas".
df_data.subject_areas = df_data.subject_areas.apply(lambda x: tuple(eval(x)))

In [9]:
# Normalizing the features "title" and "abstract".
df_data.loc[:, ["title", "abstract"]] = df_data.loc[:, ["title", "abstract"]].apply(
    lambda x: x.apply(lambda y: re.sub("/r/", "", re.sub("@PER@CENT@", "%", re.sub(r"[\^_]", "",
        LatexNodes2Text().latex_to_text(re.sub(r"\s+", " ",
            re.sub(r"\\?%", "@PER@CENT@", y))).strip())))))

In [10]:
# Normalizing the feature "authors".
df_data.authors = [tuple([{"id": str(hash("{} - {}".format(author, "arXiv"))), "name": author}
                    for author in eval(authors)]) for authors in df_data.authors]

In [11]:
# Normalizing the feature "date".
df_data.date = df_data.date.apply(lambda x: re.sub(r"\s+", " ", x.split(".")[0]))
df_data.date = df_data.date.apply(lambda x: x.replace("submitted ", ""))

In [12]:
# Creating the feature "publication_date" from the feature "date".
df_data["publication_date"] = df_data.date.apply(
    lambda x: datetime.strptime(x.split(";")[0].strip(), "%d %B, %Y").date())

In [13]:
# Removing unnecessary columns.
df_data.drop(axis=1, columns="date", inplace=True)

In [14]:
# Checking the result.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2009.11008,"(Image and Video Processing, Computer Vision a...",Attention with Multiple Sources Knowledges for...,"({'id': '-3656862960144035448', 'name': 'Duy M...","Until now, Coronavirus SARS-CoV-2 has caused m...",2020-09-23
1,2009.10931,"(Quantitative Methods, Machine Learning)",Drug Repurposing for COVID-19 using Graph Neur...,"({'id': '-4571039949857585287', 'name': 'Kangl...",Amid the pandemic of 2019 novel coronavirus di...,2020-09-23
2,2009.10808,"(Machine Learning, Applications)",Using Machine Learning to Develop a Novel COVI...,"({'id': '-1643832521739170778', 'name': 'Anuj ...",COVID19 is now one of the most leading causes ...,2020-09-22
3,2009.10648,"(Social and Information Networks, Physics and ...",Google COVID-19 community mobility reports: in...,"({'id': '1593276023866582611', 'name': 'Gabrie...",Social distancing (SD) has been critical in th...,2020-09-17
4,2009.10608,"(Image and Video Processing, Computer Vision a...",Dual Encoder Fusion U-Net (DEFU-Net) for Cross...,"({'id': '-4095306500263987581', 'name': 'Lipei...",A number of methods based on the deep learning...,2020-09-11


In [15]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2335 non-null   object
 1   subject_areas     2335 non-null   object
 2   title             2335 non-null   object
 3   authors           2335 non-null   object
 4   abstract          2335 non-null   object
 5   publication_date  2335 non-null   object
dtypes: object(6)
memory usage: 109.6+ KB


## 3. Saving the dataframe

In [16]:
# Exporting the data to CSV file.
df_data.to_csv("../../data/prepared/arxiv_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)