# Cleaning and Preprocessing the arXiv publications related to COVID-19

The data on COVID-19-related publications were collected from the [arXiv COVID-19 search page](https://arxiv.org/covid19search).

In [1]:
# Uncomment the next line to install pylatexenc (provides LatexNodes2Text, imported below).
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import ast
import csv
import re
from datetime import datetime

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Load the raw arXiv export into a dataframe; the first CSV row supplies the column names.
df_data = pd.read_csv(filepath_or_buffer="../../data/raw/arxiv_raw.csv", header=0)

In [4]:
# Preview the first five rows of the raw dataframe (head() defaults to five rows).
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,date
0,arXiv:2005.13653,"['Biomolecules', 'Quantitative Methods']",Unveiling the molecular mechanism of SARS-CoV-...,"['Duc D Nguyen', 'Kaifu Gao', 'Jiahui Chen', '...","Currently, there is no effective antiviral dru...","27 May, 2020; May 2020."
1,arXiv:2005.13523,"['Signal Processing', 'Human-Computer Interact...",Emotion-robust EEG Classification for Motor Im...,['Abdul Moeed'],Developments in Brain Computer Interfaces (BCI...,"23 May, 2020; May 2020."
2,arXiv:2005.13519,"['Populations and Evolution', 'Physics and Soc...",Estimates of the proportion of SARS-CoV-2 infe...,"['Henrik Hult', 'Martina Favero']",In this paper a Bayesian SEIR model is studied...,"25 May, 2020; May 2020. ..."
3,arXiv:2005.13516,"['Populations and Evolution', 'Quantitative Me...",A mathematical epidemic model using genetic fi...,"['Mohamed Taha Rouabah', 'Abdellah Tounsi', 'N...",A compartmental epidemic model based on geneti...,"24 June, 2020; submitted 26 May, 2020; ..."
4,arXiv:2005.13466,"['Social and Information Networks', 'Cryptogra...",On the Detection of Disinformation Campaign Ac...,"['Luis Vargas', 'Patrick Emami', 'Patrick Tray...",Online manipulation of information has become ...,"27 May, 2020; May 2020."


In [5]:
# Summarize the raw dataframe: column names, non-null counts, dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1000 non-null   object
 1   subject_areas  1000 non-null   object
 2   title          1000 non-null   object
 3   authors        1000 non-null   object
 4   abstract       1000 non-null   object
 5   date           1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Swap every missing value (NaN) for Python's None, modifying the dataframe in place.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [7]:
# Normalizing the feature "id".
def _strip_arxiv_prefix(raw_id):
    # Remove every "arXiv:" occurrence, then trim surrounding whitespace.
    return raw_id.replace("arXiv:", "").strip()


df_data["id"] = df_data["id"].apply(_strip_arxiv_prefix)

In [8]:
# Normalizing the feature "subject_areas".
# Each raw cell is the text form of a Python list literal,
# e.g. "['Biomolecules', 'Quantitative Methods']".
# It is parsed with ast.literal_eval, which accepts literals only, rather than eval,
# which would execute any expression embedded in the CSV. The list is then frozen
# into a tuple.
df_data.subject_areas = df_data.subject_areas.apply(lambda x: tuple(ast.literal_eval(x)))

In [9]:
# Normalizing the features "title" and "abstract".
PERCENT_PLACEHOLDER = "@PER@CENT@"

# Built once and reused for every value instead of once per cell value.
latex_converter = LatexNodes2Text()


def clean_latex_text(text):
    """Convert one LaTeX-flavoured string to plain text.

    Steps, applied in this order:
      1. replace each percent sign (with or without a leading backslash) by a
         placeholder token;
      2. collapse every whitespace run into a single space;
      3. convert LaTeX markup to text with LatexNodes2Text and strip the ends;
      4. delete the characters "^" and "_";
      5. turn the placeholder token back into "%";
      6. delete every literal "/r/".
    """
    # In LaTeX a bare "%" starts a comment; the placeholder presumably keeps the
    # converter from discarding the rest of the text -- TODO confirm.
    protected = re.sub(r"\\?%", PERCENT_PLACEHOLDER, text)
    single_spaced = re.sub(r"\s+", " ", protected)
    plain = latex_converter.latex_to_text(single_spaced).strip()
    plain = re.sub(r"[\^_]", "", plain)
    plain = re.sub(PERCENT_PLACEHOLDER, "%", plain)
    # NOTE(review): this removes the literal three characters "/r/"; it may have
    # been meant to remove carriage returns -- confirm against the raw data.
    return re.sub("/r/", "", plain)


text_columns = ["title", "abstract"]
df_data.loc[:, text_columns] = df_data.loc[:, text_columns].apply(
    lambda column: column.apply(clean_latex_text))

In [10]:
# Normalizing the feature "authors".
# Each raw cell is the text form of a Python list of author names,
# e.g. "['Henrik Hult', 'Martina Favero']".
# It is parsed with ast.literal_eval (literals only) rather than eval, which would
# execute any expression embedded in the CSV. Every name is wrapped in a
# {"name": ...} record and the records of one publication are stored as a tuple.
df_data.authors = [
    tuple({"name": author} for author in ast.literal_eval(authors))
    for authors in df_data.authors
]

In [11]:
# Normalizing the feature "date".
def _normalize_date_text(raw_date):
    # Keep only the text before the first period, squeeze whitespace runs into
    # single spaces, then remove every "submitted " marker.
    before_period = raw_date.split(".")[0]
    single_spaced = re.sub(r"\s+", " ", before_period)
    return single_spaced.replace("submitted ", "")


df_data.date = df_data.date.apply(_normalize_date_text)

In [12]:
# Creating the feature "publication_date" from the feature "date".
def _first_listed_date(date_text):
    # Only the entry before the first ";" is used; it must look like "27 May, 2020".
    # NOTE(review): when a row lists several dates, the first one is kept --
    # confirm that this is the date intended as the publication date.
    first_entry = date_text.split(";")[0].strip()
    parsed = datetime.strptime(first_entry, "%d %B, %Y")
    return parsed.date()


df_data["publication_date"] = df_data.date.apply(_first_listed_date)

In [13]:
# Discard the "date" column in place now that "publication_date" has been derived from it.
df_data.drop(columns=["date"], inplace=True)

In [14]:
# Preview the first five rows after the cleaning steps above.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2005.13653,"(Biomolecules, Quantitative Methods)",Unveiling the molecular mechanism of SARS-CoV-...,"({'name': 'Duc D Nguyen'}, {'name': 'Kaifu Gao...","Currently, there is no effective antiviral dru...",2020-05-27
1,2005.13523,"(Signal Processing, Human-Computer Interaction...",Emotion-robust EEG Classification for Motor Im...,"({'name': 'Abdul Moeed'},)",Developments in Brain Computer Interfaces (BCI...,2020-05-23
2,2005.13519,"(Populations and Evolution, Physics and Society)",Estimates of the proportion of SARS-CoV-2 infe...,"({'name': 'Henrik Hult'}, {'name': 'Martina Fa...",In this paper a Bayesian SEIR model is studied...,2020-05-25
3,2005.13516,"(Populations and Evolution, Quantitative Methods)",A mathematical epidemic model using genetic fi...,"({'name': 'Mohamed Taha Rouabah'}, {'name': 'A...",A compartmental epidemic model based on geneti...,2020-06-24
4,2005.13466,"(Social and Information Networks, Cryptography...",On the Detection of Disinformation Campaign Ac...,"({'name': 'Luis Vargas'}, {'name': 'Patrick Em...",Online manipulation of information has become ...,2020-05-27


In [15]:
# Summarize the cleaned dataframe: column names, non-null counts, dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                1000 non-null   object
 1   subject_areas     1000 non-null   object
 2   title             1000 non-null   object
 3   authors           1000 non-null   object
 4   abstract          1000 non-null   object
 5   publication_date  1000 non-null   object
dtypes: object(6)
memory usage: 47.0+ KB


## 3. Saving the dataframe

In [16]:
# Write the cleaned dataframe to CSV: no index column, every field wrapped in quotes.
df_data.to_csv(
    path_or_buf="../../data/prepared/arxiv_covid_19.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)