# Cleaning and Preprocessing the arXiv publications related to COVID-19

The publication data related to COVID-19 were collected from the [arXiv COVID-19 search page](https://arxiv.org/covid19search).

In [1]:
# Uncomment the line below to install pylatexenc, the library that provides the
# LatexNodes2Text converter used when normalizing the titles and abstracts.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import ast
import csv
import hashlib
import re
from datetime import datetime

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Load the raw arXiv export into a dataframe; row 0 of the file supplies the column names.
raw_csv_path = "../../data/raw/arxiv_raw.csv"
df_data = pd.read_csv(raw_csv_path, header=0)

In [4]:
# Preview the first rows of the raw dataframe to confirm that it loaded as expected.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,date
0,arXiv:2103.03219,['General Finance'],The Impact of COVID-19 on Stock Market Volatil...,"['Ateeb Akhter Shah Syed', 'Kaneez Fatima']",This paper examines the impact of coronavirus ...,"11 February, 2021; March 2021."
1,arXiv:2103.03055,"['Image and Video Processing', 'Computer Visio...",Self-supervised deep convolutional neural netw...,"['Matej Gazda', 'Jakub Gazda', 'Jan Plavka', '...","Chest radiography is a relatively cheap, widel...","4 March, 2021; March 2021."
2,arXiv:2103.03038,['Computer Vision and Pattern Recognition'],Mobile Touchless Fingerprint Recognition: Impl...,"['Jannis Priesnitz', 'Rolf Huesmann', 'Christi...",This work presents an automated touchless fing...,"4 March, 2021; March 2021."
3,arXiv:2103.02961,"['Image and Video Processing', 'Computer Visio...",Probabilistic combination of eigenlungs-based ...,"['Juan E. Arco', 'Andrés Ortiz', 'Javier Ramír...",The outbreak of the COVID-19 (Coronavirus dise...,"4 March, 2021; March 2021."
4,arXiv:2103.02917,"['Computers and Society', 'Computation and Lan...",MP Twitter Engagement and Abuse Post-first COV...,"['Tracie Farrell', 'Mehmet Bakir', 'Kalina Bon...",The UK has had a volatile political environmen...,"4 March, 2021; March 2021."


In [5]:
# Summarize the raw dataframe: column names, non-null counts, dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3640 entries, 0 to 3639
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             3640 non-null   object
 1   subject_areas  3640 non-null   object
 2   title          3640 non-null   object
 3   authors        3640 non-null   object
 4   abstract       3640 non-null   object
 5   date           3640 non-null   object
dtypes: object(6)
memory usage: 170.8+ KB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Represent every missing value ("NaN") as Python's None.
df_data = df_data.replace({np.nan: None})

In [7]:
# Normalizing the feature "id": remove the "arXiv:" marker and surrounding whitespace,
# e.g. "arXiv:2103.03219" -> "2103.03219".
def _bare_arxiv_id(raw_id):
    """Return the identifier with every "arXiv:" occurrence removed and the ends stripped."""
    without_marker = raw_id.replace("arXiv:", "")
    return without_marker.strip()


df_data["id"] = df_data["id"].map(_bare_arxiv_id)

In [8]:
# Normalizing the feature "subject_areas": each raw cell holds the textual form of a Python
# list of strings (e.g. "['General Finance']"), which is converted into a tuple of names.
# ast.literal_eval only accepts Python literals, so - unlike eval() - it cannot execute
# code that happens to be embedded in the CSV file.
df_data.subject_areas = df_data.subject_areas.apply(lambda x: tuple(ast.literal_eval(x)))

In [9]:
# Normalizing the features "title" and "abstract": convert LaTeX markup to plain text.
# The converter is created once and reused for every value instead of being rebuilt per cell.
_latex_to_text = LatexNodes2Text().latex_to_text


def _normalize_latex_text(text):
    """Return ``text`` converted from LaTeX to plain, single-spaced text.

    Steps, in order:
      1. Swap every "%" (optionally preceded by a backslash) for a placeholder token;
         presumably this keeps the converter from treating "%" as the start of a
         LaTeX comment.
      2. Collapse each run of whitespace into a single space.
      3. Convert the LaTeX markup to text and strip leading/trailing whitespace.
      4. Remove the remaining "^" and "_" characters.
      5. Restore the "%" characters from the placeholder.
      6. Remove every literal "/r/" sequence.
    """
    protected = re.sub(r"\\?%", "@PER@CENT@", text)
    single_spaced = re.sub(r"\s+", " ", protected)
    plain = _latex_to_text(single_spaced).strip()
    plain = re.sub(r"[\^_]", "", plain)
    plain = re.sub("@PER@CENT@", "%", plain)
    return re.sub("/r/", "", plain)


_text_columns = ["title", "abstract"]
df_data.loc[:, _text_columns] = df_data.loc[:, _text_columns].apply(
    lambda column: column.map(_normalize_latex_text))

In [10]:
# Normalizing the feature "authors": each raw cell holds the textual form of a Python list
# of author names, which becomes a tuple of {"id": ..., "name": ...} dictionaries.
def _stable_author_id(name, source="arXiv"):
    """Return a deterministic id (signed 64-bit integer, as a string) for an author.

    The built-in hash() is salted per interpreter process for strings, so ids derived
    from it change every time the kernel is restarted. The id is therefore taken from
    the first 8 bytes of the SHA-256 digest of "<name> - <source>", which keeps the
    same value range as hash() while being reproducible across runs and machines.
    """
    key = "{} - {}".format(name, source)
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    return str(int.from_bytes(digest[:8], byteorder="big", signed=True))


# ast.literal_eval parses the list literal without executing arbitrary code from the CSV.
df_data.authors = [
    tuple([{"id": _stable_author_id(author), "name": author}
           for author in ast.literal_eval(authors)])
    for authors in df_data.authors
]

In [11]:
# Normalizing the feature "date": keep only the text before the first ".", collapse
# whitespace runs into single spaces and remove every "submitted " marker.
def _tidy_date(raw_date):
    """Return the cleaned date text, e.g. "11 February, 2021; March 2021"."""
    before_period = raw_date.split(".")[0]
    single_spaced = re.sub(r"\s+", " ", before_period)
    return single_spaced.replace("submitted ", "")


df_data["date"] = df_data["date"].map(_tidy_date)

In [12]:
# Creating the feature "publication_date" from the feature "date": the part before the
# first ";" (e.g. "11 February, 2021") is parsed into a datetime.date object.
def _first_listed_date(date_text):
    """Parse the leading "<day> <month name>, <year>" portion of ``date_text``."""
    leading_part = date_text.split(";")[0].strip()
    parsed = datetime.strptime(leading_part, "%d %B, %Y")
    return parsed.date()


df_data["publication_date"] = df_data["date"].map(_first_listed_date)

In [13]:
# The raw "date" column is no longer needed now that "publication_date" exists.
df_data = df_data.drop(columns="date")

In [14]:
# Preview the first rows of the cleaned dataframe to verify the normalized columns.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date
0,2103.03219,"(General Finance,)",The Impact of COVID-19 on Stock Market Volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11
1,2103.03055,"(Image and Video Processing, Computer Vision a...",Self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04
2,2103.03038,"(Computer Vision and Pattern Recognition,)",Mobile Touchless Fingerprint Recognition: Impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04
3,2103.02961,"(Image and Video Processing, Computer Vision a...",Probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04
4,2103.02917,"(Computers and Society, Computation and Language)",MP Twitter Engagement and Abuse Post-first COV...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04


In [15]:
# Summarize the cleaned dataframe: column names, non-null counts, dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3640 entries, 0 to 3639
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                3640 non-null   object
 1   subject_areas     3640 non-null   object
 2   title             3640 non-null   object
 3   authors           3640 non-null   object
 4   abstract          3640 non-null   object
 5   publication_date  3640 non-null   object
dtypes: object(6)
memory usage: 170.8+ KB


## 3. Saving the dataframe

In [16]:
# Write the prepared dataframe to a CSV file, quoting every field and omitting the index.
prepared_csv_path = "../../data/prepared/arxiv_covid_19.csv"
df_data.to_csv(prepared_csv_path, quoting=csv.QUOTE_ALL, index=False)