# Cleaning and Preprocessing the arXiv publications related to COVID-19

The publication data were collected from the [arXiv COVID-19 search page](https://arxiv.org/covid19search).

In [None]:
# Uncomment to install the library.
# %pip install pylatexenc

In [None]:
# Importing the required libraries.
import ast
import csv
import hashlib
import re
from datetime import datetime

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [None]:
# Loading the raw arXiv CSV into a dataframe; the first row holds the column names.
df_data = pd.read_csv(
    "../../data/raw/arxiv_raw.csv",
    header=0,
)

In [None]:
# Checking the dataframe: display its first rows (pandas shows 5 by default).
df_data.head()

In [None]:
# Printing a summary of the dataframe (columns, non-null counts and dtypes).
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Missing values: swap every NaN in the dataframe for None, modifying it in place.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [None]:
# Normalizing the feature "id".
def _strip_arxiv_prefix(raw_id):
    """Return the identifier without any "arXiv:" marker and surrounding whitespace."""
    without_marker = raw_id.replace("arXiv:", "")
    return without_marker.strip()


df_data["id"] = df_data["id"].apply(_strip_arxiv_prefix)

In [None]:
# Normalizing the feature "subject_areas".
# Each cell holds the text of a Python literal (a sequence of subject areas).
# ast.literal_eval parses literals only, so nothing stored in the CSV can be
# executed as code (the previous eval() would run any expression it was given).
# A cell that is not a valid literal raises ValueError/SyntaxError.
df_data.subject_areas = df_data.subject_areas.apply(
    lambda x: tuple(ast.literal_eval(x)))

In [None]:
# Normalizing the features "title" and "abstract".
# One converter is built once and shared by every value, instead of
# constructing a new LatexNodes2Text for each cell.
_latex_converter = LatexNodes2Text()


def _clean_latex_text(text):
    """Return `text` converted from LaTeX markup to single-spaced plain text.

    Steps, in order: hide every "%" behind a placeholder, collapse whitespace
    runs to one space, convert LaTeX to text and strip the ends, drop "^" and
    "_" characters, restore "%", and finally remove any literal "/r/".

    Missing values (None, as produced by the NaN -> None replacement earlier
    in this notebook) are returned unchanged instead of raising TypeError.
    """
    if text is None:
        return None
    # Both "%" and "\%" become the placeholder; presumably this keeps the
    # converter from treating "%" as the start of a LaTeX comment.
    protected = re.sub(r"\\?%", "@PER@CENT@", text)
    single_spaced = re.sub(r"\s+", " ", protected)
    plain = _latex_converter.latex_to_text(single_spaced).strip()
    plain = re.sub(r"[\^_]", "", plain)
    plain = re.sub("@PER@CENT@", "%", plain)
    return re.sub("/r/", "", plain)


for _column in ("title", "abstract"):
    df_data[_column] = df_data[_column].apply(_clean_latex_text)

In [None]:
# Normalizing the feature "authors".
def _author_id(name, source="arXiv"):
    """Return a reproducible id (as text) for the author `name` within `source`.

    The built-in hash() of a str is salted per interpreter process, so ids
    derived from it differ between runs of this notebook. A SHA-256 digest of
    the same "<name> - <source>" key is stable across runs and machines. The
    first 8 digest bytes are read as a signed integer so the id keeps the same
    textual shape (a possibly negative decimal number) as before.
    """
    key = "{} - {}".format(name, source)
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    return str(int.from_bytes(digest[:8], "big", signed=True))


# Each cell holds the text of a Python literal listing the author names;
# ast.literal_eval parses it without executing arbitrary code from the CSV.
df_data.authors = [
    tuple({"id": _author_id(author), "name": author}
          for author in ast.literal_eval(authors))
    for authors in df_data.authors
]

In [None]:
# Normalizing the feature "date".
def _tidy_date_text(raw_date):
    """Keep the text before the first ".", single-space it, and drop "submitted "."""
    before_period = raw_date.split(".")[0]
    single_spaced = re.sub(r"\s+", " ", before_period)
    return single_spaced.replace("submitted ", "")


df_data["date"] = df_data["date"].apply(_tidy_date_text)

In [None]:
# Creating the feature "publication_date" from the feature "date".
def _parse_first_date(date_text):
    """Parse the part of `date_text` before the first ";" as a "%d %B, %Y" date."""
    leading_part = date_text.split(";")[0].strip()
    parsed = datetime.strptime(leading_part, "%d %B, %Y")
    return parsed.date()


df_data["publication_date"] = df_data["date"].apply(_parse_first_date)

In [None]:
# Dropping the intermediate "date" column now that "publication_date" exists.
df_data.drop(columns=["date"], inplace=True)

In [None]:
# Checking the result: display the first rows of the cleaned dataframe.
df_data.head()

In [None]:
# Printing a summary of the cleaned dataframe (columns, non-null counts and dtypes).
df_data.info()

## 3. Saving the dataframe

In [None]:
# Writing the prepared dataframe to CSV: no index column, every field quoted.
df_data.to_csv(
    "../../data/prepared/arxiv_covid_19.csv",
    quoting=csv.QUOTE_ALL,
    index=False,
)