# Cleaning and Preprocessing the bioRxiv publications related to COVID-19

The data on publications related to COVID-19 were collected from the [bioRxiv API](https://api.biorxiv.org/covid19/help).

In [None]:
# Uncomment to install the library.
# %pip install pylatexenc

In [None]:
# Importing the required libraries.
import ast
import csv
import re

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [None]:
# Loading the raw bioRxiv records; the first row of the file holds the column names.
df_data = pd.read_csv(filepath_or_buffer="../../data/raw/biorxiv_raw.csv", header=0)

In [None]:
# Previewing the first five rows of the raw dataframe.
df_data.head(n=5)

In [None]:
# Printing a summary of the raw dataframe (output of DataFrame.info).
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Dropping the columns that are not kept in the prepared dataset.
unused_columns = ["rel_num_authors", "version", "license", "type"]
df_data.drop(columns=unused_columns, inplace=True)

In [None]:
# Mapping the raw bioRxiv column names to the names used in the prepared dataset.
columns = {
    "rel_title": "title",
    "rel_doi": "doi",
    "rel_link": "id",
    "rel_abs": "abstract",
    "rel_authors": "author_affil",
    "rel_date": "publication_date",
    "rel_site": "source",
    "category": "subject_areas",
}
df_data.rename(mapper=columns, axis="columns", inplace=True)

In [None]:
# Mapping NaN and the textual "none" markers to None in the whole dataframe.
missing_markers = dict.fromkeys([np.nan, "none", "none.", "None"])
df_data.replace(to_replace=missing_markers, inplace=True)

In [None]:
# Normalizing the feature "id": keeping only the text after the last "/" of the link.
# A value that is not a string (e.g. a missing link already mapped to None) becomes None
# instead of raising AttributeError on the string method.
df_data["id"] = df_data["id"].apply(
    lambda link: link.rsplit("/", 1)[-1] if isinstance(link, str) else None)

In [None]:
# Normalizing the features "title" and "abstract".
# The LaTeX converter is created once and reused for every value.
text_latex_converter = LatexNodes2Text()


def normalize_text(text):
    """Return the plain-text version of a title/abstract, or None when the value is empty."""
    if not text:
        return None
    # Rewriting LaTeX links "\href{url}{label}" as "label (url)".
    text = re.sub(r"\\href\{(.+)\}\{(.+)\}", r"\g<2> (\g<1>)", text)
    # NOTE(review): this pattern matches every single backslash, so no LaTeX command reaches
    # the converter and the optional backslash in the next pattern can never match. It may
    # have been meant to match only the LaTeX line break (two backslashes) -- confirm before
    # changing, since it affects the generated text.
    text = re.sub(r"\\", "\n", text)
    # Protecting the percent signs with a placeholder until the LaTeX conversion is done
    # (presumably so that the converter does not treat them as comment markers).
    text = re.sub(r"\\?%", "@PER@CENT@", text)
    # Collapsing whitespace runs into a single space and trimming both ends.
    text = re.sub(r"\s+", " ", text).strip()
    text = text_latex_converter.latex_to_text(text)
    # Removing the carets and restoring the percent signs.
    text = re.sub(r"\^", "", text)
    text = re.sub("@PER@CENT@", "%", text)
    # NOTE(review): removes the literal three characters "/r/"; possibly meant a carriage
    # return -- confirm against the raw data.
    return re.sub("/r/", "", text)


text_columns = ["title", "abstract"]
df_data.loc[:, text_columns] = df_data.loc[:, text_columns].apply(
    lambda column: column.apply(normalize_text))

In [None]:
# Normalizing the feature "subject_areas": each value is wrapped in a one-element tuple
# and empty values become None.
def as_single_tuple(area):
    return (area,) if area else None


df_data["subject_areas"] = df_data["subject_areas"].apply(as_single_tuple)

In [None]:
# Changing the type of feature "author_affil": the text stored in the CSV file is parsed
# back into Python objects; empty values become None.
# NOTE(review): "ast.literal_eval" replaces "eval" because this text comes from an external
# file and "eval" would execute any expression found in it. "literal_eval" only accepts
# Python literals, so a value containing anything else now raises ValueError/SyntaxError
# instead of being executed -- confirm that the raw column only holds literals.
df_data["author_affil"] = df_data["author_affil"].apply(
    lambda raw: ast.literal_eval(raw) if raw else None)

In [None]:
# Normalizing the feature "author_affil".
# The LaTeX converter is created once and reused for every author field.
author_latex_converter = LatexNodes2Text()


def clean_author_field(value):
    """Remove wrapping double quotes, convert LaTeX markup to text and collapse whitespace."""
    value = re.sub(r"^\"(.+)\"$", r"\g<1>", value)
    return re.sub(r"\s+", " ", author_latex_converter.latex_to_text(value))


def normalize_authors(authors):
    """Return the authors as a list of dicts with the keys "name" and "affiliation".

    An empty list becomes None. A value that is not a list/tuple (e.g. None for a
    missing value) is returned unchanged.
    """
    if not isinstance(authors, (list, tuple)):
        return authors
    if len(authors) == 0:
        return None
    normalized = []
    for author in authors:
        # Dropping a leading "- " from the author's name.
        name = re.sub(r"^-\s", "", author["author_name"])
        # Dropping numbering such as "1. " and the "Affiliation:" label (in any letter case).
        affiliation = re.sub(r"[0-9]+\.\s", "", author["author_inst"])
        affiliation = re.sub("Affiliation:", "", affiliation, flags=re.IGNORECASE)
        normalized.append({"name": clean_author_field(name),
                           "affiliation": clean_author_field(affiliation)})
    return normalized


# The whole column is assigned at once: the chained form "df_data.author_affil[mask] = ..."
# does not update the dataframe when pandas Copy-on-Write is enabled.
df_data["author_affil"] = df_data["author_affil"].apply(normalize_authors)

In [None]:
# Removing the invalid authors and affiliations.
invalid_authors = ["Revision Created", "Revision Converted", "Newly Submitted Revision",
                   "Final Decision"]


def remove_invalid_authors(authors):
    """Return the authors as a tuple without the entries listed in "invalid_authors".

    An affiliation that is empty or equal to "none" (ignoring letter case and dots) is set
    to None. An empty or missing value is returned unchanged.
    """
    if not isinstance(authors, (list, tuple)) or not authors:
        return authors
    valid = []
    for author in authors:
        if author["name"].strip() in invalid_authors:
            continue
        affiliation = author["affiliation"]
        if not affiliation or affiliation.lower().replace(".", "") == "none":
            author = {**author, "affiliation": None}
        valid.append(author)
    return tuple(valid)


# "Series.iteritems" no longer exists in pandas 2.0+, and assigning through
# "df_data.author_affil[idx] = ..." is a chained assignment that does not update the
# dataframe under Copy-on-Write, so the whole column is recomputed and assigned at once.
df_data["author_affil"] = df_data["author_affil"].apply(remove_invalid_authors)

In [None]:
# Creating the authors' and affiliations' IDs.
def hashed_id(value, source):
    """Return the hash of "<value> - <source>" as a string, or None when the value is empty."""
    # NOTE(review): the built-in hash() of a string is randomized per interpreter process
    # unless PYTHONHASHSEED is fixed, so these IDs are not reproducible between runs;
    # consider a "hashlib" digest if stable IDs are required.
    return str(hash("{} - {}".format(value, source))) if value else None


def add_author_ids(authors, source):
    """Return the authors as a tuple of dicts extended with "id", "affil_id" and "country".

    A value that is not a list/tuple (e.g. None for a missing value) is returned unchanged.
    """
    if not isinstance(authors, (list, tuple)):
        return authors
    return tuple(
        {"id": hashed_id(author["name"], source),
         "name": author["name"] or None,
         "affil_id": hashed_id(author["affiliation"], source),
         "affiliation": author["affiliation"] or None,
         "country": None}
        for author in authors)


# "Series.iteritems" no longer exists in pandas 2.0+, and "df_data.author_affil[mask] = ..."
# is a chained assignment that does not update the dataframe under Copy-on-Write, so the
# column is rebuilt row by row (paired with its "source") and assigned at once.
df_data["author_affil"] = pd.Series(
    [add_author_ids(authors, source)
     for authors, source in zip(df_data["author_affil"], df_data["source"])],
    index=df_data.index, dtype=object)

In [None]:
# Mapping NaN and the textual "none" markers to None again, after the normalization steps.
missing_markers = dict.fromkeys([np.nan, "none", "none.", "None"])
df_data.replace(to_replace=missing_markers, inplace=True)

In [None]:
# Removing the duplicated records by features "title" and "doi".
# The rows are sorted by "title" and "publication_date" first, so keeping the last row of
# each ("title", "doi") pair keeps the one that sorts last by "publication_date".
# "keep" is passed by keyword because it is keyword-only in pandas 2.0+.
df_data = df_data.sort_values(by=["title", "publication_date"]).drop_duplicates(
    subset=["title", "doi"], keep="last")

In [None]:
# Previewing the first five rows of the cleaned dataframe.
df_data.head(n=5)

In [None]:
# Printing a summary of the cleaned dataframe (output of DataFrame.info).
df_data.info()

## 3. Saving the dataframe

In [None]:
# Writing the prepared dataframe to a CSV file, without the index and with every field quoted.
df_data.to_csv(
    path_or_buf="../../data/prepared/biorxiv_covid_19.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)