# Cleaning and Preprocessing the bioRxiv publications related to COVID-19

The publication data were collected from the [bioRxiv webpage](https://connect.biorxiv.org/relate/content/181) that lists preprints related to COVID-19.

In [1]:
# Uncomment to install pylatexenc, which provides the LatexNodes2Text converter imported below.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import ast
import csv
import re

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Load the raw publication records; the first CSV row supplies the column names.
raw_csv_path = "../../data/raw/biorxiv_raw.csv"
df_data = pd.read_csv(raw_csv_path, header=0)

In [4]:
# Preview the first five raw records.
df_data.head(n=5)

Unnamed: 0,rel_title,rel_doi,rel_link,rel_abs,rel_num_authors,rel_authors,rel_date,rel_site
0,"Characteristics, outcome and predictors of in-...",10.1101/2020.06.30.20143701,http://medrxiv.org/cgi/content/short/2020.06.3...,"Since December 2019, coronavirus disease 2019 ...",23,"[{'author_name': 'ENRICO MARIA TRECARICHI', 'a...",2020-07-02,medrxiv
1,A network-informed analysis of SARS-CoV-2 and ...,10.1101/2020.07.01.20144121,http://medrxiv.org/cgi/content/short/2020.07.0...,Abnormal coagulation and an increased risk of ...,13,"[{'author_name': 'Jun Ding', 'author_inst': 'C...",2020-07-02,medrxiv
2,ROX Index Predicts Intubation in Patients with...,10.1101/2020.06.30.20143867,http://medrxiv.org/cgi/content/short/2020.06.3...,Introduction Use of high flow nasal therapy (H...,16,"[{'author_name': 'Maulin Patel', 'author_inst'...",2020-07-02,medrxiv
3,COVID-MATCH65 - A prospectively derived clinic...,10.1101/2020.06.30.20143818,http://medrxiv.org/cgi/content/short/2020.06.3...,Due to the ongoing COVID-19 pandemic and incre...,17,"[{'author_name': 'Jason A Trubiano', 'author_i...",2020-07-02,medrxiv
4,Relative COVID-19 viral persistence and antibo...,10.1101/2020.07.01.20143917,http://medrxiv.org/cgi/content/short/2020.07.0...,Importance: The COVID-19 antibody response is ...,9,"[{'author_name': 'Chung-Guei Huang', 'author_i...",2020-07-02,medrxiv


In [5]:
# Summarize the raw dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6020 entries, 0 to 6019
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rel_title        6020 non-null   object
 1   rel_doi          6020 non-null   object
 2   rel_link         6020 non-null   object
 3   rel_abs          6020 non-null   object
 4   rel_num_authors  6020 non-null   int64 
 5   rel_authors      6020 non-null   object
 6   rel_date         6020 non-null   object
 7   rel_site         6020 non-null   object
dtypes: int64(1), object(7)
memory usage: 376.4+ KB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Discard the author-count column; it is not kept in the prepared dataset.
df_data.drop(columns=["rel_num_authors"], inplace=True)

In [7]:
# Map the raw "rel_*" column names onto the names used in the prepared dataset.
columns = {
    "rel_title": "title",
    "rel_doi": "doi",
    "rel_link": "id",
    "rel_abs": "abstract",
    "rel_authors": "author_affil",
    "rel_date": "publication_date",
    "rel_site": "source",
}
df_data.rename(columns=columns, inplace=True)

In [8]:
# Keep only the last "/"-separated segment of each link as the record "id".
df_data["id"] = df_data["id"].map(lambda link: link.rsplit("/", 1)[-1])

In [9]:
# Normalizing the features "title" and "abstract".
_text_converter = LatexNodes2Text()  # built once and reused for every value


def _normalize_text(text):
    """Convert the LaTeX markup of a title/abstract string into plain text.

    Non-string values (e.g. missing data) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Rewrite \href{url}{label} as "label \url{url}". The replacement must be a raw
    # string: written as a normal string, "\\url" reaches `re` as "\u", which
    # Python 3.7+ rejects with "bad escape \u".
    text = re.sub(r"\\href\{(.+)\}\{(.+)\}", r"\g<2> \\url{\g<1>}", text)
    # Hide percent signs (optionally backslash-escaped) behind a placeholder until
    # after the LaTeX conversion - presumably so they are not read as LaTeX comments.
    text = re.sub(r"\\?%", "@PER@CENT@", text)
    # Collapse runs of whitespace and trim the ends before converting.
    text = re.sub(r"\s+", " ", text).strip()
    text = _text_converter.latex_to_text(text)
    # Drop leftover "^" characters, restore "%" and remove the literal "/r/" marker.
    text = re.sub(r"\^", "", text)
    text = re.sub("@PER@CENT@", "%", text)
    return re.sub("/r/", "", text)


df_data.loc[:, ["title", "abstract"]] = df_data.loc[:, ["title", "abstract"]].apply(
    lambda column: column.apply(_normalize_text))

In [10]:
# Changing the type of feature "author_affil".
# The column holds Python-literal strings read from the CSV file. They are parsed with
# ast.literal_eval rather than eval, so the file contents can only produce literals
# (lists, dicts, strings, ...) and can never execute code.
# Values that are not strings (e.g. already parsed on a cell re-run) are left untouched.
df_data.author_affil = df_data.author_affil.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value)

In [11]:
# Normalizing the feature "author_affil".
_author_converter = LatexNodes2Text()  # built once and reused for every author


def _normalize_author(author):
    """Build a {"name", "affiliation"} dict from a raw author record.

    Reads the "author_name" and "author_inst" keys of ``author``; both values go
    through LaTeX-to-text conversion and whitespace collapsing.
    """
    # Name: drop a leading "- " and a pair of double quotes wrapping the whole value.
    name = re.sub(r"^-\s", "", author["author_name"])
    name = re.sub(r"^\"(.+)\"$", r"\g<1>", name)
    name = re.sub(r"\s+", " ", _author_converter.latex_to_text(name))

    # Affiliation: drop "<digits>. " fragments, any "Affiliation:" label
    # (case-insensitive) and a pair of double quotes wrapping the whole value.
    affiliation = re.sub(r"[0-9]+\.\s", "", author["author_inst"])
    affiliation = re.sub("Affiliation:", "", affiliation, flags=re.IGNORECASE)
    affiliation = re.sub(r"^\"(.+)\"$", r"\g<1>", affiliation)
    affiliation = re.sub(r"\s+", " ", _author_converter.latex_to_text(affiliation))

    return {"name": name, "affiliation": affiliation}


# Records with an empty author list become None.
df_data.author_affil = [
    [_normalize_author(author) for author in authors] if len(authors) > 0 else None
    for authors in df_data.author_affil]

In [12]:
# Removing the invalid authors and affiliations.
invalid_authors = ["Revision Created", "Revision Converted", "Newly Submitted Revision",
                   "Final Decision"]


def _clean_authors(authors):
    """Return the authors of one record as a tuple without invalid entries.

    Entries whose stripped name is listed in ``invalid_authors`` are dropped. For the
    remaining entries, an empty affiliation or one equal to "none" (ignoring case and
    dots) is set to None. A falsy ``authors`` value (e.g. None) is returned unchanged.
    """
    if not authors:
        return authors
    kept = []
    for author in authors:
        if author["name"].strip() in invalid_authors:
            continue
        affiliation = author["affiliation"]
        if not affiliation or affiliation.lower().replace(".", "") == "none":
            author["affiliation"] = None
        kept.append(author)
    return tuple(kept)


# Rebuild the whole column in one assignment. This avoids Series.iteritems(), which was
# removed in pandas 2.0, and the chained assignment `df_data.author_affil[idx] = ...`.
df_data.author_affil = [_clean_authors(authors) for authors in df_data.author_affil]

In [13]:
# Replace NaN and the textual placeholders "none", "none." and "None" with None.
null_markers = dict.fromkeys((np.nan, "none", "none.", "None"), None)
df_data.replace(to_replace=null_markers, inplace=True)

In [14]:
# Preview the first five cleaned records.
df_data.head(n=5)

Unnamed: 0,title,doi,id,abstract,author_affil,publication_date,source
0,"Characteristics, outcome and predictors of in-...",10.1101/2020.06.30.20143701,2020.06.30.20143701,"Since December 2019, coronavirus disease 2019 ...","({'name': 'ENRICO MARIA TRECARICHI', 'affiliat...",2020-07-02,medrxiv
1,A network-informed analysis of SARS-CoV-2 and ...,10.1101/2020.07.01.20144121,2020.07.01.20144121,Abnormal coagulation and an increased risk of ...,"({'name': 'Jun Ding', 'affiliation': 'Computat...",2020-07-02,medrxiv
2,ROX Index Predicts Intubation in Patients with...,10.1101/2020.06.30.20143867,2020.06.30.20143867,Introduction Use of high flow nasal therapy (H...,"({'name': 'Maulin Patel', 'affiliation': 'Temp...",2020-07-02,medrxiv
3,COVID-MATCH65 - A prospectively derived clinic...,10.1101/2020.06.30.20143818,2020.06.30.20143818,Due to the ongoing COVID-19 pandemic and incre...,"({'name': 'Jason A Trubiano', 'affiliation': '...",2020-07-02,medrxiv
4,Relative COVID-19 viral persistence and antibo...,10.1101/2020.07.01.20143917,2020.07.01.20143917,Importance: The COVID-19 antibody response is ...,"({'name': 'Chung-Guei Huang', 'affiliation': '...",2020-07-02,medrxiv


In [15]:
# Summarize the cleaned dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6020 entries, 0 to 6019
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             6020 non-null   object
 1   doi               6020 non-null   object
 2   id                6020 non-null   object
 3   abstract          6016 non-null   object
 4   author_affil      5997 non-null   object
 5   publication_date  6020 non-null   object
 6   source            6020 non-null   object
dtypes: object(7)
memory usage: 329.3+ KB


## 3. Saving the dataframe

In [16]:
# Write the prepared dataframe to CSV without the index column, quoting every field.
df_data.to_csv(path_or_buf="../../data/prepared/biorxiv_covid_19.csv", index=False,
               quoting=csv.QUOTE_ALL)