# Cleaning and Preprocessing the bioRxiv publications related to COVID-19

The publications' data were collected from [bioRxiv API](https://api.biorxiv.org/covid19/help) related to COVID-19.

In [1]:
# Uncomment to install the library.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Creating a dataframe from the raw data.
df_data = pd.read_csv("../../data/raw/biorxiv_raw.csv", header=0)

In [4]:
# Checking the dataframe.
df_data.head()

Unnamed: 0,rel_doi,rel_title,rel_date,rel_site,rel_link,rel_abs,rel_num_authors,rel_authors,version,license,type,category
0,10.1101/2021.03.03.21251639,SARS-CoV-2 antibody magnitude and detectabilit...,2021-03-05,medRxiv,https://medrxiv.org/cgi/content/short/2021.03....,Serosurveillance studies are critical for esti...,45,"[{'author_name': 'Michael J Peluso', 'author_i...",1,cc_by,PUBLISHAHEADOFPRINT,infectious diseases
1,10.1101/2021.03.03.21252086,"COVID Symptoms, Symptom Clusters, and Predicto...",2021-03-05,medRxiv,https://medrxiv.org/cgi/content/short/2021.03....,Emerging data suggest that the effects of infe...,11,"[{'author_name': 'Yong Huang', 'author_inst': ...",1,cc_by_nd,PUBLISHAHEADOFPRINT,infectious diseases
2,10.1101/2021.03.02.21252105,SARS-CoV-2 Load does not Predict Transmissibil...,2021-03-05,medRxiv,https://medrxiv.org/cgi/content/short/2021.03....,SARS-CoV2 is highly contagious and the global ...,22,"[{'author_name': 'Di Tian', 'author_inst': 'Tu...",1,cc_no,PUBLISHAHEADOFPRINT,infectious diseases
3,10.1101/2021.03.03.21251066,Age-dependent immune response to the Biontech/...,2021-03-05,medRxiv,https://medrxiv.org/cgi/content/short/2021.03....,Background: The SARS-CoV-2 pandemic has led to...,19,"[{'author_name': 'Lisa Müller', 'author_inst':...",1,cc_by_nc_nd,PUBLISHAHEADOFPRINT,infectious diseases
4,10.1101/2021.03.01.21252250,Just 2% of SARS-CoV-2-positive individuals car...,2021-03-05,medRxiv,https://medrxiv.org/cgi/content/short/2021.03....,We analyze data from the Fall 2020 pandemic re...,25,"[{'author_name': 'Qing Yang', 'author_inst': '...",1,cc_by,PUBLISHAHEADOFPRINT,infectious diseases


In [5]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13789 entries, 0 to 13788
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rel_doi          13789 non-null  object
 1   rel_title        13789 non-null  object
 2   rel_date         13789 non-null  object
 3   rel_site         13789 non-null  object
 4   rel_link         13789 non-null  object
 5   rel_abs          13789 non-null  object
 6   rel_num_authors  13789 non-null  int64 
 7   rel_authors      13702 non-null  object
 8   version          13789 non-null  int64 
 9   license          13733 non-null  object
 10  type             13789 non-null  object
 11  category         13789 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.3+ MB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Removing unnecessary columns.
df_data.drop(axis=1, columns=["rel_num_authors", "version", "license", "type"], inplace=True)

In [7]:
# Renaming the columns.
columns = {"rel_title": "title", "rel_doi": "doi", "rel_link": "id", "rel_abs": "abstract",
    "rel_authors": "author_affil", "rel_date": "publication_date", "rel_site": "source",
    "category": "subject_areas"}
df_data.rename(columns=columns, inplace=True)

In [8]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None, "none": None, "none.": None, "None": None}, inplace=True)

In [9]:
# Normalizing the feature "id".
df_data.id = df_data.id.apply(lambda x: x.split("/")[-1])

In [10]:
# Normalizing the features "title" and "abstract".
df_data.loc[:, ["title", "abstract"]] = df_data.loc[:, ["title", "abstract"]].apply(
    lambda x: x.apply(lambda y: re.sub("/r/", "", re.sub("@PER@CENT@", "%", re.sub(r"\^", "",
        LatexNodes2Text().latex_to_text(re.sub(r"\s+", " ", re.sub("\\\\?%", "@PER@CENT@",
            re.sub("\\\\", "\n", re.sub(r"\\href\{(.+)\}\{(.+)\}", "\g<2> (\g<1>)",
                y)))).strip())))) if y else None))

In [11]:
# Normalizing the feature "subject_areas".
df_data.subject_areas = df_data.subject_areas.apply(lambda x: tuple([x]) if x else None)

In [12]:
# Changing the type of feature "author_affil".
df_data.author_affil = df_data.author_affil.apply(lambda x: eval(x) if x else None)

In [13]:
# Normalizing the feature "author_affil".
df_data.author_affil[df_data.author_affil.notnull()] = [
    [{"name": re.sub(r"\s+", " ", LatexNodes2Text().latex_to_text(
                re.sub(r"^\"(.+)\"$", "\g<1>", re.sub("^-\s", "", author["author_name"])))),
      "affiliation": re.sub(r"\s+", " ", LatexNodes2Text().latex_to_text(
                        re.sub(r"^\"(.+)\"$", "\g<1>", re.sub("Affiliation:", "",
                            re.sub(r"[0-9]+\.\s", "", author["author_inst"]), flags=re.IGNORECASE))))}
     for author in authors] if len(authors) > 0 else None
    for authors in df_data.author_affil[df_data.author_affil.notnull()]]

In [14]:
# Removing the invalid authors and affiliations.
invalid_authors = ["Revision Created", "Revision Converted", "Newly Submitted Revision",
                   "Final Decision"]
for idx, authors in df_data.author_affil[df_data.author_affil.notnull()].iteritems():
    if authors:
        for author in list(authors):
            if author["name"].strip() in invalid_authors:
                authors.remove(author)
            elif not author["affiliation"] or author["affiliation"].lower().replace(".", "") == "none":
                author["affiliation"] = None
        df_data.author_affil[idx] = tuple(authors)

In [15]:
# Creating the authors' and affiliations' IDs.
df_data.author_affil[df_data.author_affil.notnull()] = [tuple([
    {"id": str(hash("{} - {}".format(author["name"], df_data.source[idx]))) if author["name"] else None,
     "name": author["name"] if author["name"] else None,
     "affil_id": str(hash("{} - {}".format(author["affiliation"], df_data.source[idx]))) \
         if author["affiliation"] else None,
     "affiliation": author["affiliation"] if author["affiliation"] else None, "country": None}
    for author in authors])
    for idx, authors in df_data.author_affil[df_data.author_affil.notnull()].iteritems()]

In [16]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None, "none": None, "none.": None, "None": None}, inplace=True)

In [17]:
# Removing the duplicated records by features "title" and "doi".
df_data = df_data.sort_values(by=["title", "publication_date"]).drop_duplicates(["title", "doi"], "last")

In [18]:
# Checking the result.
df_data.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
5607,10.1101/2020.08.25.20181545,"""I walk around like my hands are covered in mu...",2020-08-31,medRxiv,2020.08.25.20181545,ObjectivesTo investigate how and why Canadians...,"({'id': '8616496335208757239', 'name': 'Robyn ...","(public and global health,)"
9458,10.1101/2020.05.28.120709,"""Monoclonal-type"" plastic antibodies for SARS-...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '2070372118427817817', 'name': 'France...","(synthetic biology,)"
11723,10.1101/2020.04.16.20067884,"""No test is better than a bad test"": Impact of...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-7499246138313714402', 'name': 'Nicho...","(epidemiology,)"
2000,10.1101/2020.12.22.20248719,"""There's No Place Like Home for The Holidays:""...",2020-12-24,medRxiv,2020.12.22.20248719,"In the US, public health officials discouraged...","({'id': '-1353686671842246904', 'name': 'Shrut...","(epidemiology,)"
997,10.1101/2021.01.29.21250626,"""This is really like waiting for war and this ...",2021-02-01,medRxiv,2021.01.29.21250626,Healthcare professionals (HCPs) are facing rem...,"({'id': '-8861698337242705120', 'name': 'Madle...","(intensive care and critical care medicine,)"


In [19]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13789 entries, 5607 to 3584
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   doi               13789 non-null  object
 1   title             13789 non-null  object
 2   publication_date  13789 non-null  object
 3   source            13789 non-null  object
 4   id                13789 non-null  object
 5   abstract          13788 non-null  object
 6   author_affil      13702 non-null  object
 7   subject_areas     13789 non-null  object
dtypes: object(8)
memory usage: 969.5+ KB


## 3. Saving the dataframe

In [20]:
# Exporting the data to CSV file.
df_data.to_csv("../../data/prepared/biorxiv_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)