# Cleaning and Preprocessing the bioRxiv publications related to COVID-19

The publications' data were collected from [bioRxiv API](https://api.biorxiv.org/covid19/help) related to COVID-19.

In [1]:
# Uncomment to install the library.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Creating a dataframe from the raw data.
df_data = pd.read_csv("../../data/raw/biorxiv_raw.csv", header=0)

In [4]:
# Checking the dataframe.
df_data.head()

Unnamed: 0,rel_doi,rel_title,rel_date,rel_site,rel_link,rel_abs,rel_num_authors,rel_authors,version,license,type,category
0,10.1101/2020.09.22.20196204,Risk factors for mortality among hospitalized ...,2020-09-24,medRxiv,https://medrxiv.org/cgi/content/short/2020.09....,Objectives To develop a prognostic model to id...,8,"[{'author_name': 'Devin Incerti', 'author_inst...",1,cc_by_nc,PUBLISHAHEADOFPRINT,infectious diseases
1,10.1101/2020.09.23.20150961,Prospective comparison of saliva and nasophary...,2020-09-24,medRxiv,https://medrxiv.org/cgi/content/short/2020.09....,Current testing for COVID-19 relies on quantit...,3,"[{'author_name': '- COVISAL Guyane', 'author_i...",1,cc_by_nc_nd,PUBLISHAHEADOFPRINT,epidemiology
2,10.1101/2020.09.23.20200089,Epidemiological measures for informing the gen...,2020-09-24,medRxiv,https://medrxiv.org/cgi/content/short/2020.09....,"During the SARS-CoV-2 outbreak, several epidem...",5,"[{'author_name': 'Ralph Brinks', 'author_inst'...",1,cc_no,PUBLISHAHEADOFPRINT,epidemiology
3,10.1101/2020.09.23.20199463,Comparative Effectiveness of Famotidine in Hos...,2020-09-24,medRxiv,https://medrxiv.org/cgi/content/short/2020.09....,Background: Famotidine has been posited as a p...,5,"[{'author_name': 'Azza Shoaibi', 'author_inst'...",1,cc_by,PUBLISHAHEADOFPRINT,gastroenterology
4,10.1101/2020.09.22.20197046,Performance Assessment of First-Generation Ant...,2020-09-24,medRxiv,https://medrxiv.org/cgi/content/short/2020.09....,The clinical and epidemiological use of SARS-C...,12,"[{'author_name': 'Tahir S Shamsi', 'author_ins...",1,cc_no,PUBLISHAHEADOFPRINT,pathology


In [5]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9027 entries, 0 to 9026
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rel_doi          9027 non-null   object
 1   rel_title        9027 non-null   object
 2   rel_date         9027 non-null   object
 3   rel_site         9027 non-null   object
 4   rel_link         9027 non-null   object
 5   rel_abs          9027 non-null   object
 6   rel_num_authors  9027 non-null   int64 
 7   rel_authors      9027 non-null   object
 8   version          9027 non-null   int64 
 9   license          9020 non-null   object
 10  type             9027 non-null   object
 11  category         9027 non-null   object
dtypes: int64(2), object(10)
memory usage: 846.4+ KB


## 2. Cleaning and preprocessing the dataframe

In [6]:
# Removing unnecessary columns.
df_data.drop(axis=1, columns=["rel_num_authors", "version", "license", "type"], inplace=True)

In [7]:
# Renaming the columns.
columns = {"rel_title": "title", "rel_doi": "doi", "rel_link": "id", "rel_abs": "abstract",
    "rel_authors": "author_affil", "rel_date": "publication_date", "rel_site": "source",
    "category": "subject_areas"}
df_data.rename(columns=columns, inplace=True)

In [8]:
# Normalizing the feature "id".
df_data.id = df_data.id.apply(lambda x: x.split("/")[-1])

In [9]:
# Normalizing the features "title" and "abstract".
df_data.loc[:, ["title", "abstract"]] = df_data.loc[:, ["title", "abstract"]].apply(
    lambda x: x.apply(lambda y: re.sub("/r/", "", re.sub("@PER@CENT@", "%", re.sub(r"\^", "",
        LatexNodes2Text().latex_to_text(re.sub(r"\s+", " ", re.sub("\\\\?%", "@PER@CENT@",
            re.sub("\\\\", "\n", re.sub(r"\\href\{(.+)\}\{(.+)\}", "\g<2> (\g<1>)",
                y)))).strip())))) if y else None))

In [10]:
# Normalizing the feature "subject_areas".
df_data.subject_areas = df_data.subject_areas.apply(lambda x: tuple([x]) if x else None)

In [11]:
# Changing the type of feature "author_affil".
df_data.author_affil = df_data.author_affil.apply(eval)

In [12]:
# Normalizing the feature "author_affil".
df_data.author_affil = [
    [{"name": re.sub(r"\s+", " ", LatexNodes2Text().latex_to_text(
                re.sub(r"^\"(.+)\"$", "\g<1>", re.sub("^-\s", "", author["author_name"])))),
      "affiliation": re.sub(r"\s+", " ", LatexNodes2Text().latex_to_text(
                        re.sub(r"^\"(.+)\"$", "\g<1>", re.sub("Affiliation:", "",
                            re.sub(r"[0-9]+\.\s", "", author["author_inst"]), flags=re.IGNORECASE))))}
     for author in authors] if len(authors) > 0 else None for authors in df_data.author_affil]

In [13]:
# Removing the invalid authors and affiliations.
invalid_authors = ["Revision Created", "Revision Converted", "Newly Submitted Revision",
                   "Final Decision"]
for idx, authors in df_data.author_affil.iteritems():
    if authors:
        for author in list(authors):
            if author["name"].strip() in invalid_authors:
                authors.remove(author)
            elif not author["affiliation"] or author["affiliation"].lower().replace(".", "") == "none":
                author["affiliation"] = None
        df_data.author_affil[idx] = tuple(authors)

In [14]:
# Creating the authors' and affiliations' IDs.
df_data.author_affil = [tuple([
    {"id": str(hash("{} - {}".format(author["name"], df_data.source[idx]))) if author["name"] else None,
     "name": author["name"] if author["name"] else None,
     "affil_id": str(hash("{} - {}".format(author["affiliation"], df_data.source[idx]))) \
         if author["affiliation"] else None,
     "affiliation": author["affiliation"] if author["affiliation"] else None, "country": None}
    for author in authors]) for idx, authors in df_data.author_affil.iteritems()]

In [15]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None, "none": None, "none.": None, "None": None}, inplace=True)

In [16]:
# Removing the duplicated records by features "title" and "doi".
df_data = df_data.sort_values(by=["title", "publication_date"]).drop_duplicates(["title", "doi"], "last")

In [17]:
# Checking the result.
df_data.head()

Unnamed: 0,doi,title,publication_date,source,id,abstract,author_affil,subject_areas
862,10.1101/2020.08.25.20181545,"""I walk around like my hands are covered in mu...",2020-08-31,medRxiv,2020.08.25.20181545,Objectives: To investigate how and why Canadia...,"({'id': '-4223852819809795845', 'name': 'Robyn...","(public and global health,)"
4708,10.1101/2020.05.28.120709,"""Monoclonal-type"" plastic antibodies for SARS-...",2020-05-28,bioRxiv,2020.05.28.120709,Summary of the ideaOur idea is focused on the ...,"({'id': '4304627819863036756', 'name': 'France...","(synthetic biology,)"
6970,10.1101/2020.04.16.20067884,"""No test is better than a bad test"": Impact of...",2020-04-22,medRxiv,2020.04.16.20067884,Testing is viewed as a critical aspect of any ...,"({'id': '-1190320489506440084', 'name': 'Nicho...","(epidemiology,)"
4202,10.1101/2020.06.04.20122812,'Drawing on Wisdom to Cope with Adversity:' A ...,2020-06-07,medRxiv,2020.06.04.20122812,Background: Mental health has become one of th...,"({'id': '6026351697434995649', 'name': 'Jose M...","(psychiatry and clinical psychology,)"
2593,10.1101/2020.07.11.20151308,'Trained immunity' from Mycobacterium spp. exp...,2020-07-14,medRxiv,2020.07.11.20151308,Protective variables for COVID-19 are unknown....,"({'id': '-655212003565356990', 'name': 'Samer ...","(infectious diseases,)"


In [18]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9027 entries, 862 to 2144
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   doi               9027 non-null   object
 1   title             9027 non-null   object
 2   publication_date  9027 non-null   object
 3   source            9027 non-null   object
 4   id                9027 non-null   object
 5   abstract          9023 non-null   object
 6   author_affil      9027 non-null   object
 7   subject_areas     9027 non-null   object
dtypes: object(8)
memory usage: 634.7+ KB


## 3. Saving the dataframe

In [19]:
# Exporting the data to CSV file.
df_data.to_csv("../../data/prepared/biorxiv_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)