# Cleaning and Preprocessing the PubMed publications related to COVID-19

For collecting the PubMed publications related to COVID-19, we used the "pymed" library. It is available at https://pypi.org/project/pymed/.

In [None]:
# Uncomment to install the library.
# %pip install pylatexenc

In [None]:
# Importing the required libraries.
import ast
import csv
import hashlib
import re

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [None]:
# Loading the raw PubMed export into a dataframe; every column is read with dtype "object".
raw_data_path = "../../data/raw/pubmed_raw.csv"
df_data = pd.read_csv(raw_data_path, header=0, dtype=object)

In [None]:
# Previewing the first rows of the raw dataframe.
df_data.head()

In [None]:
# Printing a summary of the raw dataframe (columns, non-null counts and dtypes).
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Defining the function "clean_text" to clean and preprocess any text.
def clean_text(text):
    """Clean and normalize a raw text value.

    The value is converted to ``str`` and then, in order:

    1. zero-width spaces (U+200B) are removed together with any digits directly before them;
    2. soft hyphens become "-" and runs of two or more hyphens collapse to a single "-";
    3. a fixed list of special characters and leftovers ("()", "[]", escaped quotes) is
       replaced or removed;
    4. backslashes become spaces;
    5. every run of whitespace collapses to one space and the result is stripped.

    Backslashes are replaced *before* the whitespace is collapsed, so a backslash surrounded
    by spaces no longer leaves several consecutive spaces in the output.

    Args:
        text: Value to clean. Any falsy value (``None``, ``""``, ...) is treated as missing.

    Returns:
        The cleaned string, or ``None`` when the input is falsy or when nothing is left
        after cleaning.
    """
    if not text:
        return None

    # Applied in this exact order; the escaped quote must be handled before the generic
    # backslash replacement below.
    replacements = (
        ("\u2009", " "),
        ("\xa0", " "),
        ("\n", " "),
        ("\ufeff", ""),
        ("\u202f", ""),
        ("\u2028", " "),
        ("\u200f", ""),
        ("\u200e", ""),
        ("()", ""),
        ("[]", ""),
        ("\\'", "'"),
        ("\uf06b", ""),
        ("\x96", ""),
        ("\u200c", ""),
    )

    cleaned = re.sub("[0-9]*\u200b", "", str(text))
    cleaned = cleaned.replace("\xad", "-")
    cleaned = re.sub(r"\-{2,}", "-", cleaned)
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)

    # Remaining backslashes become spaces first, so the whitespace collapse sees them too.
    cleaned = cleaned.replace("\\", " ")
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # An empty result carries no information; report it as missing like a falsy input.
    return cleaned or None

In [None]:
# Marking the missing values with "None" instead of "NaN".
df_data = df_data.replace({np.nan: None})

In [None]:
# Dropping the columns that are not kept in the prepared dataset.
columns_drop = [
    "methods",
    "conclusions",
    "results",
    "copyrights",
    "xml",
    "isbn",
    "language",
    "publication_type",
    "sections",
    "publisher",
    "publisher_location",
]
df_data = df_data.drop(columns=columns_drop)

In [None]:
# Keeping only the first whitespace-separated token of "pubmed_id" for each paper.
# str.split() without arguments already discards surrounding whitespace.
df_data["pubmed_id"] = df_data["pubmed_id"].map(lambda raw_id: raw_id.split()[0])

In [None]:
# Keeping only the first whitespace-separated token of "doi" for the papers that have one.
has_doi = df_data["doi"].notnull()
df_data.loc[has_doi, "doi"] = df_data.loc[has_doi, "doi"].map(lambda raw_doi: raw_doi.split()[0])

In [None]:
# Normalizing the features "abstract", "title" and "journal".

# One converter is reused for every abstract instead of building a new one per row.
latex_to_text = LatexNodes2Text().latex_to_text

# "%" starts a comment in LaTeX, so it is escaped before the conversion. Only a "%" that is
# not already preceded by a backslash is escaped: turning an existing "\%" into "\\%" would
# produce a line break followed by a comment, and the rest of the abstract would be dropped.
unescaped_percent = re.compile(r"(?<!\\)%")

df_data["abstract"] = df_data["abstract"].apply(
    lambda x: clean_text(latex_to_text(
        re.sub(r"\s+", " ", unescaped_percent.sub(r"\\%", x)))) if x else None)

# "clean_text" already returns None for missing/empty values, so no extra guard is needed.
df_data["title"] = df_data["title"].apply(clean_text)
df_data["journal"] = df_data["journal"].apply(clean_text)

In [None]:
# Setting the feature "keywords" as a tuple of keywords and normalizing the keywords for each paper.
def _parse_keywords(raw_keywords):
    """Parse the textual list stored in "keywords" into a tuple of cleaned keywords.

    The raw CSV is external data, so the text is parsed with ast.literal_eval, which only
    accepts Python literals, instead of eval, which would execute arbitrary expressions.
    The text is also parsed once per row instead of twice.

    Args:
        raw_keywords: String holding the literal representation of a list of keywords.

    Returns:
        A tuple with every keyword passed through "clean_text" (entries may be None),
        or None when the parsed list is empty.
    """
    parsed = ast.literal_eval(raw_keywords)
    return tuple(clean_text(keyword) for keyword in parsed) if parsed else None


# Assigning through df_data.loc[...] writes to the dataframe itself; the chained form
# "df_data.keywords.loc[...] = ..." may only modify a temporary copy.
has_keywords = df_data["keywords"].notnull()
df_data.loc[has_keywords, "keywords"] = df_data.loc[has_keywords, "keywords"].apply(_parse_keywords)

In [None]:
# Counting the papers whose keywords contain at least one "None" entry.
keyword_sets = df_data.loc[df_data["keywords"].notnull(), "keywords"]
keyword_sets[[any(item is None for item in keywords) for keywords in keyword_sets]].size

In [None]:
# Removing the invalid keywords.
# Empty/None entries are dropped from each tuple, and a tuple left without any keyword
# becomes None. Assigning through df_data.loc[...] writes to the dataframe itself; the
# chained form "df_data.keywords.loc[...] = ..." may only modify a temporary copy.
has_keywords = df_data["keywords"].notnull()
df_data.loc[has_keywords, "keywords"] = df_data.loc[has_keywords, "keywords"].apply(
    lambda keywords: tuple(item for item in keywords if item) or None)

In [None]:
# Counting again the papers whose keywords contain at least one "None" entry.
keyword_sets = df_data.loc[df_data["keywords"].notnull(), "keywords"]
keyword_sets[[any(item is None for item in keywords) for keywords in keyword_sets]].size

In [None]:
# Correcting the feature "authors".
def _stable_id(label):
    """Return a reproducible identifier for *label*, as a signed 64-bit integer string.

    The built-in hash() of a string is salted per interpreter process, so ids produced
    with it change on every run of the notebook. A SHA-256 digest of the same
    "<label> - PubMed" text gives the same id on every run.
    """
    digest = hashlib.sha256("{} - {}".format(label, "PubMed").encode("utf-8")).digest()
    return str(int.from_bytes(digest[:8], "big", signed=True))


def _normalize_authors(raw_authors):
    """Convert the textual list of authors of one paper into a tuple of normalized dicts.

    The raw CSV is external data, so the text is parsed (once) with ast.literal_eval,
    which only accepts Python literals, instead of eval.

    Args:
        raw_authors: String holding the literal representation of a list of dicts with
            the keys "firstname", "lastname" and, optionally, "affiliation".

    Returns:
        A tuple of dicts with the keys "name", "id", "affiliation", "affil_id" and
        "country" (always None here), keeping only the authors that have a name or an
        affiliation; None when no author is left.
    """
    parsed = ast.literal_eval(raw_authors)
    if not parsed:
        return None

    normalized = []
    for author in parsed:
        # Joining whichever of first/last name is present; "clean_text" yields None
        # when both are missing.
        name_parts = [str(part) for part in (author.get("firstname"), author.get("lastname")) if part]
        name = clean_text(" ".join(name_parts))
        affiliation = clean_text(author.get("affiliation"))

        if not (name or affiliation):
            continue

        normalized.append({
            "name": name,
            "id": _stable_id(name) if name else None,
            "affiliation": affiliation,
            "affil_id": _stable_id(affiliation) if affiliation else None,
            "country": None,
        })

    return tuple(normalized) or None


# Rows without authors are left untouched. Assigning through df_data.loc[...] writes to
# the dataframe itself and does not depend on the index being 0..n-1; the chained form
# "df_data.authors[idx] = ..." may only modify a temporary copy.
has_authors = df_data["authors"].notnull()
df_data.loc[has_authors, "authors"] = df_data.loc[has_authors, "authors"].apply(_normalize_authors)

In [None]:
# Giving the features "authors", "keywords" and "journal" their final names.
column_names = {
    "authors": "author_affil",
    "keywords": "auth_keywords",
    "journal": "vehicle_name",
}
df_data = df_data.rename(columns=column_names)

In [None]:
# Removing the duplicated records by features "title" and "doi".
# Only the records that have both a title and a doi can be compared; the others are kept as they are.
has_title_and_doi = df_data["title"].notnull() & df_data["doi"].notnull()

# Within each (title, doi) group the record that sorts last by "publication_date" is kept.
# "keep" is passed by keyword: recent pandas versions reject it as a positional argument.
deduplicated = (
    df_data[has_title_and_doi]
    .sort_values(by=["title", "publication_date"])
    .drop_duplicates(subset=["title", "doi"], keep="last")
)

df_data = pd.concat([df_data[~has_title_and_doi], deduplicated], ignore_index=True)

In [None]:
# Previewing the first rows of the cleaned dataframe.
df_data.head()

In [None]:
# Printing a summary of the cleaned dataframe (columns, non-null counts and dtypes).
df_data.info()

## 3. Saving the dataframe

In [None]:
# Writing the prepared dataframe to a CSV file, without the index and quoting every field.
prepared_data_path = "../../data/prepared/pubmed_covid_19.csv"
df_data.to_csv(prepared_data_path, index=False, quoting=csv.QUOTE_ALL)