# Cleaning and Preprocessing the PubMed publications related to COVID-19

To collect the PubMed publications related to COVID-19, we used the "pymed" library, which is available at https://pypi.org/project/pymed/.

In [1]:
# Uncomment to install pylatexenc (imported below as LatexNodes2Text).
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import ast
import hashlib
import re, csv, pandas as pd, numpy as np
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Load the raw PubMed export into a dataframe. The first line of the file is the header and
# every column is read with dtype=object (no numeric or date inference).
raw_csv_path = "../../data/raw/pubmed_raw.csv"
df_data = pd.read_csv(raw_csv_path, dtype=object, header=0)

In [4]:
# Preview the first rows of the raw dataframe to check that the CSV was parsed as expected.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,authors,methods,conclusions,results,copyrights,doi,xml,isbn,language,publication_type,sections,publisher,publisher_location
0,33668070,Expression of Surfactant protein D (SP-D) dist...,The differentiation of influenza and COVID-19 ...,"['COVID-19', 'SARS-CoV-2', 'acute respiratory ...",The Journal of infectious diseases,2021-03-06,"[{'lastname': 'Choreño-Parra', 'firstname': 'J...",,,,© The Author(s) 2021. Published by Oxford Univ...,10.1093/infdis/jiab113,<Element 'PubmedArticle' at 0x7efeff4e9720>,,,,,,
1,33668060,A million-dose success for Nepal: insights fro...,,"['Astra-Zeneca COVID-19 vaccine', 'SARS-CoV-2'...",Journal of travel medicine,2021-03-06,"[{'lastname': 'Bhattarai', 'firstname': 'Suraj...",,,,,10.1093/jtm/taab027,<Element 'PubmedArticle' at 0x7efed2b4c6d0>,,,,,,
2,33668011,Current smoking and SARS-CoV-2 infection: find...,Several studies reported a low prevalence of c...,[],JMIR public health and surveillance,2021-03-06,"[{'lastname': 'Prinelli', 'firstname': 'Federi...",,,Out of the 6857 individuals (mean age 47.9 yea...,,10.2196/27091,<Element 'PubmedArticle' at 0x7efed2b54810>,,,,,,
3,33668003,The prognostic value of elevated creatine kina...,"Creatine kinase (CK), a marker of muscle damag...","['Coronavirus', 'Creatine kinase', 'Mortality'...",Diabetes & metabolic syndrome,2021-03-06,"[{'lastname': 'Akbar', 'firstname': 'Mohammad ...",,Elevated CK was associated with increased mort...,There are 2471 patients from 14 studies includ...,Copyright © 2021. Published by Elsevier Ltd.,10.1016/j.dsx.2021.02.012,<Element 'PubmedArticle' at 0x7efed26e3180>,,,,,,
4,33667997,COVID-19 exposure and obstructive sleep apnea:...,,[],Sleep medicine,2021-03-06,"[{'lastname': 'Mohit', 'firstname': None, 'ini...",,,,,10.1016/j.sleep.2021.02.022,<Element 'PubmedArticle' at 0x7efed26efbd0>,,,,,,


In [5]:
# Summarize the raw dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105575 entries, 0 to 105574
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   pubmed_id           105575 non-null  object
 1   title               105381 non-null  object
 2   abstract            67951 non-null   object
 3   keywords            105502 non-null  object
 4   journal             105502 non-null  object
 5   publication_date    105575 non-null  object
 6   authors             105575 non-null  object
 7   methods             844 non-null     object
 8   conclusions         7147 non-null    object
 9   results             15805 non-null   object
 10  copyrights          41345 non-null   object
 11  doi                 103533 non-null  object
 12  xml                 105502 non-null  object
 13  isbn                42 non-null      object
 14  language            73 non-null      object
 15  publication_type    73 non-null      object
 16  se

## 2. Cleaning and preprocessing the dataframe

In [6]:
# Defining the function "clean_text" to clean and preprocess any text.
def clean_text(text):
    """Return a normalized, single-line version of ``text``.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()``, so non-string values are accepted.

    Returns
    -------
    str or None
        ``None`` when ``text`` is falsy (``None``, ``""``, ``0``, ...); otherwise the cleaned
        string, which may be empty if the input held nothing but removable characters.
    """
    if not text:
        return None

    cleaned = str(text)
    # Remove zero-width spaces together with any digits that immediately precede them.
    cleaned = re.sub("[0-9]*\u200b", "", cleaned)
    # Soft hyphens become real hyphens, then runs of two or more hyphens collapse into one.
    cleaned = re.sub(r"\-{2,}", "-", cleaned.replace("\xad", "-"))

    # Literal substitutions, applied in this order.
    substitutions = (
        ("\u2009", " "),   # thin space
        ("\xa0", " "),     # no-break space
        ("\n", " "),
        ("\ufeff", ""),    # byte-order mark
        ("\u202f", ""),    # narrow no-break space
        ("\u2028", " "),   # line separator
        ("\u200f", ""),    # right-to-left mark
        ("\u200e", ""),    # left-to-right mark
        ("()", ""),        # empty parentheses
        ("[]", ""),        # empty brackets
        ("\\'", "\'"),     # escaped apostrophe -> plain apostrophe
        ("\uf06b", ""),    # private-use character
        ("\x96", ""),      # C1 control character
        ("\u200c", ""),    # zero-width non-joiner
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Backslashes are turned into spaces BEFORE whitespace is collapsed; doing it afterwards
    # left runs of several spaces in the result (e.g. "a \ b" became "a   b").
    cleaned = cleaned.replace("\\", " ")
    return re.sub(r"\s+", " ", cleaned).strip()

In [7]:
# Represent missing values with None instead of NaN.
df_data = df_data.replace({np.nan: None})

In [8]:
# Discard the columns that are not carried over to the prepared dataset.
columns_drop = [
    "methods",
    "conclusions",
    "results",
    "copyrights",
    "xml",
    "isbn",
    "language",
    "publication_type",
    "sections",
    "publisher",
    "publisher_location",
]
df_data = df_data.drop(columns=columns_drop)

In [9]:
# Keep only the first whitespace-separated token of each "pubmed_id" value.
# str.split() without arguments already discards surrounding whitespace.
df_data["pubmed_id"] = df_data["pubmed_id"].map(lambda raw_id: raw_id.split()[0])

In [10]:
# Keep only the first whitespace-separated token of each non-null "doi" value;
# rows without a DOI are left untouched.
has_doi = df_data["doi"].notnull()
df_data.loc[has_doi, "doi"] = df_data.loc[has_doi, "doi"].map(lambda raw_doi: raw_doi.split()[0])

In [11]:
# Normalizing the features "abstract", "title" and "journal".
# A single converter is built once and reused for every abstract instead of per row.
latex_converter = LatexNodes2Text()


def _abstract_to_text(raw_abstract):
    """Convert an abstract that may contain LaTeX markup into cleaned plain text.

    Returns None for a missing or empty abstract.
    """
    if not raw_abstract:
        return None
    # "%" starts a comment in LaTeX, so bare percent signs are escaped before conversion.
    # Percent signs that are already escaped ("\%") are left alone: escaping them again
    # would yield "\\%", i.e. a line break followed by a comment that hides the remaining text.
    escaped = re.sub(r"(?<!\\)%", r"\\%", raw_abstract)
    # Collapse the text onto one line before handing it to the LaTeX converter.
    single_line = re.sub(r"\s+", " ", escaped)
    return clean_text(latex_converter.latex_to_text(single_line))


df_data["abstract"] = df_data["abstract"].map(_abstract_to_text)
# clean_text already returns None for missing/empty values, so it can be mapped directly.
df_data["title"] = df_data["title"].map(clean_text)
df_data["journal"] = df_data["journal"].map(clean_text)

In [12]:
# Setting the feature "keywords" as a tuple of keywords and normalizing the keywords for each paper.
def _parse_keywords(raw_keywords):
    """Turn a stored list literal (e.g. "['COVID-19', 'SARS-CoV-2']") into a tuple of cleaned keywords.

    Returns None for a missing value or an empty list. A keyword that clean_text cannot
    keep (e.g. an empty string) appears as None inside the tuple.
    """
    if raw_keywords is None:
        return None
    # ast.literal_eval replaces eval(): it only accepts Python literals, so text read from
    # the CSV can never execute code. The value is also parsed once instead of twice.
    keywords = ast.literal_eval(raw_keywords)
    return tuple(clean_text(keyword) for keyword in keywords) if keywords else None


# The whole column is rebuilt in one assignment. Writing through `df_data.keywords.loc[...]`
# is chained indexing, which is not guaranteed to modify `df_data` itself.
df_data["keywords"] = df_data["keywords"].map(_parse_keywords)

  arr_value = np.array(value)


In [13]:
# Counting the papers whose keyword tuple contains at least one missing (None) keyword.
keywords_present = df_data.loc[df_data["keywords"].notnull(), "keywords"]
contains_missing = [any(keyword is None for keyword in keywords) for keywords in keywords_present]
keywords_present[contains_missing].size

28

In [14]:
# Removing the invalid keywords.
def _drop_empty_keywords(keywords):
    """Remove empty/None entries from a keyword tuple; return None if nothing is left."""
    if keywords is None:
        return None
    kept = tuple(item for item in keywords if item)
    return kept if kept else None


# The column is rebuilt in one assignment instead of writing through
# `df_data.keywords.loc[...]`, which is chained indexing and is not guaranteed to
# modify `df_data` itself.
df_data["keywords"] = df_data["keywords"].map(_drop_empty_keywords)

In [15]:
# Counting again the papers whose keyword tuple still contains a missing (None) keyword.
remaining_keywords = df_data["keywords"].dropna()
still_invalid = remaining_keywords[
    [any(keyword is None for keyword in keywords) for keywords in remaining_keywords]]
still_invalid.size

0

In [16]:
# Correcting the feature "authors".
def _stable_id(label):
    """Return a reproducible identifier for ``label`` as the string of a signed 64-bit integer.

    The built-in hash() is salted per interpreter process for strings, so identifiers
    built with it change every time the kernel is restarted. A SHA-256 digest of the same
    "<label> - PubMed" key yields the same value on every run.
    """
    key = "{} - {}".format(label, "PubMed").encode("utf-8")
    return str(int.from_bytes(hashlib.sha256(key).digest()[:8], byteorder="big", signed=True))


def _normalize_authors(raw_authors):
    """Convert the stored list-of-dicts literal of one paper into a tuple of author records.

    Each record has the keys "name", "id", "affiliation", "affil_id" and "country".
    Authors with neither a name nor an affiliation are skipped. Returns None when the
    value is missing, the list is empty, or no author is kept.
    """
    # ast.literal_eval replaces eval(): it only accepts Python literals, so text read from
    # the CSV can never execute code. The value is also parsed once instead of twice.
    parsed_authors = ast.literal_eval(raw_authors) if raw_authors else None
    if not parsed_authors:
        return None

    records = []
    for author in parsed_authors:
        # Join whichever of first/last name is present.
        name_parts = [str(part) for part in (author["firstname"], author["lastname"]) if part]
        name = clean_text(" ".join(name_parts)) if name_parts else None
        affiliation = clean_text(author["affiliation"]) if "affiliation" in author else None

        if not (name or affiliation):
            continue

        records.append({
            "name": name,
            "id": _stable_id(name) if name else None,
            "affiliation": affiliation,
            "affil_id": _stable_id(affiliation) if affiliation else None,
            "country": None,
        })

    return tuple(records) if records else None


# The column is rebuilt in one assignment. Writing row by row with `df_data.authors[idx] = ...`
# is chained assignment, which is slow and not guaranteed to modify `df_data` itself.
df_data["authors"] = df_data["authors"].map(_normalize_authors)

In [17]:
# Renaming the features "authors", "keywords" and "journal".
column_renames = {
    "authors": "author_affil",
    "keywords": "auth_keywords",
    "journal": "vehicle_name",
}
df_data = df_data.rename(columns=column_renames)

In [18]:
# Removing the duplicated records by features "title" and "doi".
# Records missing a title or a DOI are kept as they are; only records that have both
# are de-duplicated.
has_title_and_doi = df_data["title"].notnull() & df_data["doi"].notnull()

# After sorting by title and publication date, keep="last" retains the record with the
# latest publication date within each (title, doi) group. `keep` is passed by keyword:
# passing it positionally raises a TypeError in pandas >= 2.0.
deduplicated = (
    df_data[has_title_and_doi]
    .sort_values(by=["title", "publication_date"])
    .drop_duplicates(subset=["title", "doi"], keep="last")
)

df_data = pd.concat([df_data[~has_title_and_doi], deduplicated], ignore_index=True)

In [19]:
# Preview the first rows of the cleaned, de-duplicated dataframe.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,33666930,Deployment of the 1st Area Medical Laboratory ...,"In December 2019, an outbreak of pneumonia cau...",,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'William Washington', 'id': '3069391...",
1,33666929,The COVID-19 Army Rapid Assessment Tool (CARAT...,The COVID-19 pandemic poses unique challenges ...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Michael J Walters', 'id': '25963685...",
2,33666928,1ST Cavalry Division Forward's Defender Europe...,The 1st Cavalry Division Forward (1CD FWD) alo...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Chi L Truong', 'id': '4333851133578...",
3,33666927,A Comprehensive Overview of the US Army Dentis...,The historic outbreak of the novel coronavirus...,,"Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Shani O Thompson Burkes', 'id': '-9...",
4,33666926,"Nutrition, Immune Function, and Infectious Dis...",Consuming a diet meeting energy demands and pr...,"(COVID-19, energy intake, immune function, mic...","Medical journal (Fort Sam Houston, Tex.)",2021-03-06,"({'name': 'Tracey J Smith', 'id': '-1746242993...",


In [20]:
# Summarize the cleaned dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105413 entries, 0 to 105412
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   pubmed_id         105413 non-null  object
 1   title             105219 non-null  object
 2   abstract          67886 non-null   object
 3   auth_keywords     58334 non-null   object
 4   vehicle_name      105340 non-null  object
 5   publication_date  105413 non-null  object
 6   author_affil      104429 non-null  object
 7   doi               103371 non-null  object
dtypes: object(8)
memory usage: 6.4+ MB


## 3. Saving the dataframe

In [21]:
# Write the prepared dataframe to a CSV file, without the index and with every field quoted.
prepared_csv_path = "../../data/prepared/pubmed_covid_19.csv"
df_data.to_csv(prepared_csv_path, quoting=csv.QUOTE_ALL, index=False)