# Cleaning and Preprocessing the PubMed publications related to COVID-19

For collecting the PubMed publications related to COVID-19, we used the "pymed" library. It is available at <https://pypi.org/project/pymed/>.

In [1]:
# Uncomment to install the library.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import ast
import csv
import re

import numpy as np
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Loading the raw PubMed export into a dataframe; "pubmed_id" is read as a string column.
df_data = pd.read_csv(
    "../../data/raw/pubmed_raw.csv",
    header=0,
    dtype={"pubmed_id": "str"},
)

In [4]:
# Checking the dataframe: previewing the first rows of the raw data.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,authors,methods,conclusions,results,copyrights,doi,xml,isbn,language,publication_type,sections,publisher,publisher_location
0,32610371,Is BMI higher in younger patients with COVID-1...,Obesity has been found to be a risk factor for...,"['Age', 'COVID-19', 'Diabetes', 'Hypertension'...","Obesity (Silver Spring, Md.)",2020-07-02,"[{'lastname': 'Bhasin', 'firstname': 'Ajay', '...",,We found younger patients (age <50 years) with...,We found patients younger than 50 years of age...,This article is protected by copyright. All ri...,10.1002/oby.22947,<Element 'PubmedArticle' at 0x7fdbc6294458>,,,,,,
1,32610364,"The association between obesity, type 2 diabet...","To explore the association between obesity, ty...","['COVID-19', 'Hypertension', 'Mexican populati...","Obesity (Silver Spring, Md.)",2020-07-02,"[{'lastname': 'Denova-Gutiérrez', 'firstname':...",,"Obesity, diabetes, and hypertension were signi...",Patients who tested positive for COVID-19 had ...,This article is protected by copyright. All ri...,10.1002/oby.22946,<Element 'PubmedArticle' at 0x7fdbc58389f8>,,,,,,
2,32610350,Rapid Implementation of an Inpatient Telehealt...,Relaxation of laws and regulations around pri...,[],Applied clinical informatics,2020-07-02,"[{'lastname': 'Hron', 'firstname': 'Jonathan D...",,We successfully implemented and scaled a secu...,"Within 7 weeks of go-live, we hosted 1,820 in...",Georg Thieme Verlag KG Stuttgart · New York.,10.1055/s-0040-1713635,<Element 'PubmedArticle' at 0x7fdbc584a1d8>,,,,,,
3,32610334,COVID-19 Infection and Neurological Complicati...,"The present outbreak caused by SARS-CoV-2, an ...","['Coronavirus', 'Infection', 'Neurological dis...",Neuroepidemiology,2020-07-02,"[{'lastname': 'Beghi', 'firstname': 'Ettore', ...",,,,"© 2020 S. Karger AG, Basel.",10.1159/000508991,<Element 'PubmedArticle' at 0x7fdbc5853778>,,,,,,
4,32610281,Adversity as a Catalyst for Change.,,"['COVID-19', 'practice management']",The Journal of invasive cardiology,2020-07-02,"[{'lastname': 'Dalakoti', 'firstname': 'Mayank...",,,,,,<Element 'PubmedArticle' at 0x7fdbc55dbc78>,,,,,,


In [5]:
# Visualizing the information of the dataset (column names, non-null counts and dtypes).
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28818 entries, 0 to 28817
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   pubmed_id           28818 non-null  object
 1   title               28773 non-null  object
 2   abstract            14815 non-null  object
 3   keywords            28812 non-null  object
 4   journal             28812 non-null  object
 5   publication_date    28818 non-null  object
 6   authors             28818 non-null  object
 7   methods             117 non-null    object
 8   conclusions         1230 non-null   object
 9   results             2776 non-null   object
 10  copyrights          8473 non-null   object
 11  doi                 28322 non-null  object
 12  xml                 28812 non-null  object
 13  isbn                1 non-null      object
 14  language            6 non-null      object
 15  publication_type    6 non-null      object
 16  sections            6 

## 2. Cleaning and preprocessing the dataframe

In [6]:
# Defining the function "clean_text" to clean and preprocess any text.
def clean_text(text):
    """Normalize the whitespace and invisible characters of a text.

    The value is converted with str(). Zero-width spaces (together with any
    digits directly before them) are removed, a fixed set of special
    space/separator characters is mapped to a plain space or dropped, runs of
    whitespace are collapsed to a single space and the result is stripped.

    Returns None when the input is falsy (None, "", 0, ...).
    """
    if not text:
        return None

    # Code points mapped to None are deleted; the others become a plain space.
    char_map = {
        0x2009: " ",   # thin space
        0x00A0: " ",   # no-break space
        0x000A: " ",   # line feed
        0xFEFF: None,  # byte-order mark / zero-width no-break space
        0x202F: None,  # narrow no-break space
        0x2028: " ",   # line separator
        0x200F: None,  # right-to-left mark
    }
    without_zero_width = re.sub("[0-9]*\u200b", "", str(text))
    normalized = without_zero_width.translate(char_map)
    return re.sub(r"\s+", " ", normalized).strip()

In [7]:
# Defining the "None" value for the "NaN" values.
# The following cells rely on truthiness / notnull() checks to detect missing values;
# NaN is a truthy float, so it is replaced by None before those checks run.
df_data.replace({np.nan: None}, inplace=True)

In [8]:
# Removing the columns that are not needed in the prepared dataset.
columns_drop = [
    "methods",
    "conclusions",
    "results",
    "copyrights",
    "xml",
    "isbn",
    "language",
    "publication_type",
    "sections",
    "publisher",
    "publisher_location",
]
df_data.drop(columns=columns_drop, inplace=True)

In [9]:
# Getting the PubMed ID for each paper: only the first whitespace-separated token is kept.
df_data["pubmed_id"] = df_data["pubmed_id"].apply(lambda raw_id: raw_id.split()[0])

In [10]:
# Normalizing the features "abstract" and "title".
# A single converter is built once and reused for every abstract instead of
# constructing a new LatexNodes2Text object per row.
latex_converter = LatexNodes2Text()


def _normalize_abstract(abstract):
    """Convert the LaTeX markup of one abstract to plain text.

    Returns None for a missing, non-string or empty value.
    """
    if not isinstance(abstract, str) or not abstract:
        return None
    # "%" is escaped before the conversion -- presumably so that the LaTeX parser
    # does not treat the rest of the text as a comment (TODO confirm).
    escaped = abstract.replace("%", "\\%")
    return latex_converter.latex_to_text(re.sub(r"\s+", " ", escaped))


df_data["abstract"] = df_data["abstract"].apply(_normalize_abstract)
# Titles only need their line breaks replaced by spaces.
df_data["title"] = df_data["title"].apply(
    lambda title: title.replace("\n", " ") if title else None)

In [11]:
# Setting the feature "keywords" as a tuple of keywords and normalizing the keywords for each paper.
def _parse_keywords(raw_keywords):
    """Turn the stringified keyword list of one paper into a tuple of cleaned keywords.

    Returns None for a missing value or an empty list. A value that is already
    a tuple is returned unchanged, so the cell can be re-run safely.
    """
    if isinstance(raw_keywords, tuple):
        return raw_keywords
    if not isinstance(raw_keywords, str):
        return None
    # ast.literal_eval only accepts Python literals, so (unlike eval) it cannot
    # execute code embedded in the CSV file. The string is parsed only once.
    keywords = ast.literal_eval(raw_keywords)
    if not keywords:
        return None
    return tuple(clean_text(keyword) for keyword in keywords)


# Assigning the whole column at once avoids the chained "df_data.keywords.loc[...] = ..."
# form, which may write to a temporary object instead of the dataframe.
df_data["keywords"] = df_data["keywords"].apply(_parse_keywords)

In [12]:
# Correcting the feature "authors".
def _parse_authors(raw_authors):
    """Turn the stringified author list of one paper into a tuple of author dicts.

    Each resulting dict has the keys "name" (first name and last name joined,
    using whichever of the two is present) and "affiliation" (None when absent).
    Authors without any name part are skipped.

    Returns None for a missing value or when no named author remains. A value
    that is already a tuple is returned unchanged, so the cell can be re-run safely.
    """
    if isinstance(raw_authors, tuple):
        return raw_authors
    if not isinstance(raw_authors, str):
        return None

    parsed_authors = []
    # ast.literal_eval only accepts Python literals, so (unlike eval) it cannot
    # execute code embedded in the CSV file. The string is parsed only once.
    for author in ast.literal_eval(raw_authors):
        # .get() tolerates records that lack one of the name keys.
        name_parts = [part for part in (author.get("firstname"), author.get("lastname")) if part]
        if not name_parts:
            continue
        parsed_authors.append({
            "name": clean_text(" ".join(str(part) for part in name_parts)),
            # clean_text returns None for a missing/empty affiliation.
            "affiliation": clean_text(author.get("affiliation")),
        })
    return tuple(parsed_authors) if parsed_authors else None


# Assigning the whole column at once replaces the row-by-row chained assignment
# "df_data.authors[idx] = ...", which is slow and may write to a temporary copy.
df_data["authors"] = df_data["authors"].apply(_parse_authors)

In [13]:
# Renaming the features "authors", "keywords" and "journal".
df_data.rename(
    columns={
        "authors": "author_affil",
        "keywords": "auth_keywords",
        "journal": "vehicle_name",
    },
    inplace=True,
)

In [14]:
# Checking the result: previewing the first rows after the cleaning and renaming steps.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,32610371,Is BMI higher in younger patients with COVID-1...,Obesity has been found to be a risk factor for...,"(Age, COVID-19, Diabetes, Hypertension, Obesity)","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Ajay Bhasin', 'affiliation': 'Depar...",10.1002/oby.22947
1,32610364,"The association between obesity, type 2 diabet...","To explore the association between obesity, ty...","(COVID-19, Hypertension, Mexican population, O...","Obesity (Silver Spring, Md.)",2020-07-02,"({'name': 'Edgar Denova-Gutiérrez', 'affiliati...",10.1002/oby.22946
2,32610350,Rapid Implementation of an Inpatient Telehealt...,Relaxation of laws and regulations around pri...,,Applied clinical informatics,2020-07-02,"({'name': 'Jonathan D Hron', 'affiliation': 'D...",10.1055/s-0040-1713635
3,32610334,COVID-19 Infection and Neurological Complicati...,"The present outbreak caused by SARS-CoV-2, an ...","(Coronavirus, Infection, Neurological disorder...",Neuroepidemiology,2020-07-02,"({'name': 'Ettore Beghi', 'affiliation': 'Depa...",10.1159/000508991
4,32610281,Adversity as a Catalyst for Change.,,"(COVID-19, practice management)",The Journal of invasive cardiology,2020-07-02,"({'name': 'Mayank Dalakoti', 'affiliation': 'N...",


In [15]:
# Visualizing the information of the cleaned dataset (column names, non-null counts and dtypes).
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28818 entries, 0 to 28817
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pubmed_id         28818 non-null  object
 1   title             28773 non-null  object
 2   abstract          14815 non-null  object
 3   auth_keywords     13369 non-null  object
 4   vehicle_name      28812 non-null  object
 5   publication_date  28818 non-null  object
 6   author_affil      28488 non-null  object
 7   doi               28322 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


## 3. Saving the dataframe

In [16]:
# Exporting the data to CSV file.
# index=False leaves the row index out of the file; csv.QUOTE_ALL quotes every field.
df_data.to_csv("../../data/prepared/pubmed_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)