# Cleaning and Preprocessing the PubMed publications related to COVID-19

For collecting the PubMed publications related to COVID-19, we used the "pymed" library. It is avaliable on [https://pypi.org/project/pymed/].

In [1]:
# Uncomment to install the library.
# %pip install pylatexenc

In [2]:
# Importing the required libraries.
import re, csv, pandas as pd, numpy as np
from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [3]:
# Creating a dataframe from the raw data.
df_data = pd.read_csv("../../data/raw/pubmed_raw.csv", header=0, dtype={"pubmed_id": "str"})

In [4]:
# Checking the dataframe.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,authors,methods,conclusions,results,copyrights,doi,xml,isbn,language,publication_type,sections,publisher,publisher_location
0,32967025,ERCP during the COVID-19 epidemic.,,[],Endoscopy,2020-09-24,"[{'lastname': 'Wang', 'firstname': 'Hongling',...",,,,,10.1055/a-1194-4745,<Element 'PubmedArticle' at 0x7f83a91d91d8>,,,,,,
1,32967024,ERCP in patients with COVID-19 infection - is ...,,[],Endoscopy,2020-09-24,"[{'lastname': 'Bilal', 'firstname': 'Mohammad'...",,,,,10.1055/a-1180-8681,<Element 'PubmedArticle' at 0x7f83a91de5e8>,,,,,,
2,32967023,Raising the threshold for hospital admission a...,,[],Endoscopy,2020-09-24,"[{'lastname': 'Laursen', 'firstname': 'Stig B'...",,,,,10.1055/a-1202-1374,<Element 'PubmedArticle' at 0x7f83a91e8548>,,,,,,
3,32967022,"""Double-surgical-mask-with-slit"" method: reduc...",,[],Endoscopy,2020-09-24,"[{'lastname': 'Lazaridis', 'firstname': 'Nikol...",,,,,10.1055/a-1198-5471,<Element 'PubmedArticle' at 0x7f83a91f0048>,,,,,,
4,32967019,Teams and endoscopy: another effect of the COV...,,[],Endoscopy,2020-09-24,"[{'lastname': 'Dinis-Ribeiro', 'firstname': 'M...",,,,,10.1055/a-1223-2406,<Element 'PubmedArticle' at 0x7f83a91f7228>,,,,,,


In [5]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55595 entries, 0 to 55594
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   pubmed_id           55595 non-null  object
 1   title               55499 non-null  object
 2   abstract            32080 non-null  object
 3   keywords            55577 non-null  object
 4   journal             55577 non-null  object
 5   publication_date    55595 non-null  object
 6   authors             55595 non-null  object
 7   methods             348 non-null    object
 8   conclusions         3077 non-null   object
 9   results             6808 non-null   object
 10  copyrights          18723 non-null  object
 11  doi                 54572 non-null  object
 12  xml                 55577 non-null  object
 13  isbn                3 non-null      object
 14  language            18 non-null     object
 15  publication_type    18 non-null     object
 16  sections            18

## 2. Cleaning and preprocessing the dataframe

In [6]:
# Defining the function "clean_text" to clean and preprocess any text.
def clean_text(text):
    if text:
        return re.sub(r"\s+", " ", re.sub(r"\-{2,}", "-", re.sub("[0-9]*\u200b", "", str(text))).replace(
            "\u2009", " ").replace("\xa0", " ").replace("\n", " ").replace("\ufeff", "").replace(
            "\u202f", "").replace("\u2028", " ").replace("\u200f", "").replace("\u200e", "").replace(
            "()", "").replace("[]", "").replace("\\'", "\'")).strip()
    else:
        return None

In [7]:
# Defining the "None" value for the "NaN" values.
df_data.replace({np.nan: None}, inplace=True)

In [8]:
# Removing unnecessary columns.
columns_drop = ["methods", "conclusions", "results", "copyrights", "xml", "isbn",
                "language", "publication_type", "sections", "publisher", "publisher_location"]
df_data.drop(axis=1, columns=columns_drop, inplace=True)

In [9]:
# Getting the PubMed ID for each paper.
df_data.pubmed_id = df_data.pubmed_id.apply(lambda x: x.split()[0].strip())

In [10]:
# Normalizing the features "abstract", "title" and "journal".
df_data.abstract = df_data.abstract.apply(
    lambda x: clean_text(LatexNodes2Text().latex_to_text(
        re.sub(r"\s+", " ", re.sub("%", "\\%", x)))) if x and len(x) > 0 else None)
df_data.title = df_data.title.apply(lambda x: clean_text(x) if x and len(x) > 0 else None)
df_data.journal = df_data.journal.apply(clean_text)

In [11]:
# Setting the feature "keywords" as a tuple of keywords and normalizing the keywords for each paper.
df_data.keywords.loc[df_data.keywords.notnull()] = [
    tuple([clean_text(keyword) for keyword in eval(keywords)]) if eval(keywords) else None
    for keywords in df_data.keywords[df_data.keywords.notnull()]]

In [12]:
# Checking there are invalid keywords.
df_data[df_data.keywords.notnull()].keywords[
    [np.any([item == None for item in keywords])
    for keywords in df_data[df_data.keywords.notnull()].keywords]].size

24

In [13]:
# Removing the invalid keywords.
df_data.keywords.loc[df_data.keywords.notnull()] = [tuple([item for item in keywords if item])
    for keywords in df_data.keywords[df_data.keywords.notnull()]]
df_data.keywords.loc[df_data.keywords.notnull()] = df_data.keywords.loc[
    df_data.keywords.notnull()].apply(lambda x: x if len(x) > 0 else None)

In [14]:
# Correcting the feature "authors".
for idx, authors in enumerate(df_data.authors):
    if not eval(authors):
        df_data.authors[idx] = None
    else:
        list_authors = []
        for author in eval(authors):
            auth = {}
            if author["firstname"] and author["lastname"]:
                auth["name"] = clean_text("{} {}".format(author["firstname"], author["lastname"]))
            elif author["firstname"] and not author["lastname"]:
                auth["name"] = clean_text(author["firstname"])
            elif not author["firstname"] and author["lastname"]:
                auth["name"] = clean_text(author["lastname"])
            else:
                auth["name"] = None

            auth["id"] = str(hash("{} - {}".format(auth["name"], "PubMed"))) if auth["name"] else None
            auth["affiliation"] = clean_text(author["affiliation"]) if "affiliation" in author else None
            auth["affil_id"] = str(hash("{} - {}".format(auth["affiliation"], "PubMed"))) \
                if auth["affiliation"] else None
            auth["country"] = None

            if auth["affiliation"] or auth["name"]:
                list_authors.append(auth)

        df_data.authors[idx] = tuple(list_authors) if len(list_authors) > 0 else None

In [15]:
# Renaming the features "authors", "keywords" and "journal".
df_data.rename(columns={"authors": "author_affil", "keywords": "auth_keywords",
                        "journal": "vehicle_name"}, inplace=True)

In [16]:
# Removing the duplicated records by features "title" and "doi".
df_data = pd.concat([df_data[df_data.title.isnull() | df_data.doi.isnull()],
    df_data[df_data.title.notnull() & df_data.doi.notnull()].sort_values(
        by=["title", "publication_date"]).drop_duplicates(["title", "doi"], "last")], ignore_index=True)

In [17]:
# Checking the result.
df_data.head()

Unnamed: 0,pubmed_id,title,abstract,auth_keywords,vehicle_name,publication_date,author_affil,doi
0,32966253,Post-COVID-19 management guidelines for orthod...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'Jae Hyun Park', 'id': '491700415152...",
1,32966252,Orthodontics in the COVID-19 Era: The way forw...,,,Journal of clinical orthodontics : JCO,2020-09-24,"({'name': 'M Srirengalakshmi', 'id': '26238586...",
2,32964105,Unintended consequences of COVID-19: Opportuni...,,"(coronavirus infections, mechanical ventilator...",Canadian journal of respiratory therapy : CJRT...,2020-09-24,"({'name': 'Patricia McClurg', 'id': '-28749460...",
3,32965930,StatPearls,Amidst the coronavirus 2019-nCoV (COVID-19) pa...,,,2020,"({'name': 'Onyinyechukwu Okorji', 'id': '65138...",
4,32963099,,The membrane-anchored spike (S) protein of sev...,"(S2 fusion peptide-containing domain, coronavi...",mSystems,2020-09-24,"({'name': 'Nishant Shekhar', 'id': '7707225331...",10.1128/mSystems.00382-20


In [18]:
# Visualizing the information of dataset.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55503 entries, 0 to 55502
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pubmed_id         55503 non-null  object
 1   title             55407 non-null  object
 2   abstract          32060 non-null  object
 3   auth_keywords     28215 non-null  object
 4   vehicle_name      55485 non-null  object
 5   publication_date  55503 non-null  object
 6   author_affil      54983 non-null  object
 7   doi               54480 non-null  object
dtypes: object(8)
memory usage: 3.4+ MB


## 3. Saving the dataframe

In [19]:
# Exporting the data to CSV file.
df_data.to_csv("../../data/prepared/pubmed_covid_19.csv", index=False, quoting=csv.QUOTE_ALL)