# Cleaning and Preprocessing the final dataset of publications related to COVID-19

In [1]:
# Importing the required libraries.
import ast
import csv

import numpy as np
import pandas as pd

## 1. Generating the dataframe from the raw data

In [2]:
# Loading the raw publications into a dataframe; dtype=object asks pandas not to infer
# column types, so every value is kept as it was read.
df_data = pd.read_csv(
    "../../data/raw/final_raw.csv",
    header=0,
    dtype=object,
)

In [3]:
# Previewing the first rows of the freshly loaded dataframe.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references,period
0,2103.03219,"('General Finance',)",the impact of covid-19 on stock market volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11,arXiv,,,,...,,,,,,,,,,
1,2103.03055,"('Image and Video Processing', 'Computer Visio...",self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04,arXiv,,,,...,,,,,,,,,,
2,2103.03038,"('Computer Vision and Pattern Recognition',)",mobile touchless fingerprint recognition: impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04,arXiv,,,,...,,,,,,,,,,
3,2103.02961,"('Image and Video Processing', 'Computer Visio...",probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04,arXiv,,,,...,,,,,,,,,,
4,2103.02917,"('Computers and Society', 'Computation and Lan...",mp twitter engagement and abuse post-first cov...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04,arXiv,,,,...,,,,,,,,,,


In [4]:
# Summarising the raw dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140399 entries, 0 to 140398
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                99887 non-null   object
 1   subject_areas     101557 non-null  object
 2   title             140317 non-null  object
 3   authors           86143 non-null   object
 4   abstract          97706 non-null   object
 5   publication_date  140399 non-null  object
 6   data_source       140399 non-null  object
 7   doi               133707 non-null  object
 8   author_affil      135778 non-null  object
 9   pubmed_id         109227 non-null  object
 10  auth_keywords     71295 non-null   object
 11  vehicle_name      124965 non-null  object
 12  citation_num      84526 non-null   object
 13  language          84077 non-null   object
 14  production_type   84526 non-null   object
 15  source_type       84526 non-null   object
 16  index_terms       50712 non-null   obj

## 2. Cleaning and preprocessing the dataframe

In [5]:
# Function to normalize the affiliations of the authors.
def normalize_affiliations(row):
    """Reconcile the affiliation data held in "author_affil" and "affiliations".

    Parameters
    ----------
    row : object exposing ``affiliations`` and ``author_affil`` attributes
        In this notebook, a row passed by ``DataFrame.apply(..., axis=1)``.
        ``affiliations`` is an iterable of dicts with the keys "id", "affiliation"
        and "country"; ``author_affil`` is an iterable of dicts with the keys
        "affil_id", "affiliation" and "country". Either may be None or empty.

    Returns
    -------
    The same ``row`` object, updated in place:

    * when both features are present, every author whose comma-separated
      "affil_id" contains the id of a listed affiliation is rewritten to carry
      that single id together with the affiliation's name and country;
    * when "affiliations" is missing, it is rebuilt from the distinct
      (affil_id, affiliation, country) triples of "author_affil", in first-seen
      order, or set to None when no author carries an affiliation.
    """
    if row.affiliations and row.author_affil:
        # Getting missing values within the feature "author_affil" from "affiliations" one.
        for author in row.author_affil:
            for affil in row.affiliations:
                if not (affil["id"] and author["affil_id"]):
                    continue
                # "affil_id" may hold several comma-separated ids. It is re-read on every
                # iteration on purpose: once a match rewrites it below, only that id can
                # match again for this author.
                author_affil_ids = [af.strip() for af in author["affil_id"].split(",")]
                if affil["id"] in author_affil_ids:
                    author["affil_id"] = affil["id"]
                    author["affiliation"] = affil["affiliation"]
                    # The affiliation record always wins for the country, even when it
                    # is empty, so both features stay consistent with each other.
                    author["country"] = affil["country"]
    elif row.author_affil:
        # Getting missing values within the feature "affiliations" from "author_affil" one.
        # dict.fromkeys removes duplicates while keeping first-seen order; a set would
        # make the order of the result change from one run to the next.
        distinct_affils = dict.fromkeys(
            (author["affil_id"], author["affiliation"], author["country"])
            for author in row.author_affil
            if author["affil_id"] or author["affiliation"])
        if distinct_affils:
            keys = ["id", "affiliation", "country"]
            row.affiliations = tuple(dict(zip(keys, affil)) for affil in distinct_affils)
        else:
            row.affiliations = None
    return row

In [6]:
# Function to normalize the name of the authors.
def normalize_name_authors(row):
    """Align the author names of "authors" with those of "author_affil".

    Parameters
    ----------
    row : object exposing ``authors`` and ``author_affil`` attributes
        In this notebook, a row passed by ``DataFrame.apply(..., axis=1)``.
        Both attributes are iterables of dicts carrying at least the keys "id"
        and "name"; either may be None or empty.

    Returns
    -------
    The same ``row`` object, updated in place:

    * when both features are present, each entry of "authors" whose "id" also
      appears in "author_affil" takes the name stored there (the last matching
      entry wins);
    * when only "author_affil" is present, "authors" is rebuilt from its distinct
      (id, name) pairs that have a name, in first-seen order, or set to None
      when no entry has a name.
    """
    if row.authors and row.author_affil:
        # One lookup table instead of rescanning "author_affil" for every author.
        # Later entries overwrite earlier ones, so the last match wins.
        names_by_id = {author["id"]: author["name"] for author in row.author_affil}
        for item in row.authors:
            if item["id"] in names_by_id:
                item["name"] = names_by_id[item["id"]]
    elif row.author_affil:
        # dict.fromkeys removes duplicates while keeping the original author order;
        # a set would return the authors in an order that changes between runs.
        distinct_authors = dict.fromkeys(
            (author["id"], author["name"])
            for author in row.author_affil
            if author["name"])
        if distinct_authors:
            keys = ["id", "name"]
            row.authors = tuple(dict(zip(keys, author)) for author in distinct_authors)
        else:
            row.authors = None

    return row

In [7]:
# Function to normalize the authors and their affiliations.
def normalize_features(row):
    """Make "authors", "affiliations" and "author_affil" agree with each other.

    Parameters
    ----------
    row : object exposing ``authors``, ``affiliations`` and ``author_affil``
        In this notebook, a row passed by ``DataFrame.apply(..., axis=1)``.
        ``authors`` holds dicts with the keys "id" and "name"; ``affiliations``
        holds dicts with the keys "id", "affiliation" and "country";
        ``author_affil`` holds dicts with the keys "id", "name", "affil_id",
        "affiliation" and "country". Any of them may be None or empty.

    Returns
    -------
    The same ``row`` object, updated in place:

    * "authors" becomes the listed authors followed by any author that only
      appears in "author_affil" (with both id and name), without duplicates;
    * when "author_affil" is missing it is created from "authors", with empty
      affiliation fields;
    * "affiliations", when present, is extended with the affiliations that only
      appear in "author_affil", without duplicates.

    Duplicates are removed in first-seen order, so the original author and
    affiliation order is preserved and the result is the same on every run.
    """
    fields = {
        "authors": ["id", "name"],
        "affiliations": ["id", "affiliation", "country"],
        "affil": ["affil_id", "affiliation", "country"]
    }
    # Normalizing the authors.
    records = [tuple(item[f] for f in fields["authors"]) for item in row.authors] \
        if row.authors else []
    if row.author_affil:
        from_author_affil = [tuple(item[c] for c in fields["authors"])
                             for item in row.author_affil
                             if item["id"] and item["name"]]
        # dict.fromkeys de-duplicates like a set but keeps insertion order.
        records = list(dict.fromkeys([*records, *from_author_affil]))
    elif records:
        # No "author_affil" at all: derive it from the authors, without affiliations.
        row.author_affil = tuple({**dict(zip(fields["authors"], auth)), "affil_id": None,
                                  "affiliation": None, "country": None} for auth in records)

    if records:
        row.authors = tuple(dict(zip(fields["authors"], auth)) for auth in records)

    # Normalizing the affiliations.
    if row.affiliations:
        records = [tuple(item[c] for c in fields["affiliations"])
                   for item in row.affiliations]
        if row.author_affil:
            from_author_affil = [tuple(item[c] for c in fields["affil"])
                                 for item in row.author_affil
                                 if item["affil_id"] or item["affiliation"]]
            records = list(dict.fromkeys([*records, *from_author_affil]))
        row.affiliations = tuple(dict(zip(fields["affiliations"], affil))
                                 for affil in records)
    return row

In [8]:
# Converting every missing value (NaN) of the dataframe into None.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [9]:
# Changing the type of features.
# These columns hold Python literals (tuples of strings / dicts) serialized as text.
# ast.literal_eval parses literals only, so a crafted cell in the CSV cannot execute
# code the way eval() could. NOTE(review): assumes every stored value is a plain
# literal - confirm against the raw data.
literal_columns = ["auth_keywords", "index_terms", "affiliations", "subject_areas",
                   "authors", "author_affil", "references"]
for column in literal_columns:
    df_data[column] = df_data[column].apply(
        lambda value: ast.literal_eval(value) if value else None)
df_data.publication_date = pd.to_datetime(df_data.publication_date)

In [10]:
# Removing articles whose publication year is less than 2019.
# The explicit copy makes the filtered dataframe independent of the original one, so the
# updates applied to it in the following cells do not raise SettingWithCopyWarning.
df_data = df_data[pd.DatetimeIndex(df_data.publication_date).year >= 2019].copy()

In [11]:
# Defining the "zero" value for the articles without numbers of citation and references.
# Rows and column are selected in a single .loc call: the chained form
# (df.column.loc[mask] = 0) assigns through an intermediate Series, which pandas does
# not guarantee to write back to the dataframe.
df_data.loc[df_data.citation_num.isnull(), "citation_num"] = 0
df_data.loc[df_data.ref_count.isnull(), "ref_count"] = 0

In [12]:
# Running "normalize_name_authors" row by row over the two author-related features.
author_name_columns = ["authors", "author_affil"]
df_data[author_name_columns] = df_data[author_name_columns].apply(
    normalize_name_authors, axis=1)

In [13]:
# Counting the rows whose "authors" value is an empty tuple.
empty_authors_mask = df_data.authors == ()
df_data.authors[empty_authors_mask].size

0

In [14]:
# Running "normalize_affiliations" row by row over the two affiliation-related features.
affiliation_columns = ["affiliations", "author_affil"]
df_data[affiliation_columns] = df_data[affiliation_columns].apply(
    normalize_affiliations, axis=1)

In [15]:
# Counting the rows whose "affiliations" value is an empty tuple.
empty_affiliations_mask = df_data.affiliations == ()
df_data.affiliations[empty_affiliations_mask].size

0

In [16]:
# Running "normalize_features" row by row over the author and affiliation features.
feature_columns = ["authors", "affiliations", "author_affil"]
df_data[feature_columns] = df_data[feature_columns].apply(normalize_features, axis=1)

In [17]:
# Counting the rows whose "author_affil" value is an empty tuple.
empty_author_affil_mask = df_data.author_affil == ()
df_data.author_affil[empty_author_affil_mask].size

0

In [18]:
# Normalizing the feature "id": records without an "id" take their "pubmed_id" when
# one is available.
needs_pubmed_id = df_data.pubmed_id.notnull() & df_data.id.isnull()
df_data.loc[needs_pubmed_id, "id"] = df_data.pubmed_id[needs_pubmed_id]

In [19]:
# Converting any missing value (NaN) left in the dataframe into None.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [20]:
# Previewing the first rows of the dataframe after the cleaning steps above.
df_data.head()

Unnamed: 0,id,subject_areas,title,authors,abstract,publication_date,data_source,doi,author_affil,pubmed_id,...,language,production_type,source_type,index_terms,issn,publisher,affiliations,ref_count,references,period
0,2103.03219,"(General Finance,)",the impact of covid-19 on stock market volatil...,"({'id': '-8736464535247814394', 'name': 'Ateeb...",This paper examines the impact of coronavirus ...,2021-02-11,arXiv,,"({'id': '-8736464535247814394', 'name': 'Ateeb...",,...,,,,,,,,0,,
1,2103.03055,"(Image and Video Processing, Computer Vision a...",self-supervised deep convolutional neural netw...,"({'id': '1387464675565637337', 'name': 'Matej ...","Chest radiography is a relatively cheap, widel...",2021-03-04,arXiv,,"({'id': '1387464675565637337', 'name': 'Matej ...",,...,,,,,,,,0,,
2,2103.03038,"(Computer Vision and Pattern Recognition,)",mobile touchless fingerprint recognition: impl...,"({'id': '1917885517247673923', 'name': 'Jannis...",This work presents an automated touchless fing...,2021-03-04,arXiv,,"({'id': '1917885517247673923', 'name': 'Jannis...",,...,,,,,,,,0,,
3,2103.02961,"(Image and Video Processing, Computer Vision a...",probabilistic combination of eigenlungs-based ...,"({'id': '-3330601887987387557', 'name': 'Juan ...",The outbreak of the COVID-19 (Coronavirus dise...,2021-03-04,arXiv,,"({'id': '-3330601887987387557', 'name': 'Juan ...",,...,,,,,,,,0,,
4,2103.02917,"(Computers and Society, Computation and Language)",mp twitter engagement and abuse post-first cov...,"({'id': '-6377212943789062063', 'name': 'Traci...",The UK has had a volatile political environmen...,2021-03-04,arXiv,,"({'id': '-6377212943789062063', 'name': 'Traci...",,...,,,,,,,,0,,


In [21]:
# Summarising the cleaned dataframe: column names, non-null counts and dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140390 entries, 0 to 140398
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                140390 non-null  object        
 1   subject_areas     101555 non-null  object        
 2   title             140308 non-null  object        
 3   authors           138733 non-null  object        
 4   abstract          97698 non-null   object        
 5   publication_date  140390 non-null  datetime64[ns]
 6   data_source       140390 non-null  object        
 7   doi               133703 non-null  object        
 8   author_affil      138743 non-null  object        
 9   pubmed_id         109220 non-null  object        
 10  auth_keywords     71291 non-null   object        
 11  vehicle_name      124961 non-null  object        
 12  citation_num      140390 non-null  object        
 13  language          84077 non-null   object        
 14  prod

## 3. Saving the dataframe

In [22]:
# Writing the prepared dataframe to a CSV file, without the index column and with
# every field quoted.
df_data.to_csv(
    "../../data/prepared/final_covid_19.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)