# Cleaning and Preprocessing the final dataset of publications related to COVID-19

In [None]:
# Importing the required libraries.
import csv, pandas as pd, numpy as np

## 1. Generating the dataframe from the raw data

In [None]:
# Loading the raw CSV file into a dataframe; the first line holds the column names
# and all the columns are read with the "object" dtype.
df_data = pd.read_csv(
    "../../data/raw/final_raw.csv",
    dtype=object,
    header=0,
)

In [None]:
# Previewing the first rows of the dataframe that was just loaded.
df_data.head()

In [None]:
# Printing the summary of the dataset (columns, non-null counts and dtypes).
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Function to normalize the affiliations of the authors.
def normalize_affiliations(row):
    """Reconcile the features "affiliations" and "author_affil" of a record.

    When both features are filled, every entry of "author_affil" whose
    "affil_id" (a comma-separated string of ids) contains the id of a listed
    affiliation is updated in place with the id, name and country of that
    affiliation. When "affiliations" is empty, it is rebuilt from the distinct
    (affil_id, affiliation, country) triples found in "author_affil", kept in
    the order in which they first appear.

    Args:
        row: Record exposing the attributes "affiliations" and "author_affil"
            (e.g. the Series received through ``DataFrame.apply(axis=1)``).
            Each attribute is either falsy or a sequence of dicts.

    Returns:
        The same ``row`` object, modified in place.
    """
    if row.affiliations and row.author_affil:
        # Getting missing values within the feature "author_affil" from "affiliations" one.
        for author in row.author_affil:
            for affil in row.affiliations:
                if not (affil["id"] and author["affil_id"]):
                    continue
                # The ids are split again on every pass on purpose: once a match
                # rewrites "affil_id", only that exact id can match afterwards.
                author_ids = [af.strip() for af in author["affil_id"].split(",")]
                if affil["id"] not in author_ids:
                    continue
                author["affil_id"] = affil["id"]
                author["affiliation"] = affil["affiliation"]
                # The country of the affiliation record always prevails.
                if affil["country"] != author["country"]:
                    author["country"] = affil["country"]
    elif row.author_affil:
        # Getting missing values within the feature "affiliations" from "author_affil" one.
        # "dict.fromkeys" removes the duplicates while keeping the first-seen order,
        # so the result does not change between runs as it would with a set.
        affils = dict.fromkeys(
            (author["affil_id"], author["affiliation"], author["country"])
            for author in row.author_affil
            if author["affil_id"] or author["affiliation"]
        )
        if affils:
            keys = ["id", "affiliation", "country"]
            row.affiliations = tuple(dict(zip(keys, affil)) for affil in affils)
        else:
            row.affiliations = None
    return row

In [None]:
# Function to normalize the name of the authors.
def normalize_name_authors(row):
    """Align the names of the feature "authors" with the feature "author_affil".

    When both features are filled, each entry of "authors" receives the name
    of the "author_affil" entries that share its id (the last matching entry
    prevails). When only "author_affil" is filled, "authors" is rebuilt from
    its distinct (id, name) pairs that have a name, kept in the order in which
    they first appear; if no pair has a name, "authors" is set to None.

    Args:
        row: Record exposing the attributes "authors" and "author_affil"
            (e.g. the Series received through ``DataFrame.apply(axis=1)``).
            Each attribute is either falsy or a sequence of dicts.

    Returns:
        The same ``row`` object, modified in place.
    """
    if row.authors and row.author_affil:
        for item in row.authors:
            for author in row.author_affil:
                if item["id"] == author["id"]:
                    item["name"] = author["name"]
    elif row.author_affil:
        # "dict.fromkeys" removes the duplicates while keeping the first-seen order;
        # a set would return the authors in an arbitrary order.
        authors = dict.fromkeys(
            (author["id"], author["name"])
            for author in row.author_affil
            if author["name"]
        )
        if authors:
            keys = ["id", "name"]
            row.authors = tuple(dict(zip(keys, author)) for author in authors)
        else:
            row.authors = None

    return row

In [None]:
# Function to normalize the authors and their affiliations.
def normalize_features(row):
    """Make "authors", "affiliations" and "author_affil" agree with each other.

    * "authors" becomes the union of its own (id, name) pairs and the pairs of
      "author_affil" that have both an id and a name.
    * When "author_affil" is empty but "authors" is not, "author_affil" is
      created from the authors with empty affiliation fields.
    * When "affiliations" is filled, it becomes the union of its own
      (id, affiliation, country) triples and those of "author_affil" that
      have an affiliation id or name.

    Duplicates produced by a union are removed while keeping the order in
    which the items first appear.

    Args:
        row: Record exposing the attributes "authors", "affiliations" and
            "author_affil" (e.g. the Series received through
            ``DataFrame.apply(axis=1)``). Each attribute is either falsy or a
            sequence of dicts.

    Returns:
        The same ``row`` object, modified in place.
    """
    fields = {
        "authors": ["id", "name"],
        "affiliations": ["id", "affiliation", "country"],
        "affil": ["affil_id", "affiliation", "country"]
    }

    # Normalizing the authors.
    records = []
    if row.authors:
        records = [tuple(item[f] for f in fields["authors"]) for item in row.authors]
    if row.author_affil:
        from_affil = [tuple(item[f] for f in fields["authors"])
                      for item in row.author_affil
                      if item["id"] and item["name"]]
        # "dict.fromkeys" de-duplicates and keeps the first-seen order (a set would not).
        records = list(dict.fromkeys([*records, *from_affil]))
    elif records:
        row.author_affil = tuple({**dict(zip(fields["authors"], auth)), "affil_id": None,
                                  "affiliation": None, "country": None} for auth in records)

    if records:
        row.authors = tuple(dict(zip(fields["authors"], auth)) for auth in records)

    # Normalizing the affiliations.
    if row.affiliations:
        records = [tuple(item[f] for f in fields["affiliations"])
                   for item in row.affiliations]
        if row.author_affil:
            from_affil = [tuple(item[f] for f in fields["affil"])
                          for item in row.author_affil
                          if item["affil_id"] or item["affiliation"]]
            records = list(dict.fromkeys([*records, *from_affil]))
        row.affiliations = tuple(dict(zip(fields["affiliations"], affil))
                                 for affil in records)
    return row

In [None]:
# Replacing every "NaN" entry with "None", so that the missing values are falsy
# in the checks performed by the next steps.
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [None]:
# Changing the type of features: the serialized collections are turned back into
# Python objects (empty cells become "None") and the publication date is parsed.
# SECURITY NOTE: "eval" runs whatever expression is stored in the CSV cells, so this
# step must only be executed on trusted data. If the cells hold nothing but Python
# literals, "ast.literal_eval" is the safer replacement.
list_features = ["auth_keywords", "index_terms", "affiliations", "subject_areas",
                 "authors", "author_affil", "references"]
df_data.loc[:, list_features] = df_data.loc[:, list_features].apply(
    lambda column: column.apply(lambda value: eval(value) if value else None))
df_data.publication_date = pd.to_datetime(df_data.publication_date)

In [None]:
# Removing articles whose publication year is less than 2019.
# The explicit copy detaches the filtered dataframe from the original one, so the
# in-place assignments made on it afterwards do not act on a slice (which would
# raise the pandas "SettingWithCopyWarning").
df_data = df_data[pd.DatetimeIndex(df_data.publication_date).year >= 2019].copy()

In [None]:
# Filling the missing values of the feature "period" with the "year-month" taken
# from the feature "publication_date".
missing_period = df_data.period.isnull()
df_data.loc[missing_period, "period"] = df_data.loc[missing_period, "publication_date"].apply(
    lambda date: "{}-{}".format(date.year, date.month))

In [None]:
# Defining the "zero" value for the articles without number of citations or references.
# The assignment is done in a single ".loc" call on the dataframe: the chained form
# ("df_data.column.loc[...] = 0") writes to an intermediate object, and pandas does
# not guarantee that such a write reaches the dataframe.
df_data.loc[df_data.citation_num.isnull(), "citation_num"] = 0
df_data.loc[df_data.ref_count.isnull(), "ref_count"] = 0

In [None]:
# Running "normalize_name_authors" on each record, restricted to the features it handles.
name_features = ["authors", "author_affil"]
df_data[name_features] = df_data[name_features].apply(normalize_name_authors, axis=1)

In [None]:
# Counting the records whose feature "authors" is an empty tuple.
# NOTE(review): this assumes pandas compares each cell with "()" element-wise —
# confirm on the pandas version in use.
df_data.authors[df_data.authors == ()].size

In [None]:
# Running "normalize_affiliations" on each record, restricted to the features it handles.
affiliation_features = ["affiliations", "author_affil"]
df_data[affiliation_features] = df_data[affiliation_features].apply(
    normalize_affiliations, axis=1)

In [None]:
# Counting the records whose feature "affiliations" is an empty tuple.
# NOTE(review): this assumes pandas compares each cell with "()" element-wise —
# confirm on the pandas version in use.
df_data.affiliations[df_data.affiliations == ()].size

In [None]:
# Running "normalize_features" on each record, restricted to the features it handles.
normalized_features = ["authors", "affiliations", "author_affil"]
df_data[normalized_features] = df_data[normalized_features].apply(
    normalize_features, axis=1)

In [None]:
# Counting the records whose feature "author_affil" is an empty tuple.
# NOTE(review): this assumes pandas compares each cell with "()" element-wise —
# confirm on the pandas version in use.
df_data.author_affil[df_data.author_affil == ()].size

In [None]:
# Filling the missing values of the feature "id" with the value of the feature
# "pubmed_id", for the records that have one.
only_pubmed_id = df_data.id.isnull() & df_data.pubmed_id.notnull()
df_data.loc[only_pubmed_id, "id"] = df_data.loc[only_pubmed_id, "pubmed_id"]

In [None]:
# Dropping the feature "pubmed_id" from the dataframe.
df_data.drop(columns=["pubmed_id"], inplace=True)

In [None]:
# Replacing the "NaN" entries still present in the dataframe with "None".
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [None]:
# Previewing the first rows of the dataframe after the cleaning steps.
df_data.head()

In [None]:
# Printing the summary of the dataset (columns, non-null counts and dtypes) after the cleaning steps.
df_data.info()

## 3. Saving the dataframe

In [None]:
# Writing the prepared dataframe to a CSV file, without the index and with every field quoted.
df_data.to_csv(
    "../../data/prepared/final_covid_19.csv",
    quoting=csv.QUOTE_ALL,
    index=False,
)