# Cleaning and preprocessing the Scopus publications related to Tiny GenAI

In [None]:
# Uncomment to install the library.
# %pip install pylatexenc

In [None]:
# Importing the required libraries.
import re, csv, pandas as pd, numpy as np
# from pylatexenc.latex2text import LatexNodes2Text

## 1. Generating the dataframe from the raw data

In [None]:
# Load the raw Scopus export; every column is read with dtype=object.
df_data = pd.read_csv(
    filepath_or_buffer="../data/raw/scopus_raw.csv",
    dtype=object,
    header=0,
)

In [None]:
# Preview the first five rows of the raw dataframe.
df_data.head(n=5)

In [None]:
# Print the summary that DataFrame.info() produces for the raw dataframe.
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Keep only the records that have both an "id" and an "eid".
# The trailing .copy() makes the filtered result an independent dataframe, so
# the in-place edits performed by the following cells do not act on a slice of
# the raw frame (which makes pandas raise SettingWithCopyWarning).
# Bracket access is used because "id" is a generic attribute-like name.
df_data = df_data.loc[df_data["id"].notnull() & df_data["eid"].notnull()].copy()

In [None]:
# Replace every NaN in the dataframe with None, in place.
# The {old: new} mapping form is what lets None be used as the replacement value.
df_data.replace({np.nan: None}, inplace=True)

In [None]:
# Default the number of citations and references to zero where it is missing.
# The assignment goes through a single df.loc[rows, column] call: the chained
# form df.column.loc[rows] = value assigns into an intermediate Series, which
# pandas warns about and which leaves the dataframe unchanged when
# Copy-on-Write is enabled.
for count_column in ("citation_num", "ref_count"):
    df_data.loc[df_data[count_column].isnull(), count_column] = 0

In [None]:
# Normalizing the feature "abstract": records without an abstract fall back to
# their "description" when one is available.
# The row mask is computed once and the write is a single df.loc[rows, column]
# assignment, so it modifies df_data itself rather than an intermediate Series.
missing_abstract = df_data["abstract"].isnull() & df_data["description"].notnull()
df_data.loc[missing_abstract, "abstract"] = df_data.loc[missing_abstract, "description"]

In [None]:
# Normalizing the feature "vehicle_name": where a record has both a
# "conference_name" and a "vehicle_name", the conference name replaces the
# vehicle name.
# NOTE(review): the mask requires vehicle_name to be NOT null, so records with
# a conference name but no vehicle name are left untouched - confirm that this
# is intended.
# The write is a single df.loc[rows, column] assignment so that it modifies
# df_data itself rather than an intermediate Series.
conference_rows = df_data["conference_name"].notnull() & df_data["vehicle_name"].notnull()
df_data.loc[conference_rows, "vehicle_name"] = df_data.loc[conference_rows, "conference_name"]

In [None]:
# Drop the columns that are not used from this point on.
columns_drop = [
    "eid",
    "pii",
    "description",
    "isbn",
    "conf_location",
    "conference_name",
    "vehicle_address",
    "title_edition",
    "pubmed_id",
]
df_data.drop(columns=columns_drop, inplace=True)

In [None]:
# Changing the type of some features.

# Numeric counters -> float32. The columns are replaced as a whole
# (df[cols] = ...): assigning through df.loc[:, cols] writes the converted
# values into the existing object-dtype columns in pandas 2.x, so the dtype
# would not actually change.
count_columns = ["citation_num", "ref_count"]
df_data[count_columns] = df_data[count_columns].astype(np.float32)

# Features stored in the CSV as Python literals -> Python objects.
# NOTE(review): eval() executes whatever expression a cell contains. This is
# acceptable only while the raw CSV is a trusted, locally generated file;
# ast.literal_eval is the safe alternative if the cells are plain literals.
literal_columns = [
    "auth_keywords",
    "index_terms",
    "affiliations",
    "subject_areas",
    "authors",
    "author_affil",
    "references",
]
for literal_column in literal_columns:
    has_value = df_data[literal_column].notnull()
    # Single df.loc[rows, column] assignment (not df.column.loc[rows] = ...),
    # so the parsed objects are written to df_data itself.
    df_data.loc[has_value, literal_column] = df_data.loc[has_value, literal_column].apply(eval)

# Publication date -> datetime.
df_data["publication_date"] = pd.to_datetime(df_data["publication_date"])

In [None]:
# Derive "period" (formatted as year-month) and "year" from "publication_date".
# A feature that already exists in the dataframe is left as it is.
derived_features = {
    "period": lambda stamp: "{}-{}".format(stamp.year, stamp.month),
    "year": lambda stamp: stamp.year,
}
for feature_name, build_feature in derived_features.items():
    if feature_name in df_data:
        continue
    df_data[feature_name] = df_data["publication_date"].apply(build_feature)

In [None]:
# Normalizing the items contained in the features "auth_keywords",
# "index_terms" and "subject_areas".
def _normalize_terms(terms):
    """Return the stripped, lower-cased terms as a tuple without duplicates.

    Duplicates are removed with dict.fromkeys, which keeps the order of first
    appearance; going through a set would yield a different order on every run
    because string hashing is randomized per process.
    None items are passed through unchanged instead of raising AttributeError;
    the cells that check and remove invalid values deal with them afterwards.
    """
    cleaned = (term.strip().lower() if term is not None else None for term in terms)
    return tuple(dict.fromkeys(cleaned))


for term_column in ("auth_keywords", "index_terms"):
    has_terms = df_data[term_column].notnull()
    df_data.loc[has_terms, term_column] = df_data.loc[has_terms, term_column].apply(_normalize_terms)

# Each subject area is a mapping; only its "area" entry is kept.
has_areas = df_data["subject_areas"].notnull()
df_data.loc[has_areas, "subject_areas"] = df_data.loc[has_areas, "subject_areas"].apply(
    lambda areas: _normalize_terms(area["area"] for area in areas))

In [None]:
# For "auth_keywords", "index_terms" and "subject_areas", print how many rows
# hold at least one invalid item (None, or the text "none" in any casing).
for column in ["auth_keywords", "index_terms", "subject_areas"]:
    present = df_data.loc[df_data[column].notnull(), column]
    count = sum(
        1 for items in present
        if any(item is None or item.lower() == "none" for item in items))
    print("{}: {}".format(column, count))

In [None]:
# Removing the invalid values in the features "auth_keywords", "index_terms"
# and "subject_areas": falsy items (None, empty strings) are dropped and a row
# left without any item becomes None.
# The new values are assigned as a Series (index-aligned) rather than as a raw
# list of tuples, which pandas/numpy may try to interpret as a 2-D array when
# the tuples happen to have the same length.
# NOTE(review): the literal string "none", which the preceding check counts as
# invalid, is truthy and is therefore NOT removed here - confirm the intent.
for column in ["auth_keywords", "index_terms", "subject_areas"]:
    has_items = df_data[column].notnull()
    df_data.loc[has_items, column] = df_data.loc[has_items, column].apply(
        lambda items: tuple(item for item in items if item) or None)

In [None]:
# In "authors", "affiliations" and "author_affil", turn empty collections into None.
def _none_if_empty(value):
    """Return None for a zero-length value, otherwise the value itself."""
    return None if len(value) == 0 else value


for column in ["authors", "affiliations", "author_affil"]:
    filled_rows = df_data[column].notnull()
    df_data.loc[filled_rows, column] = df_data.loc[filled_rows, column].apply(_none_if_empty)

In [None]:
# Creating the affiliations' and authors' IDs for those that do not have an ID.
import hashlib


def _synthetic_id(label):
    """Return a reproducible numeric ID (as a string) derived from `label`.

    The built-in hash() is salted per interpreter process for strings, so IDs
    built with it change every time the notebook is run. A SHA-256 digest of
    the same "<label> - Scopus" text is stable across runs and machines.
    """
    text = "{} - {}".format(label, "Scopus")
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    # 16 hex digits (64 bits) are kept, rendered as a decimal string.
    return str(int(digest[:16], 16))


def _with_ids(records):
    """Return the author/affiliation records with missing IDs filled in.

    For the author and for the affiliation alike: without a name the ID is
    None; an existing ID is kept; otherwise a synthetic ID is derived from
    the name.
    """
    completed = []
    for item in records:
        name = item["name"]
        affiliation = item["affiliation"]

        if name is None:
            author_id = None
        elif item["id"] is not None:
            author_id = item["id"]
        else:
            author_id = _synthetic_id(name)

        if affiliation is None:
            affil_id = None
        elif item["affil_id"] is not None:
            affil_id = item["affil_id"]
        else:
            affil_id = _synthetic_id(affiliation)

        completed.append({
            "id": author_id,
            "name": name,
            "affil_id": affil_id,
            "affiliation": affiliation,
            "country": item["country"],
        })
    return tuple(completed)


has_author_affil = df_data["author_affil"].notnull()
df_data.loc[has_author_affil, "author_affil"] = df_data.loc[
    has_author_affil, "author_affil"].apply(_with_ids)

In [None]:
# Removing duplicates within the list of affiliations and authors.
author_affil_keys = ("id", "name", "affil_id", "affiliation", "country")


def _unique_author_affil(records):
    """Return the records without exact duplicates, in their original order.

    Two records are duplicates when all five fields are equal. dict.fromkeys
    keeps the first occurrence of each one and preserves the input order;
    deduplicating through a set would shuffle the authors differently on
    every run.
    """
    unique_rows = dict.fromkeys(
        tuple(record[key] for key in author_affil_keys) for record in records)
    return tuple(dict(zip(author_affil_keys, row)) for row in unique_rows)


# The result is assigned as a Series (index-aligned) rather than as a raw list
# of tuples, which pandas/numpy may try to interpret as a 2-D array.
has_author_affil = df_data["author_affil"].notnull()
df_data.loc[has_author_affil, "author_affil"] = df_data.loc[
    has_author_affil, "author_affil"].apply(_unique_author_affil)

In [None]:
# Removing the duplicated records by feature "id", keeping the earliest one.
# The sort uses "publication_date" rather than "period": "period" is a text
# value whose month is not zero-padded, so it sorts lexicographically
# ("2023-10" comes before "2023-2") and does not reflect chronological order.
df_data = df_data.sort_values(by=["id", "publication_date"]).drop_duplicates("id", keep="first")

In [None]:
# Drop the records duplicated on ("title", "doi").
# Rows lacking a title or a DOI cannot be compared and are kept untouched.
has_title_and_doi = df_data["title"].notnull() & df_data["doi"].notnull()
rows_without_key = df_data[~has_title_and_doi]

# Within each (title, doi) pair keep the last row of the ordering below.
rows_with_key = df_data[has_title_and_doi].sort_values(
    by=["title", "citation_num", "publication_date"])
rows_with_key = rows_with_key.drop_duplicates(subset=["title", "doi"], keep="last")

df_data = pd.concat([rows_without_key, rows_with_key], ignore_index=True)

In [None]:
# Preview the first five rows of the cleaned dataframe.
df_data.head(n=5)

In [None]:
# Print the summary that DataFrame.info() produces for the cleaned dataframe.
df_data.info()

## 3. Saving the dataframe

In [None]:
# Write the prepared dataframe to CSV, quoting every field and omitting the index.
df_data.to_csv(
    path_or_buf="../data/prepared/scopus_tiny_genai.csv",
    quoting=csv.QUOTE_ALL,
    index=False,
)