# Cleaning and preprocessing the Web of Science (WoS) publications related to Tiny GenAI

In [None]:
# Uncomment to install the library.
# %pip install thefuzz

In [None]:
# Importing the required libraries.
import csv
import hashlib
import re

import numpy as np
import pandas as pd
from thefuzz import fuzz

## 1. Generating the dataframe from the raw data

In [None]:
# Loading the raw CSV file into a dataframe; every column is read as "object" (no type inference).
raw_path = "../data/raw/wos_raw.csv"
df_data = pd.read_csv(raw_path, header=0, dtype=object)

In [None]:
# Previewing the first rows of the dataframe loaded from the raw file.
df_data.head()

In [None]:
# Printing the summary of the dataframe (columns, non-null counts and dtypes).
df_data.info()

## 2. Cleaning and preprocessing the dataframe

In [None]:
# Literal substrings to rewrite in the affiliation fields (applied in this order).
# Built once at definition time instead of on every row.
# Disabled mapping kept for reference: "Univ Rostock": "University of Rostock".
_AFFILIATION_FIXES = {
    "Electronics & Telecommunications Research Institute - Korea (ETRI)": "Electronics & Telecommunications Research Institute",
    "ETRI, Daejeon, South Korea": "Electronics & Telecommunications Research Institute, Daejeon, South Korea",
    "Binzhou Univ": "Shandong University of Aeronautics",
    "Utah System of Higher Education": "University of Utah",
    "NYU, New York, NY USA": "New York University, New York, NY USA",
    "Symbiosis Centre for Information Technology (SCIT)": "Symbiosis International University",
    "Univ Hlth Network": "University of Toronto",
    "University Health Network Toronto": "University of Toronto",
    "Harvard Med Sch": "Harvard University",
    "Harvard Medical School": "Harvard University",
    "MIT, Dept Elect Engn & Comp Sci, Cambridge, MA USA": "Massachusetts Institute of Technology, Dept Elect Engn & Comp Sci, Cambridge, MA USA",
    "Massachusetts Gen Hosp": "Massachusetts General Hospital",
    "University of California System": "University of California San Francisco",
    "Helmholtz Zentrum Munchen": "Helmholtz Association",
    "Helmholtz-Center Munich - German Research Center for Environmental Health": "Helmholtz Association",
}


def fix_affiliations(row):
    """Rewrite known affiliation spellings in ``row.affiliations`` and ``row.author_affil``.

    Every key of ``_AFFILIATION_FIXES`` found as a substring is replaced by its
    value, in dictionary order.

    Args:
        row: Object exposing the attributes ``affiliations`` and ``author_affil``
            (e.g. a dataframe row). Non-string values such as ``None`` are left
            untouched instead of raising ``AttributeError``.

    Returns:
        The same ``row`` object, modified in place.
    """
    for field in ("affiliations", "author_affil"):
        text = getattr(row, field)
        if not isinstance(text, str):
            # Missing value: nothing to rewrite.
            continue
        for old, new in _AFFILIATION_FIXES.items():
            text = text.replace(old, new)
        setattr(row, field, text)
    return row

In [None]:
def _match_author_id(name, id_field):
    """Return the identifier listed for ``name`` in a "Last, First/ID; ..." field, or None."""
    if not isinstance(id_field, str):
        return None
    target = name.lower()
    for entry in id_field.split(";"):
        pieces = entry.strip().split("/")
        if len(pieces) < 2:
            # Entry without a "/<identifier>" part: nothing usable.
            continue
        identifier = pieces[-1].strip()
        if not identifier:
            continue
        name_parts = [part.strip() for part in pieces[0].split(",")]
        as_written = " ".join(name_parts).lower()
        reversed_order = " ".join(name_parts[::-1]).lower()
        # The first entry whose name (in either order) starts with the author name wins.
        if as_written.startswith(target) or reversed_order.startswith(target):
            return identifier
    return None


def _author_fallback_id(name):
    """Build a reproducible signed 64-bit integer id (as a string) from the author name."""
    # The built-in hash() of a str is salted per interpreter process, so it
    # would yield a different id on every run; a SHA-256 digest is stable.
    digest = hashlib.sha256("{} - {}".format(name, "Web of Science").encode("utf-8")).digest()
    return str(int.from_bytes(digest[:8], "big", signed=True))


def normalize_authors(row):
    """Convert the ";"-separated author list of a record into name/id records.

    Each "Last, First" entry becomes "First Last"; an entry without a comma is
    kept as written. The id is looked up in ``row.researcher_ids`` first and,
    when no entry matches there, in ``row.orcids``. When neither matches, a
    reproducible id derived from the name is used.

    Args:
        row: Object exposing the attributes ``authors``, ``researcher_ids`` and
            ``orcids`` (strings or ``None``).

    Returns:
        A tuple of ``{"name": ..., "id": ...}`` dictionaries, one per author;
        an empty tuple when ``row.authors`` is missing.
    """
    if not isinstance(row.authors, str):
        return tuple()
    records = []
    for raw_name in row.authors.split(";"):
        parts = [part.strip() for part in raw_name.split(",")]
        if len(parts) > 1:
            name = f"{parts[1]} {parts[0]}".strip()
        else:
            name = parts[0]
        if not name:
            # Empty segment (e.g. a trailing ";").
            continue
        author_id = _match_author_id(name, row.researcher_ids) or _match_author_id(name, row.orcids)
        if not author_id:
            author_id = _author_fallback_id(name)
        records.append({"name": name, "id": author_id})
    return tuple(records)

In [None]:
def _affiliation_parts(text):
    """Split on ";", lowercase, collapse whitespace; drop blanks and duplicates (first-seen order)."""
    cleaned = (re.sub(r"\s+", " ", part.lower()).strip() for part in text.split(";"))
    return [part for part in dict.fromkeys(cleaned) if part]


def _format_country(country):
    """Title-case a lowercased country, mapping the "usa" / "china" tokens to fixed labels."""
    if country is None:
        return None
    tokens = country.split()
    if "usa" in tokens:
        return "USA"
    if "china" in tokens:
        return "China"
    return country.title()


def _affiliation_id(name):
    """Build a reproducible signed 64-bit integer id (as a string) from the affiliation name."""
    # The built-in hash() of a str is salted per interpreter process, so it
    # would yield a different id on every run; a SHA-256 digest is stable.
    digest = hashlib.sha256("{} - {}".format(name, "Web of Science").encode("utf-8")).digest()
    return str(int.from_bytes(digest[:8], "big", signed=True))


def normalize_affiliations(row):
    """Convert the affiliation fields of a record into id/affiliation/country records.

    ``row.affiliations`` is a ";"-separated list of organisation names and
    ``row.author_affil`` a ";"-separated list of addresses (optionally prefixed
    by "[authors]"), whose first comma-separated part is the organisation and
    whose last part is the country.

    * When both fields differ, each affiliation is paired with the address whose
      organisation scores highest with ``fuzz.partial_token_sort_ratio`` (the
      last one wins on ties). An address is used at most once; an affiliation
      whose best address is already taken gets no country. Addresses left
      unmatched are added as affiliations of their own, unless their
      organisation equals that of an address already used.
    * When both fields are equal, each address yields one record.
    * When only the affiliation names are available, records carry no country.

    Args:
        row: Object exposing the attributes ``affiliations`` and
            ``author_affil`` (strings or ``None``). It is not modified.

    Returns:
        A tuple of ``{"id", "affiliation", "country"}`` dictionaries in
        first-seen order; an empty tuple when both fields are missing.
    """
    affiliations = row.affiliations if isinstance(row.affiliations, str) else None
    addresses = row.author_affil if isinstance(row.author_affil, str) else None
    if affiliations is None:
        affiliations = addresses
    if affiliations is None:
        return tuple()

    pairs = []  # (lowercased affiliation name, lowercased country or None)
    if addresses is None:
        pairs = [(name, None) for name in _affiliation_parts(affiliations)]
    elif affiliations != addresses:
        names = _affiliation_parts(affiliations)
        # Removing the "[author list]" prefixes before splitting the addresses.
        address_list = _affiliation_parts(re.sub(r"\[[^\]]+\]", "", addresses))
        used = []
        for name in names:
            best, best_score = None, -1
            for address in address_list:
                organisation = address.split(",")[0].strip()
                score = fuzz.partial_token_sort_ratio(name, organisation)
                if score >= best_score:
                    best, best_score = address, score
            if best is not None and best not in used:
                used.append(best)
                pairs.append((name, best.split(",")[-1].strip()))
            else:
                pairs.append((name, None))
        for address in address_list:
            if address in used:
                continue
            fields = [field.strip() for field in address.split(",")]
            if not any(fields[0] == other.split(",")[0].strip() for other in used):
                pairs.append((fields[0], fields[-1]))
    else:
        for address in _affiliation_parts(re.sub(r"\[[^\]]+\]", "", affiliations)):
            fields = address.split(",")
            pairs.append((fields[0].strip(), fields[-1].strip()))

    return tuple({"id": _affiliation_id(name),
                  "affiliation": name.title(),
                  "country": _format_country(country)}
                 for name, country in pairs)

In [None]:
# Renaming the columns: lowercase names with underscores instead of spaces and
# without parentheses/commas, followed by the aliases used in this notebook.
normalized_names = []
for original_name in df_data.columns:
    snake_name = original_name.replace(" ", "_").lower()
    normalized_names.append(re.sub("[(),]", "", snake_name))
df_data.columns = normalized_names
cols = {
    "180_day_usage_count": "day_180_usage_count",
    "publication_year": "year",
    "research_areas": "subject_areas",
    "ut_unique_wos_id": "id",
    "article_title": "title",
    "publication_type": "source_type",
    "document_type": "production_type",
    "source_title": "vehicle_name",
    "cited_reference_count": "ref_count",
    "author_keywords": "auth_keywords",
    "addresses": "author_affil",
    "times_cited_wos_core": "citation_num",
}
df_data.rename(columns=cols, inplace=True)

In [None]:
# Replacing every "NaN" value of the dataframe with "None" (in place).
df_data.replace(to_replace={np.nan: None}, inplace=True)

In [None]:
# Defining the "zero" value for the articles without numbers of citation and references.
# The assignment goes through df_data.loc[mask, column]: the chained form
# df_data.column.loc[mask] = ... writes to a temporary object and does not
# update df_data when pandas Copy-on-Write is enabled.
count_columns = ["citation_num", "ref_count"]
for count_column in count_columns:
    df_data.loc[df_data[count_column].isnull(), count_column] = 0
# Replacing the columns (instead of setting values through .loc[:, ...]) so the
# int32 dtype is actually stored.
df_data[count_columns] = df_data[count_columns].astype(np.int32)

In [None]:
# Normalizing the feature "vehicle_name": the conference title, when present, replaces it.
# A single .loc[mask, column] assignment is used because the chained form
# df_data.column.loc[mask] = ... does not update df_data under pandas Copy-on-Write.
has_conference = df_data["conference_title"].notnull()
df_data.loc[has_conference, "vehicle_name"] = df_data.loc[has_conference, "conference_title"]

In [None]:
# Normalizing the feature "publication_date" into a datetime parsed as "<year> <month> <day>".
# Assignments go through df_data.loc[mask, column]: the chained form
# df_data.column.loc[mask] = ... does not update df_data under pandas Copy-on-Write.
has_date = df_data["publication_date"].notnull()
# Prefixing the year when the date does not already contain it.
df_data.loc[has_date, "publication_date"] = df_data.loc[has_date, ["publication_date", "year"]].apply(
    lambda x: f"{x.year} {x.publication_date}".title()
    if x.year not in x.publication_date else x.publication_date.title(), axis=1)
# Records without a date fall back to the publication year alone.
df_data.loc[~has_date, "publication_date"] = df_data.loc[~has_date, "year"]
# Completing partial dates: "<year>" -> "<year> Jan 01" and "<year> <month>" -> "<year> <month> 01".
df_data["publication_date"] = df_data["publication_date"].apply(
    lambda x: f"{x} Jan 01" if len(x.split()) == 1 else f"{x} 01" if len(x.split()) == 2 else x)
df_data["publication_date"] = pd.to_datetime(df_data["publication_date"], format="%Y %b %d")

In [None]:
# Creating the feature "period" ("<year>-<month>") from "publication_date", unless it already exists.
if "period" not in df_data.columns:
    df_data["period"] = df_data["publication_date"].apply(lambda date: f"{date.year}-{date.month}")

In [None]:
# Normalizing the feature "issn": "<issn>;<eissn>" when both exist, otherwise
# whichever one is available (or None when both are missing).
def _merge_issn(record):
    """Combine the "issn" and "eissn" values of one record."""
    if record.issn is None:
        # Covers "eissn only" as well as "both missing" (returns None).
        return record.eissn
    if record.eissn is None:
        return record.issn
    return f"{record.issn};{record.eissn}"


df_data["issn"] = df_data[["issn", "eissn"]].apply(_merge_issn, axis=1)

In [None]:
# Normalizing the feature "auth_keywords": merging it with "keywords_plus" and
# splitting the result into a tuple of unique, lowercased keywords.
def _merge_keywords(record):
    """Join the author keywords and "keywords_plus" of one record, skipping missing values."""
    # Formatting a missing "auth_keywords" into the text would add the literal
    # keyword "none" to the record, so only the values present are joined.
    present = [value for value in (record.auth_keywords, record.keywords_plus) if pd.notna(value)]
    return "; ".join(present) if present else None


df_data["auth_keywords"] = df_data[["auth_keywords", "keywords_plus"]].apply(_merge_keywords, axis=1)
# dict.fromkeys removes duplicates while keeping the first-seen order, so the
# result is the same on every run (the iteration order of a set of str is not).
df_data["auth_keywords"] = df_data["auth_keywords"].apply(
    lambda x: tuple(dict.fromkeys(keyword.strip().lower() for keyword in x.split(";")))
    if isinstance(x, str) else None)

In [None]:
# Normalizing the feature "subject_areas": merging it with "wos_categories" and
# splitting the result into a tuple of unique, lowercased areas.
sa = ["multidisciplinary", "computer science", "manufacturing", "finance", "radiology"]


def _merge_subject_areas(record):
    """Join the lowercased "wos_categories" and "subject_areas" of one record, skipping missing values."""
    present = [value.lower() for value in (record.wos_categories, record.subject_areas) if pd.notna(value)]
    return "; ".join(present) if present else None


def _split_subject_areas(text):
    """Split a ";"-separated list of areas into a tuple of unique names (first-seen order).

    A field of the form "a, ..., z" becomes "z a", unless "a" is listed in
    ``sa``; in that case every comma-separated part is kept as its own area.
    """
    areas = []
    for field in text.split(";"):
        parts = [part.strip() for part in field.strip().split(",")]
        if len(parts) == 1:
            areas.append(parts[0])
        elif parts[0] not in sa:
            areas.append(f"{parts[-1]} {parts[0]}")
        else:
            areas.extend(parts)
    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return tuple(dict.fromkeys(areas))


df_data["subject_areas"] = df_data[["wos_categories", "subject_areas"]].apply(_merge_subject_areas, axis=1)
df_data["subject_areas"] = df_data["subject_areas"].apply(
    lambda x: _split_subject_areas(x) if isinstance(x, str) else None)

In [None]:
# Counting, for "auth_keywords" and "subject_areas", the records whose tuple
# holds an invalid item (None or the text "none").
for column in ("auth_keywords", "subject_areas"):
    filled_values = df_data.loc[df_data[column].notnull(), column]
    count = sum(
        1 for items in filled_values
        if any(item is None or item.lower() == "none" for item in items))
    print(f"{column}: {count}")

In [None]:
# Removing the invalid values in the features "auth_keywords" and "subject_areas".
def _drop_invalid_items(items):
    """Drop empty items and the text "none" from a tuple; return None when nothing is left."""
    if not isinstance(items, (tuple, list)):
        # Missing value: kept as it is.
        return items
    # "none" is dropped too, matching what the preceding check counts as invalid.
    kept = tuple(item for item in items if item and item.lower() != "none")
    return kept if kept else None


for column in ["auth_keywords", "subject_areas"]:
    # Series.apply keeps one tuple per row; assigning a list of tuples through
    # .loc may instead be interpreted as a two-dimensional array.
    df_data[column] = df_data[column].apply(_drop_invalid_items)

In [None]:
# Normalizing the feature "authors".
# Records without full names fall back to the "authors" value. The assignment
# goes through df_data.loc[mask, column]: the chained form
# df_data.column.loc[mask] = ... does not update df_data under pandas Copy-on-Write.
missing_full_names = df_data["author_full_names"].isnull() & df_data["authors"].notnull()
df_data.loc[missing_full_names, "author_full_names"] = df_data.loc[missing_full_names, "authors"]
df_data["authors"] = df_data["author_full_names"].copy()
df_data.loc[:, "authors"] = df_data.loc[:, ["authors", "researcher_ids", "orcids"]].apply(normalize_authors, axis=1)

In [None]:
# Normalizing the feature "affiliations".
# Records without affiliations fall back to the "author_affil" value. The
# assignment goes through df_data.loc[mask, column]: the chained form
# df_data.column.loc[mask] = ... does not update df_data under pandas Copy-on-Write.
missing_affiliations = df_data["affiliations"].isnull() & df_data["author_affil"].notnull()
df_data.loc[missing_affiliations, "affiliations"] = df_data.loc[missing_affiliations, "author_affil"]
df_data.loc[:, ["affiliations", "author_affil"]] = df_data.loc[:, ["affiliations", "author_affil"]].apply(fix_affiliations, axis=1)
df_data.loc[:, "affiliations"] = df_data.loc[:, ["author_affil", "affiliations"]].apply(normalize_affiliations, axis=1)

In [None]:
# Dropping (in place) the columns that are not kept in the prepared dataset.
columns_drop = [
    "author_full_names", "book_authors", "book_editors", "book_group_authors",
    "book_author_full_names", "group_authors", "book_series_title", "book_series_subtitle",
    "conference_title", "conference_date", "conference_location", "conference_sponsor",
    "conference_host", "keywords_plus", "reprint_addresses", "email_addresses",
    "researcher_ids", "orcids", "funding_orgs", "funding_name_preferred",
    "funding_text", "cited_references", "publisher_city", "publisher_address",
    "eissn", "isbn", "journal_abbreviation", "journal_iso_abbreviation",
    "volume", "issue", "part_number", "supplement",
    "special_issue", "meeting_abstract", "start_page", "end_page",
    "article_number", "doi_link", "book_doi", "early_access_date",
    "number_of_pages", "wos_categories", "web_of_science_index", "ids_number",
    "pubmed_id", "open_access_designations", "highly_cited_status", "hot_paper_status",
    "date_of_export", "web_of_science_record", "day_180_usage_count", "since_2013_usage_count",
    "times_cited_all_databases",
]
df_data.drop(columns=columns_drop, inplace=True)

In [None]:
# Replacing the empty values of the features "authors" and "affiliations" with None.
for column in ("authors", "affiliations"):
    filled = df_data[column].notnull()
    df_data.loc[filled, column] = df_data.loc[filled, column].apply(
        lambda value: None if len(value) == 0 else value)

In [None]:
# Keeping, for each "id", only the first record after sorting by "id" and "period".
df_data = df_data.sort_values(by=["id", "period"])
df_data = df_data.drop_duplicates(subset="id", keep="first")

In [None]:
# Previewing the first rows of the cleaned dataframe.
df_data.head()

In [None]:
# Printing the summary of the cleaned dataframe (columns, non-null counts and dtypes).
df_data.info()

## 3. Saving the dataframe

In [None]:
# Writing the prepared dataframe to a CSV file, quoting every field and omitting the index.
output_path = "../data/prepared/wos_tiny_genai.csv"
df_data.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)