# Merging the datasets of publications related to Tiny GenAI

In [None]:
# Importing the required libraries.
import csv, re, pandas as pd, numpy as np
from string import punctuation

## 1. Defining the required functions

In [None]:
# Defining the function "clean_title".
def clean_title(title):
    """Normalize a publication title so records can be compared by title.

    The title is lower-cased, a few typographic characters are mapped to ASCII
    equivalents, punctuation and whitespace are removed from both ends, double
    quotes are dropped and runs of whitespace are collapsed to a single space.

    Args:
        title: Raw title string, or None for a missing title.

    Returns:
        The normalized title, or None when the input is None or when nothing
        but punctuation/whitespace is left (e.g. "?", "..." or "").
    """
    if title is None:
        return None
    title = title.lower()
    # (old, new) pairs applied in order; "τhe" starts with a Greek tau.
    replacements = (
        ("€", ""), ("…", "..."), ("τhe", "the"), ("–", "-"), ("‘", "'"),
        ("“", "\""), ("”", "\""), ("′", "'"), ("’", "'"), ("č", "c"),
    )
    for old, new in replacements:
        title = title.replace(old, new)
    # Strip whitespace first so that trailing blanks or a leading tab are
    # removed even when there is no punctuation at either end.
    title = title.strip()
    # Alternate punctuation/whitespace stripping until both ends are clean;
    # the emptiness check prevents indexing into an exhausted string.
    while title and (title[0] in punctuation or title[-1] in punctuation):
        title = title.strip(punctuation).strip()
    if not title:
        return None
    # Drop quotes before collapsing whitespace so that a removed quote cannot
    # leave two consecutive spaces behind.
    return re.sub(r"\s+", " ", title.replace("\"", ""))

## 2. Getting and preprocessing the datasets

### 2.1. Web of Science

In [None]:
# Getting the data: load the prepared Web of Science CSV, using its first row as the header.
df_wos = pd.read_csv("../data/prepared/wos_tiny_genai.csv", header=0)

In [None]:
# Checking the dataframe: preview the first rows of the Web of Science data as loaded.
df_wos.head()

In [None]:
# Visualizing the information of dataset (columns, non-null counts and dtypes) before preprocessing.
df_wos.info()

In [None]:
# Defining the "None" value for the "NaN" values, so that later cells can test
# for missing entries with "is None" / "is not None".
df_wos.replace({np.nan: None}, inplace=True)

In [None]:
# Changing the type of features.
# Each non-null cell of the features below is evaluated from its text form back
# into a Python object (presumably lists written out by an earlier step - confirm).
# NOTE(review): eval() executes whatever expression the CSV contains; consider
# ast.literal_eval if the prepared file can ever come from an untrusted source.
wos_literal_features = ("auth_keywords", "authors", "affiliations", "subject_areas")
for feature in wos_literal_features:
    has_value = df_wos[feature].notnull()
    df_wos.loc[has_value, feature] = df_wos.loc[has_value, feature].apply(eval)
# Parsing the publication dates into datetime values.
df_wos["publication_date"] = pd.to_datetime(df_wos["publication_date"])

In [None]:
# Defining the feature "source": tag every record with the database it was loaded from.
df_wos["source"] = "Web of Science"

In [None]:
# Normalizing the feature "title": only records that actually have a title are
# passed through clean_title; missing titles are left untouched.
wos_has_title = df_wos["title"].notnull()
df_wos.loc[wos_has_title, "title"] = df_wos.loc[wos_has_title, "title"].apply(clean_title)

In [None]:
# Checking the dataframe: preview the first rows of the Web of Science data after preprocessing.
df_wos.head()

In [None]:
# Visualizing the information of dataset (columns, non-null counts and dtypes) after preprocessing.
df_wos.info()

### 2.2. Scopus

In [None]:
# Getting the data: load the prepared Scopus CSV, using its first row as the
# header and reading every column as "object" (no automatic type inference).
df_scopus = pd.read_csv("../data/prepared/scopus_tiny_genai.csv", header=0, dtype=object)

In [None]:
# Checking the dataframe: preview the first rows of the Scopus data as loaded.
df_scopus.head()

In [None]:
# Visualizing the information of dataset (columns, non-null counts and dtypes) before preprocessing.
df_scopus.info()

In [None]:
# Defining the "None" value for the "NaN" values, so that later cells can test
# for missing entries with "is None" / "is not None".
df_scopus.replace({np.nan: None}, inplace=True)

In [None]:
# Changing the type of features.
# Each non-null cell of the features below is evaluated from its text form back
# into a Python object (presumably lists written out by an earlier step - confirm).
# The assignment goes through df_scopus.loc[rows, column]: the chained form
# df_scopus.<column>.loc[rows] = ... is not guaranteed to write back to df_scopus
# and does nothing under pandas Copy-on-Write.
# NOTE(review): eval() executes whatever expression the CSV contains; consider
# ast.literal_eval if the prepared file can ever come from an untrusted source.
scopus_literal_features = (
    "auth_keywords",
    "index_terms",
    "affiliations",
    "subject_areas",
    "authors",
    "author_affil",
    "references",
)
for feature in scopus_literal_features:
    has_value = df_scopus[feature].notnull()
    df_scopus.loc[has_value, feature] = df_scopus.loc[has_value, feature].apply(eval)
# Parsing the publication dates into datetime values.
df_scopus["publication_date"] = pd.to_datetime(df_scopus["publication_date"])

In [None]:
# Defining the feature "source": tag every record with the database it was loaded from.
df_scopus["source"] = "Scopus"

In [None]:
# Normalizing the feature "title": only records that actually have a title are
# passed through clean_title (same approach as for Web of Science), so a
# missing title is left as is instead of being handed to the string clean-up.
scopus_has_title = df_scopus["title"].notnull()
df_scopus.loc[scopus_has_title, "title"] = df_scopus.loc[
    scopus_has_title, "title"].apply(clean_title)

In [None]:
# Checking the dataframe: preview the first rows of the Scopus data after preprocessing.
df_scopus.head()

In [None]:
# Visualizing the information of dataset (columns, non-null counts and dtypes) after preprocessing.
df_scopus.info()

## 3. Merging/Joining the datasets

In [None]:
# Checking the duplicated records between Web of Science and Scopus by the features "title" and "doi".
# Missing values are dropped from the Scopus side before matching, so two
# records that both lack a title (or a DOI) are not counted as the same publication.
wos_title_in_scopus = df_wos.title.isin(df_scopus.title.dropna())
wos_doi_in_scopus = df_wos.doi.isin(df_scopus.doi.dropna())
df_wos.id[wos_title_in_scopus | wos_doi_in_scopus].size

In [None]:
# Filling the missing values of Web of Science's features "auth_keywords", "issn" and "doi" with data from Scopus.
def _first_scopus_value(column, title, doi=None):
    """Return the first non-null Scopus `column` value of a record matching `title` or `doi`.

    A missing (None) title or DOI is never used as a match key. Returns None
    when no Scopus record matches or every matching record lacks the value,
    instead of raising like `Series.item()` does for zero or several matches.
    """
    match = pd.Series(False, index=df_scopus.index)
    if title is not None:
        match |= df_scopus.title == title
    if doi is not None:
        match |= df_scopus.doi == doi
    values = df_scopus.loc[match, column]
    values = values[values.notnull()]
    return values.iloc[0] if len(values) else None


# Web of Science records that also exist in Scopus (same title or same DOI);
# missing Scopus titles/DOIs are excluded from the matching.
wos_title_in_scopus = df_wos.title.isin(df_scopus.title.dropna())
wos_doi_in_scopus = df_wos.doi.isin(df_scopus.doi.dropna())
filter_data = wos_title_in_scopus | wos_doi_in_scopus

# "auth_keywords" and "issn": looked up by title or DOI.
for feature in ("auth_keywords", "issn"):
    to_fill = filter_data & df_wos[feature].isnull()
    keys = df_wos.loc[to_fill, ["title", "doi"]]
    df_wos.loc[to_fill, feature] = pd.Series(
        [_first_scopus_value(feature, title, doi)
         for title, doi in zip(keys["title"], keys["doi"])],
        index=keys.index, dtype=object)

# "doi": the Web of Science DOI is missing here, so the lookup is by title only.
to_fill = wos_title_in_scopus & df_wos.doi.isnull()
titles = df_wos.loc[to_fill, "title"]
df_wos.loc[to_fill, "doi"] = pd.Series(
    [_first_scopus_value("doi", title) for title in titles],
    index=titles.index, dtype=object)

In [None]:
# Filling the missing values of Scopus' features "auth_keywords", "issn" and "publisher" with data from Web of Science.
def _first_wos_value(column, title, doi=None):
    """Return the first non-null Web of Science `column` value of a record matching `title` or `doi`.

    A missing (None) title or DOI is never used as a match key. Returns None
    when no Web of Science record matches or every matching record lacks the
    value, instead of raising like `Series.item()` does for zero or several
    matches.
    """
    match = pd.Series(False, index=df_wos.index)
    if title is not None:
        match |= df_wos.title == title
    if doi is not None:
        match |= df_wos.doi == doi
    values = df_wos.loc[match, column]
    values = values[values.notnull()]
    return values.iloc[0] if len(values) else None


# Scopus records that also exist in Web of Science (same title or same DOI).
# Both masks cover the full Scopus index, so they can be combined without any
# index alignment; missing titles/DOIs are excluded from the matching.
scopus_title_in_wos = df_scopus.title.isin(df_wos.title.dropna())
scopus_doi_in_wos = df_scopus.doi.isin(df_wos.doi.dropna())
filter_data = scopus_title_in_wos | scopus_doi_in_wos

for feature in ("auth_keywords", "issn", "publisher"):
    to_fill = filter_data & df_scopus[feature].isnull()
    keys = df_scopus.loc[to_fill, ["title", "doi"]]
    df_scopus.loc[to_fill, feature] = pd.Series(
        [_first_wos_value(feature, title, doi)
         for title, doi in zip(keys["title"], keys["doi"])],
        index=keys.index, dtype=object)

In [None]:
# Removing the duplicated records between Web of Science and Scopus.
# A Web of Science record is dropped when its title or its DOI also appears in
# Scopus. Missing values are dropped from the Scopus side before matching, so a
# record is never discarded merely because both sources lack a title or DOI.
filter_data = (df_wos.title.isin(df_scopus.title.dropna()) |
               df_wos.doi.isin(df_scopus.doi.dropna()))
df_wos = df_wos[~filter_data]

In [None]:
# Visualizing the final number of records for each dataset.
# The counts are taken from the "id" feature of each dataframe; their sum is
# the size the merged dataset is expected to have.
n_wos = df_wos.id.size
n_scopus = df_scopus.id.size
print(f"Web of Science: {n_wos}")
print(f"Scopus: {n_scopus}")
print(f"Expected total number of records for the final dataset: {n_wos + n_scopus}")

In [None]:
# Merging/Joining the datasets: stack the remaining Web of Science records on
# top of the Scopus records and renumber the rows with a fresh index.
df_final = pd.concat([df_wos, df_scopus], ignore_index=True)

In [None]:
# Defining the "None" value for the "NaN" values (presumably introduced by the
# concatenation for features that exist in only one of the sources - confirm).
df_final.replace({np.nan: None}, inplace=True)

In [None]:
# Renaming the feature "source" to "data_source".
df_final.rename(columns={"source": "data_source"}, inplace=True)

In [None]:
# Checking the dataframe: preview the first rows of the merged dataset.
df_final.head()

In [None]:
# Visualizing the information of dataset (columns, non-null counts and dtypes) for the merged data.
df_final.info()

In [None]:
# Exporting the final dataset to CSV file: the row index is not written and
# every field is quoted (csv.QUOTE_ALL).
df_final.to_csv("../data/raw/final_raw.csv", index=False, quoting=csv.QUOTE_ALL)