# Analysis of Manuscripts related to Tiny Generative AI

## 1. Importing the libraries

In [None]:
# Importing the required libraries.
import csv, pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns, matplotlib.lines as mlines
from matplotlib import cm

## 2. Getting the data

In [None]:
# Loading the prepared manuscripts dataset; dtype=object disables type inference,
# so every column is read as-is and converted explicitly later on.
df_data = pd.read_csv(
    "../data/prepared/final_tiny_genai.csv",
    dtype=object,
    header=0,
)

In [None]:
# Loading the table that assigns an approach category to each manuscript id.
df_clusters = pd.read_csv(
    "../data/cluster_manuscripts.csv",
    dtype=object,
    index_col=False,
    header=0,
)

In [None]:
# Loading the table that assigns a multimodal label to each manuscript id.
df_modal = pd.read_csv(
    "../data/multimodal.csv",
    dtype=object,
    index_col=False,
    header=0,
)

## 3. Preprocessing the data

In [None]:
# IDs of the manuscripts that are removed from df_data by the exclusion filter below.
# TODO: check whether these two are really excluded: 85174242719, 85124404792
ids = ["WOS:001113618700001", "WOS:001115816700001", "WOS:001134125600001", "85137545603", "85138387466",
       "85149435768", "85154622483", "85159857945", "85171302111", "85171522777", "85172291414",
       "85174416073", "85176314091", "85177554366", "85177602386", "85179556106", "85180795696",
       "85183293539", "85183296628", "85183802322", "85185765289", "85186648507", "85190237480",
       "85192795312", "85181907943", "85174821496", "85184585506", "85193991709", "85190595763",
       "85174242719", "85160023694", "85194480879", "85189499366", "85138236299", "85186584200",
       "85121137332", "85124404792", "85123940821", "85179853539", "85189238917", "85165029279",
       "85189496851", "85185411206", "85180766190", "85184270807", "85164941428", "85189331552",
       "85192192383", "85182519704", "85178618093", "85172763360", "85193023067", "85168656299",
       "85190371753"]
# Displaying how many IDs are listed.
len(ids)

In [None]:
# Excluding the irrelevant manuscripts.
# The explicit copy makes df_data an independent frame: the in-place operations
# applied to it afterwards (replace, column assignments) would otherwise act on a
# boolean-mask selection of the loaded data and raise pandas' SettingWithCopyWarning.
df_data = df_data[~df_data.id.isin(ids)].copy()

In [None]:
# Defining the "None" value for the "NaN" values.
# The replacement is done in place on df_data. NaN is truthy in Python, so the
# plain truthiness test used when parsing the literal columns (`... if y else None`)
# only skips missing cells once they have been turned into None here.
df_data.replace({np.nan: None}, inplace=True)

In [None]:
# Changing the type of features.
import ast


def _parse_literal(value):
    """Parse a cell holding a Python literal serialised as text.

    ast.literal_eval is used instead of eval so that a cell of the CSV file can
    only produce plain data (lists, dicts, strings, numbers, None) and can never
    execute code. Empty strings become None; values that are not strings
    (None, or objects already parsed by a previous run of this cell) are
    returned unchanged, which makes the cell safe to re-run.
    """
    if isinstance(value, str):
        return ast.literal_eval(value) if value else None
    return value


# Columns stored as text literals; the analysis cells iterate "affiliations" and
# "authors" as lists of dicts.
for _column in ["auth_keywords", "affiliations", "subject_areas", "authors"]:
    df_data[_column] = df_data[_column].apply(_parse_literal)

df_data["publication_date"] = pd.to_datetime(df_data["publication_date"])
df_data["year"] = df_data["year"].astype(np.int32)

In [None]:
# Including the "category" feature.
# A single id -> category lookup replaces the per-row filtering of df_clusters,
# which rescanned the whole cluster table once for every manuscript.
cluster_rows = df_clusters[df_clusters["id"].isin(df_data["id"])]
missing_ids = df_data.loc[~df_data["id"].isin(cluster_rows["id"]), "id"].unique().tolist()
duplicated_ids = cluster_rows.loc[cluster_rows["id"].duplicated(), "id"].unique().tolist()
if missing_ids or duplicated_ids:
    # Same strictness as the previous `.item()` call: every manuscript must match
    # exactly one row of df_clusters.
    raise ValueError(
        f"Each manuscript needs exactly one category row; missing ids: {missing_ids}, "
        f"duplicated ids: {duplicated_ids}")
df_data["category"] = df_data["id"].map(cluster_rows.set_index("id")["category"])

In [None]:
# Including the "multimodal" feature.
# A single id -> multimodal lookup replaces the per-row filtering of df_modal,
# which rescanned the whole table once for every manuscript.
modal_rows = df_modal[df_modal["id"].isin(df_data["id"])]
missing_ids = df_data.loc[~df_data["id"].isin(modal_rows["id"]), "id"].unique().tolist()
duplicated_ids = modal_rows.loc[modal_rows["id"].duplicated(), "id"].unique().tolist()
if missing_ids or duplicated_ids:
    # Same strictness as the previous `.item()` call: every manuscript must match
    # exactly one row of df_modal.
    raise ValueError(
        f"Each manuscript needs exactly one multimodal row; missing ids: {missing_ids}, "
        f"duplicated ids: {duplicated_ids}")
df_data["multimodal"] = df_data["id"].map(modal_rows.set_index("id")["multimodal"])

## 4. Utility Functions

In [None]:
class Plots:
    """ Collection of static helpers that configure Matplotlib and draw the charts of the analysis. """

    @staticmethod
    def set_config_matplotlib(fig_size=None):
        """ Function that configures the Matplotlib.

        Resets the seaborn overrides, selects the "ggplot" style and updates the
        global rcParams (figure size, automatic layout, dpi, font size and
        figure background colour).

        Parameters:
        -----------
        * fig_size (tuple, optional): the default figure size as (width, height). Defaults to (12, 8).
        """
        sns.reset_orig()
        plt.style.use("ggplot")
        if not fig_size:
            fig_size = (12, 8)
        plt.rcParams.update({"figure.figsize": fig_size, "figure.autolayout": True,
            "figure.dpi": 300, "font.size": 12, "figure.facecolor": "white"})

    @staticmethod
    def plot_single_bar_chart(x_values, y_values, title, x_label, y_label, width=0.35, y_max_lim=None, name_fig=None, rotation_x=None):
        """ Function to plot a single chart of horizontal bars.

        The bars are horizontal: the categories (x_values) are placed along the
        vertical axis and the values (y_values) along the horizontal axis. For this
        reason x_label is drawn on the vertical axis and y_label on the horizontal one.

        Parameters:
        -----------
        * x_values (list or pandas.Series): the category label of each bar.
        * y_values (list or pandas.Series): the value (length) of each bar.
        * title (str): the title of chart.
        * x_label (str): the label of the categories axis.
        * y_label (str): the label of the values axis.
        * width (float, optional): the thickness of the bars. Defaults to 0.35.
        * y_max_lim (float, optional): the maximum limit of the values axis.
          Defaults to None, which means the largest value plus 10.
        * name_fig (str, optional): the destination location/path to save a image of chart. Defaults to None.
        * rotation_x (float, optional): the angle (in degrees) of the category tick labels. Defaults to None.
        """

        # Materialised once so that plain lists and pandas Series are handled alike.
        values = list(y_values)
        positions = np.arange(len(x_values))  # the label locations

        # Defining the colors.
        colors = ["blue"] * len(x_values)

        fig, ax = plt.subplots()
        rects = ax.barh(positions, y_values, height=width, color=colors)

        # Add some text for labels, title and custom tick labels, etc.
        # The axes are swapped on purpose because the bars are horizontal.
        ax.set_ylabel(x_label)
        ax.set_xlabel(y_label)
        ax.set_title(title)
        ax.set_yticks(positions)
        ax.set_yticklabels(x_values, rotation=rotation_x)

        if not y_max_lim:
            # Leaves room on the right for the value annotations.
            y_max_lim = max(values, default=0) + 10
        ax.set_xlim([0, y_max_lim])

        # Float values are shown with two decimals, everything else as integers.
        is_float = bool(values) and isinstance(values[0], (float, np.floating))

        def format_value(value):
            """Format a bar value for its annotation."""
            return "{:.2f}".format(value) if is_float else "{:d}".format(int(value))

        rotation_annotation = 60 if rotation_x else None

        def autolabel(bars):
            """Attach a text label to the right of each bar, displaying its value."""
            for bar in bars:
                bar_length = bar.get_width()
                ax.annotate(format_value(bar_length),
                            xy=(bar_length, bar.get_y() + bar.get_height() / 2),
                            xytext=(3, 0),  # 3 points horizontal offset
                            textcoords="offset points",
                            ha="left", va="center", rotation=rotation_annotation)

        autolabel(rects)
        fig.tight_layout()
        if name_fig:
            fig.savefig(name_fig)
        plt.show()

    @staticmethod
    def plot_pie_chart(x_values, y_values, title, name_fig=None):
        """ Function to plot a pie chart with a legend and one annotated value per slice.

        Parameters:
        -----------
        * x_values (list or pandas.Series): the label of each slice, shown in the legend.
        * y_values (list or pandas.Series): the size of each slice. The same value is
          printed next to the slice followed by "%", so it is expected to be already
          expressed as a percentage.
        * title (str): the title of chart.
        * name_fig (str, optional): the destination location/path to save a image of chart. Defaults to None.
        """
        # Materialised once so that plain lists and pandas Series are handled alike.
        percentages = list(y_values)

        fig, ax = plt.subplots()
        # autopct only returns empty strings: the values are drawn by the annotations below.
        wedges, texts, autotexts = ax.pie(y_values, autopct=lambda pct: "", startangle=90, pctdistance=0.85)
        ax.axis('equal')
        ax.set_title(title)

        # Adding the legend.
        ax.legend(wedges, x_values, loc="lower center", bbox_to_anchor=(1, 0, 0.5, 1))

        # Drawing lines associating the labels to the slices.
        for wedge, percentage in zip(wedges, percentages):
            ang = (wedge.theta2 - wedge.theta1) / 2. + wedge.theta1
            y = np.sin(np.deg2rad(ang))
            x = np.cos(np.deg2rad(ang))
            # A slice centred exactly on the vertical axis (x == 0) is treated as
            # right-hand side instead of failing on a missing alignment.
            side = -1 if x < 0 else 1
            horizontalalignment = "right" if side < 0 else "left"
            connectionstyle = "angle,angleA=0,angleB={}".format(ang)
            ax.annotate("{:.1f}%".format(percentage), fontsize=15,
                        xy=(x, y), xytext=(1.1 * side, 1.2 * y),
                        horizontalalignment=horizontalalignment,
                        bbox=dict(fc="white", ec="white", lw=0.01),
                        arrowprops=dict(arrowstyle="-", color='black', connectionstyle=connectionstyle))

        # Showing the chart.
        fig.tight_layout()
        if name_fig:
            fig.savefig(name_fig)
        plt.show()


## 5. Data Analysis

In [None]:
# Applying the default plot configuration (style, figure size, dpi, font size) before drawing any chart.
Plots.set_config_matplotlib()

### 5.1. Countries

In [None]:
# Selecting the affiliations of the manuscripts that have this information.
countries = df_data.loc[
    df_data["affiliations"].notna(),
    "affiliations",
].copy()

In [None]:
# Preprocessing the data.
# Each paper contributes every country at most once. "Hong Kong" is folded into
# "China" *before* the per-paper deduplication; doing it afterwards would count
# China twice for a paper that has affiliations in both.
country_aliases = {"Hong Kong": "China"}
countries = pd.Series([
    country
    for paper in countries
    for country in {country_aliases.get(c["country"], c["country"]) for c in paper if c["country"]}
])
# The columns are named explicitly so that the result does not depend on the
# default names produced by value_counts()/reset_index(), which changed in pandas 2.0.
countries = countries.value_counts().rename_axis("country").reset_index(name="num_paper").head(10)

In [None]:
# Listing the data: the ten most frequent countries and their number of manuscripts.
countries

In [None]:
# Drawing the bar chart of the countries and saving it as "countries.png".
Plots.plot_single_bar_chart(
    x_values=countries["country"],
    y_values=countries["num_paper"],
    title="Top 10 Countries",
    x_label=None,
    y_label="Number of Manuscripts",
    name_fig="countries.png",
)

### 5.2. Vehicles

In [None]:
# Selecting the publication vehicle of the manuscripts that have this information.
vehicles = df_data.loc[
    df_data["vehicle_name"].notna(),
    "vehicle_name",
].copy()

In [None]:
# Processing the data.
# Counting the manuscripts per vehicle and keeping the ten most frequent ones.
# The columns are named explicitly so that the result does not depend on the
# default names produced by value_counts()/reset_index(), which changed in pandas 2.0.
vehicles = vehicles.value_counts().rename_axis("vehicle_name").reset_index(name="num_paper").head(10)

In [None]:
# Printing one "name (count)" line per vehicle.
for row in vehicles.itertuples(index=False):
    print(f"{row.vehicle_name} ({row.num_paper})")

In [None]:
# Drawing the bar chart of the publication vehicles and saving it as "vehicles.png".
Plots.plot_single_bar_chart(
    x_values=vehicles["vehicle_name"],
    y_values=vehicles["num_paper"],
    title="Top 10 Publication Vehicles",
    x_label=None,
    y_label="Number of Manuscripts",
    name_fig="vehicles.png",
)

### 5.3. Authors

In [None]:
# Selecting the authors of the manuscripts that have this information.
authors = df_data.loc[
    df_data["authors"].notna(),
    "authors",
].copy()

In [None]:
# Preprocessing the data.
# Each paper contributes every author name at most once; empty names are skipped.
authors = pd.Series([
    name
    for paper in authors
    for name in {a["name"] for a in paper if a["name"]}
])
# The columns are named explicitly so that the result does not depend on the
# default names produced by value_counts()/reset_index(), which changed in pandas 2.0.
authors = authors.value_counts().rename_axis("author").reset_index(name="num_paper").head(10)

In [None]:
# Listing the data: the ten most frequent authors and their number of manuscripts.
authors

### 5.4. Affiliations

In [None]:
# Selecting the affiliations of the manuscripts that have this information.
affiliations = df_data.loc[
    df_data["affiliations"].notna(),
    "affiliations",
].copy()

In [None]:
# Preprocessing the data.
# Each paper contributes every affiliation name at most once; empty names are skipped.
affiliations = pd.Series([
    name
    for paper in affiliations
    for name in {a["affiliation"] for a in paper if a["affiliation"]}
])
# The columns are named explicitly so that the result does not depend on the
# default names produced by value_counts()/reset_index(), which changed in pandas 2.0.
affiliations = affiliations.value_counts().rename_axis("affiliation").reset_index(name="num_paper").head(15)

In [None]:
# Listing the data: the fifteen most frequent affiliations and their number of manuscripts.
affiliations

In [None]:
# Drawing the bar chart of the affiliations and saving it as "affiliations.png".
Plots.plot_single_bar_chart(
    x_values=affiliations["affiliation"],
    y_values=affiliations["num_paper"],
    title="Top 15 Affiliations",
    x_label=None,
    y_label="Number of Manuscripts",
    name_fig="affiliations.png",
)

### 5.5. Approach

In [None]:
# Selecting the approach category of the manuscripts that have one.
categories = df_data.loc[
    df_data["category"].notna(),
    "category",
].copy()

In [None]:
# Preprocessing the data.
# Counting the manuscripts per category. The columns are named explicitly so that
# the result does not depend on the default names produced by
# value_counts()/reset_index(), which changed in pandas 2.0.
categories = categories.value_counts().rename_axis("category").reset_index(name="num_paper")
total = categories["num_paper"].sum()
# Share of each category, expressed as a percentage of all counted manuscripts.
categories["percent"] = (categories["num_paper"] / total) * 100

In [None]:
# Drawing the pie chart of the approaches (without title) and saving it as "approaches.png".
Plots.plot_pie_chart(
    x_values=categories["category"],
    y_values=categories["percent"],
    title=None,
    name_fig="approaches.png",
)

### 5.6. Multimodal

In [None]:
# Selecting the multimodal label of the manuscripts that have one.
multimodal = df_data.loc[
    df_data["multimodal"].notna(),
    "multimodal",
].copy()

In [None]:
# Preprocessing the data.
# Counting the manuscripts per multimodal label. The columns are named explicitly so
# that the result does not depend on the default names produced by
# value_counts()/reset_index(), which changed in pandas 2.0.
multimodal = multimodal.value_counts().rename_axis("multimodal").reset_index(name="num_paper")
total = multimodal["num_paper"].sum()
# Share of each label, expressed as a percentage of all counted manuscripts.
multimodal["percent"] = (multimodal["num_paper"] / total) * 100

In [None]:
# Drawing the pie chart of the multimodal labels (without title) and saving it as "multimodal.png".
Plots.plot_pie_chart(
    x_values=multimodal["multimodal"],
    y_values=multimodal["percent"],
    title=None,
    name_fig="multimodal.png",
)