In [None]:
import os  # local import: this configuration cell runs before the notebook's main import cell

# Directory that DecompositionSpace scans: each immediate sub-directory is one
# decomposition algorithm/tool holding that algorithm's result files.
PROJECT_PATH = ""
# File-name prefix of the analysed project inside each algorithm directory.
PROJECT_NAME = "" # one of [cargo, jpetstore]
# Groq API key (create at https://console.groq.com/keys). It is read from the
# GROQ_API_KEY environment variable so the secret does not have to be saved in
# the notebook; the fallback stays the empty string, as before.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

In [None]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.groq import Groq
from semantic_router import Route
from semantic_router.encoders import HuggingFaceEncoder
from semantic_router.layer import RouteLayer
from IPython.display import Markdown
from ema_workbench import load_results
from ema_workbench.analysis import feature_scoring
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import MDS
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
import numpy as np
import pickle
import seaborn as sns
from typing import Union
from SALib.analyze import sobol
import random
import collections
from groq import Groq as GroqV2
from llama_index.core.evaluation import DatasetGenerator
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.tools import FunctionTool
from llama_index.core.schema import Document
from llama_index.core.prompts.base import PromptTemplate
from llama_index.experimental.query_engine import PandasQueryEngine
from llama_index.core.tools.query_engine import QueryEngineTool
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors.llm_selectors import (
    LLMSingleSelector
)
from sklearn_extra.cluster import KMedoids
import os
from IPython.display import display
# Seed Python's `random` module: DecompositionSpace._get_class_color draws class
# colours with random.randint, so a fixed seed makes the same sequence of calls
# yield the same colours.
random.seed(1)

In [None]:
import logging

# Keep the notebook output readable: the root logger and the chattiest
# third-party loggers only emit WARNING and above.
logging.basicConfig(level=logging.WARNING)
for _noisy_logger in ('httpx', 'httpcore', 'llama_index', 'EMA'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
del _noisy_logger  # do not leak the loop variable into the notebook namespace

In [None]:
# Bookkeeping columns of an experiments frame that are not tunable parameters.
COMMON_COLUMNS = ['algorithm', 'scenario', 'policy', 'model']

# Short aliases used when a parameter name is embedded in a decomposition id.
# NOTE(review): presumably needed because ids are split on '_' (see reorder_index),
# so a parameter name must not contain an underscore itself.
PARAMETER_MAPPING = {
    'microservice_threshold': 'mthreshold',
}

def get_parameters(experiments: pd.DataFrame) -> list[str]:
    """Return the sorted parameter columns of an experiments frame.

    A parameter is every column that is not listed in ``COMMON_COLUMNS``.

    Args:
        experiments: Frame whose columns are inspected; its rows are not read.

    Returns:
        The parameter column names in ascending order.
    """
    # Set difference, not symmetric difference: a bookkeeping column that is
    # absent from the frame must not be reported as if it were a parameter.
    return sorted(set(experiments.columns.values) - set(COMMON_COLUMNS))

def get_normalized_parameter(parameter: str) -> str:
    """Return the alias of ``parameter`` from ``PARAMETER_MAPPING``, or the name itself.

    Args:
        parameter: Original parameter (column) name.

    Returns:
        The mapped alias when one is registered, otherwise ``parameter`` unchanged.
    """
    alias = PARAMETER_MAPPING.get(parameter)
    return parameter if alias is None else alias


def reorder_index(index):
    """Normalise a decomposition id of the form ``name_value_name_value...``.

    The id is split on ``'_'`` into (name, value) pairs, the pairs are ordered by
    name, and every value is re-rendered as a float with nine decimals.

    Args:
        index: Decomposition id string.

    Returns:
        The normalised id string.

    Raises:
        ValueError: If a value token cannot be converted to float.
    """
    tokens = index.split('_')
    names, values = tokens[0::2], tokens[1::2]
    # sorted() is stable, so pairs sharing a name keep their original order.
    ordered = sorted(zip(names, values), key=lambda pair: pair[0])
    return '_'.join(f"{name}_{float(value):.9f}" for name, value in ordered)

def rename_dict_keys(dictionary: dict) -> dict:
    """Return a copy of ``dictionary`` whose keys went through ``reorder_index``.

    Values are kept as-is. If two keys normalise to the same id, the later entry wins.

    Args:
        dictionary: Mapping keyed by decomposition id strings.

    Returns:
        A new dict keyed by the normalised ids.
    """
    return {reorder_index(key): value for key, value in dictionary.items()}

In [None]:
class DecompositionSpace:
    def __init__(self, project_path, project_name):
        self.outcomes = pd.DataFrame()
        self.experiments = pd.DataFrame()
        self.all = pd.DataFrame()
        self.partitions = dict()
        self.uncertainties_problem = dict()
        self.sobol = dict()
        self.partitions_distance = pd.DataFrame()
        self.feature_scoring = dict()
        self.partition_labels = dict()
        self.embeddings_2d_partitions = dict()
        self.partitions_df = pd.DataFrame()
        self.sobol_df = pd.DataFrame()
        for _, algorithms, _ in os.walk(project_path):
            if len(algorithms) > 0:
                for algorithm in algorithms:
                    project_full_path = f"{project_path}/{algorithm}/{project_name}"
                    model_filename = f"{project_full_path}_128scenarios_nopolicies_sobol"
                    experiments_df, outcomes = load_results(f"{model_filename}.tar.gz")
                    experiments_df['algorithm'] = algorithm
                    outcomes_df = pd.DataFrame(outcomes)
                    self.outcomes = pd.concat([self.outcomes, outcomes_df])
                    self.experiments = pd.concat([self.experiments, experiments_df])
                    all_df = pd.concat([experiments_df, outcomes_df], axis=1)
                    parameters = get_parameters(experiments_df)
                    all_df.index = all_df.apply(lambda row: '_'.join([f"{get_normalized_parameter(parameter)}_{row[parameter]:.9f}" for parameter in parameters]), axis=1)
                    all_df = all_df[~all_df.index.duplicated(keep='first')]
                    all_df['decomposition_id'] = all_df.index
                    self.all = pd.concat([self.all, all_df])
                    with open(f"{model_filename}_model.pkl", 'rb') as file:
                        self.uncertainties_problem[algorithm] = pickle.load(file)
                    with open(f"{model_filename}_partitions.pkl", 'rb') as file:
                        partitions = pickle.load(file)
                        partitions = rename_dict_keys(partitions)
                        self.partitions[algorithm] = partitions
                    partitions_distance = 1 - pd.read_csv(f"{project_full_path}_omega_scores.csv", index_col=0)
                    partitions_distance.index = [reorder_index(idx) for idx in partitions_distance.index]
                    partitions_distance.columns = list(map(lambda c: reorder_index(c), partitions_distance.columns))
                    result = pd.DataFrame(np.nan, index=pd.Index(self.partitions_distance.index.tolist() + partitions_distance.index.tolist(), name='index'),
                                          columns = pd.Index(self.partitions_distance.columns.tolist() + partitions_distance.columns.tolist(), name='columns'))
                    result.loc[self.partitions_distance.index, self.partitions_distance.columns] = self.partitions_distance
                    result.loc[partitions_distance.index, partitions_distance.columns] = partitions_distance
                    self.partitions_distance = result
                    with open(f"{project_full_path}_stable_solutions.pkl", 'rb') as f:
                        stable_solutions = pickle.load(f)
                        other_labels = stable_solutions.keys()
                        self.all['stability'] = 0
                        for index, value in stable_solutions.items():
                            self.all.loc[reorder_index(index), 'stability'] = value
                    mds = MDS(dissimilarity='precomputed', random_state=0)
                    self.embeddings_2d_partitions[algorithm] = mds.fit_transform(partitions_distance)
                    partition_labels_2d, _, silhouette = self._run_agglomerative(self.embeddings_2d_partitions[algorithm], k=5, threshold=None, normalize=True, n_pca=2)
                    self.partition_labels[algorithm] = partition_labels_2d
                    self.feature_scoring[algorithm] = feature_scoring.get_feature_scores_all(experiments_df, outcomes)
                    self.sobol[algorithm] = {}
                    self.sobol[algorithm]["n_partitions"] = sobol.analyze(self.uncertainties_problem[algorithm], outcomes['n_partitions'], calc_second_order=True)
                    self.sobol[algorithm]["modularity"] = sobol.analyze(self.uncertainties_problem[algorithm], outcomes['modularity'], calc_second_order=True)
                    self.sobol[algorithm]["ned"] = sobol.analyze(self.uncertainties_problem[algorithm], outcomes['ned'], calc_second_order=True)
                    self.sobol[algorithm]["density"] = sobol.analyze(self.uncertainties_problem[algorithm], outcomes['density'], calc_second_order=True)
                    self.sobol[algorithm]["noise_classes"] = sobol.analyze(self.uncertainties_problem[algorithm], outcomes['noise_classes'], calc_second_order=True)
                    self.class_color = {}
                    self.legends = {}
                    resolution = []
                    partition_id = []
                    class_name = []
                    for res, partitions_2 in partitions.items():
                        for part_id, classes in partitions_2.items():
                            for cls in classes:
                                resolution.append(res)
                                partition_id.append(part_id)
                                class_name.append(cls)
                    partitions_df = pd.DataFrame({
                        'decomposition_id': resolution,
                        'microservice_id': partition_id,
                        'class_name': class_name,
                        'algorithm': algorithm
                    })
                    self.partitions_df = pd.concat([self.partitions_df, partitions_df])
                    metrics = []
                    partitions_3 = []
                    value_1 = []
                    value_2 = []
                    for metric, partitions_dict in self.sobol[algorithm].items():
                        for partition, values in partitions_dict.items():
                            if values.ndim == 1:
                                metrics.append(metric)
                                partitions_3.append(partition)
                                value_1.append(values[0])
                                value_2.append(values[1])
                            elif values.ndim == 2:
                                for i in range(values.shape[0]):
                                    metrics.append(metric)
                                    partitions_3.append(partition)
                                    value_1.append(values[i, 0])
                                    value_2.append(values[i, 1])

                    sobol_df = pd.DataFrame({
                        'algorithm': algorithm,
                        'metric': metrics,
                        'Sobol index': partitions_3,
                        'Variance lower': value_1,
                        'Variance upper': value_2
                    })
                    self.sobol_df = pd.concat([self.sobol_df, sobol_df])

    def _run_agglomerative(self, df, k, threshold=200, n_pca=None, normalize=False, archstructure=None):
        """Cluster samples with scikit-learn's ``AgglomerativeClustering``.

        Args:
            df: Samples to cluster (DataFrame or array-like). When ``n_pca`` is None the
                data is passed with ``metric='precomputed'``.
            k: Number of clusters; only used when ``threshold`` is None.
            threshold: Distance threshold. When it is not None it takes precedence over ``k``.
            n_pca: Only tested against None: any value selects euclidean/ward linkage,
                None selects precomputed/single linkage.
            normalize: Standardise the samples with ``StandardScaler`` first.
            archstructure: Forwarded as the ``connectivity`` argument.

        Returns:
            Tuple ``(labels, model, silhouette)``; ``silhouette`` is 0.0 when only one
            cluster is found.
        """
        # np.asarray accepts both DataFrames and plain arrays (``.values`` only exists on the former).
        sample = StandardScaler().fit_transform(df) if normalize else np.asarray(df)

        # scikit-learn accepts exactly one of n_clusters / distance_threshold; passing both
        # is rejected, so the threshold wins whenever it is given.
        n_clusters = k if threshold is None else None
        if n_pca is not None:
            metric, linkage = 'euclidean', 'ward'
        else:
            metric, linkage = 'precomputed', 'single'
        model = AgglomerativeClustering(n_clusters=n_clusters, metric=metric, linkage=linkage,
                                        connectivity=archstructure, distance_threshold=threshold)
        model.fit(sample)

        fixed_labels = np.where(model.labels_ < 0, 0, model.labels_)
        if len(set(fixed_labels)) > 1:
            # NOTE(review): in the precomputed branch the score is still computed with the
            # default metric on ``sample``; confirm whether metric='precomputed' is intended.
            silhouette = metrics.silhouette_score(sample, fixed_labels)
        else:
            silhouette = 0.0
        return fixed_labels, model, silhouette

    def _select_medoids_from_clusters(self, algorithm):
        """Pick one representative decomposition per cluster of ``algorithm``.

        Args:
            algorithm: Key into ``partition_labels`` / ``embeddings_2d_partitions``.

        Returns:
            Tuple ``(medoids, medoid_labels)``: the 2-D centres found by ``KMedoids`` for
            each cluster, and the id of the cluster member with the smallest summed
            distance to the other members.
        """
        partition_labels = self.partition_labels[algorithm]
        embeddings_2d = self.embeddings_2d_partitions[algorithm]

        # The combined distance matrix stacks one block per algorithm in loading order,
        # while cluster labels and embeddings are per algorithm. Cut out this algorithm's
        # block so positions line up; for the first algorithm the offset is 0.
        offset = 0
        for name, embedding in self.embeddings_2d_partitions.items():
            if name == algorithm:
                break
            offset += len(embedding)
        stop = offset + len(embeddings_2d)
        distance_df = self.partitions_distance.iloc[offset:stop, offset:stop]

        medoids = []
        medoid_labels = []
        for c in set(partition_labels):
            cluster_indices = [i for i, x in enumerate(partition_labels) if x == c]
            cluster_distances = [np.sum(distance_df.iloc[i, cluster_indices]) for i in cluster_indices]
            idx = cluster_indices[np.argmin(cluster_distances)]
            medoid_labels.append(distance_df.columns[idx])
            # NOTE(review): the coordinates come from KMedoids on the 2-D embedding while the
            # label comes from the distance matrix; the two may designate different points.
            kmodel = KMedoids(n_clusters=1, method='pam').fit(embeddings_2d[cluster_indices])
            medoids.extend(kmodel.cluster_centers_)
        return medoids, medoid_labels

    def _show_sobol(self, Si, algorithm, title='', filename=None):
        """Plot the 'S1' and 'ST' entries of a Sobol result as grouped bars.

        The matching '*_conf' entries are drawn as error bars and the bars are labelled
        with the parameter names of the algorithm's problem definition.

        Args:
            Si: Mapping providing the keys 'ST', 'ST_conf', 'S1' and 'S1_conf'.
            algorithm: Key into ``uncertainties_problem``.
            title: Figure title.
            filename: When given, the figure is also saved to this path.

        Returns:
            The ``matplotlib.pyplot`` module holding the drawn figure.
        """
        wanted = ('ST', 'ST_conf', 'S1', 'S1_conf')
        parameter_names = self.uncertainties_problem[algorithm]['names']
        Si_df = pd.DataFrame({key: Si[key] for key in wanted}, index=parameter_names)

        sns.set_style('white')
        fig, ax = plt.subplots(1)

        errors = Si_df[['S1_conf', 'ST_conf']].values.T
        Si_df[['S1', 'ST']].plot.bar(yerr=errors, ax=ax)
        fig.set_size_inches(8, 6)
        fig.subplots_adjust(bottom=0.3)
        plt.title(title)
        if filename is not None:
            plt.savefig(filename)
        return plt

    def show_pairplot(self, title='', group='scenario'):
        """Draw a corner pair plot of all outcome columns.

        Args:
            title: Figure super-title.
            group: Column used as hue; only 'scenario' is added to the plotted frame here.

        Returns:
            The ``matplotlib.pyplot`` module holding the drawn figure.
        """
        frame = pd.DataFrame(self.outcomes)
        frame['scenario'] = self.experiments['scenario']
        outcome_columns = list(self.outcomes.keys())
        grid = sns.pairplot(frame, hue=group, vars=outcome_columns, corner=True, plot_kws={'alpha':0.25})
        grid._legend.remove()
        grid.fig.suptitle(title)
        return plt

    def get_stable_solutions(self) -> plt.plot:
        """Plot every algorithm's 2-D embedding coloured by decomposition stability.

        One panel is drawn per algorithm, two panels per row.

        Returns:
            The ``matplotlib.pyplot`` module holding the drawn figure.
        """
        num_plots = len(self.embeddings_2d_partitions)
        cols = 2
        # At least one row, so an empty space still yields a (blank) figure instead of an error.
        rows = max(1, (num_plots + cols - 1) // cols)
        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
        axes = axes.flatten()
        for idx, (algorithm, values) in enumerate(self.embeddings_2d_partitions.items()):
            df = pd.DataFrame(values, columns=['x', 'y'])
            df['stability'] = self.all.loc[self.all['algorithm'] == algorithm, 'stability'].tolist()
            # Draw the most stable points last so they end up on top.
            df = df.sort_values('stability')
            ax = axes[idx]
            sns.scatterplot(data=df, x="x", y="y", hue="stability", palette="magma_r", edgecolors='dimgrey',  alpha=0.7, s=35, marker='o', ax=ax)
            ax.set(xlabel=None)
            ax.set(ylabel=None)
            ax.set_title(f'Stable decompositions - {algorithm}', fontsize=12)
            # Switch the grid off on the panel just drawn; plt.grid would act on the
            # current axes, which is not this panel.
            ax.grid(False)
        # Hide the panels of the grid that received no algorithm.
        for unused_ax in axes[num_plots:]:
            unused_ax.set_visible(False)
        return plt

    def _get_xy_coordinates(self, labels, algorithm):
        distance_df = self.partitions_distance
        embeddings_2d = self.embeddings_2d_partitions[algorithm]
        xy_coordinates = []
        for lb in labels:
            idx = list(distance_df.columns).index(lb)
            xy_coordinates.append(embeddings_2d[idx])
        return xy_coordinates

    def get_decomposition_space(self, algorithm: str, labels=None) -> plt.plot:
        """Scatter-plot the 2-D embedding of ``algorithm`` coloured by cluster.

        Args:
            algorithm: Key into ``embeddings_2d_partitions`` / ``partition_labels``.
            labels: Optional decomposition ids to highlight with a black 'X' marker and
                a ``name: value`` annotation (values are shown truncated to integers).

        Returns:
            The ``matplotlib.pyplot`` module holding the drawn figure.
        """
        labels = [] if labels is None else labels
        plt.figure(figsize=(8,8))
        points = pd.DataFrame(self.embeddings_2d_partitions[algorithm], columns=['x', 'y'])
        points['cluster'] = self.partition_labels[algorithm]
        ax = sns.scatterplot(data=points, x="x", y="y", hue="cluster", palette="tab10", alpha=0.3, sizes=(20, 200))
        # An empty legend hides the per-cluster entries.
        plt.legend([],[], frameon=False)
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        ax.set(xlabel=None)
        ax.set(ylabel=None)
        ax.set_title(f"{algorithm}", fontsize=12)
        if len(labels) > 0:
            medoids = np.array(self._get_xy_coordinates(labels, algorithm))
            if medoids.size != 0:
                ax.plot(medoids[:,0], medoids[:,1], 'X', markersize=9, alpha=0.7, color='black')
                for position, label in enumerate(labels):
                    tokens = label.split('_')
                    # Ids alternate name and value tokens: render one "name: value" line per pair.
                    lines = [f"{tokens[i]}: {int(float(tokens[i+1]))}" for i in range(0, len(tokens), 2)]
                    ax.annotate('\n'.join(lines), (medoids[position,0], medoids[position,1]))
        plt.grid(False)
        return plt

    def plot_2d_embeddings(self, algorithm: str, title: str = "") -> plt.plot:
        """Show the 2-D embedding of ``algorithm`` sized and coloured by partition count.

        The figure is displayed with ``plt.show()``; nothing is returned.

        Args:
            algorithm: Key into ``embeddings_2d_partitions`` / ``partitions``.
            title: Figure title.
        """
        coordinates = self.embeddings_2d_partitions[algorithm]
        partitions_by_id = self.partitions[algorithm]
        points = pd.DataFrame(coordinates, columns=['x', 'y'])
        # Relies on the partitions dict iterating in the same order as the embedding rows.
        points['cluster size'] = [len(partition) for partition in partitions_by_id.values()]
        ax = sns.scatterplot(data=points, x="x", y="y", hue="cluster size", palette="tab10", size="cluster size",  alpha=0.3, legend='full', sizes=(20, 200))
        plt.title(title)
        ax.legend(loc='upper right',ncol=2, title="n_partitions", bbox_to_anchor=(1.3, 1.05))
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        ax.set(xlabel=None)
        ax.set(ylabel=None)
        plt.grid(False)
        plt.show()

    def get_decompositions_by_metric(self, metric: str, k: int, asc: bool) -> pd.DataFrame:
        return self.all.sort_values(by=[metric], ascending=[asc]).head(k)

    def get_sensitivity_analysis_results(self, metric: str, algorithm: str, show_plot: bool, title: str = None) -> Union[pd.DataFrame, plt.plot]:
        if show_plot:
            return self._show_sobol(self.sobol[algorithm][metric], algorithm, title)
        return self.sobol[algorithm][metric]

    def get_feature_importance(self, show_plot: bool, algorithm: str) -> Union[pd.DataFrame, plt.plot]:
        if show_plot:
            sns.heatmap(self.feature_scoring[algorithm], cmap='viridis', annot=True)
            plt.title('Feature scoring')
            return plt
        return self.feature_scoring[algorithm]

    def get_influential_parameters(self, metric: str, algorithm: str)-> pd.DataFrame:
        if metric == "all":
            max_index = self.feature_scoring[algorithm].values.argmax() // self.feature_scoring[algorithm].shape[1]
            max_row = self.feature_scoring[algorithm].iloc[max_index]
            max_row_df = pd.DataFrame(max_row).T  # Transpose to get it in the correct format
            max_row_df.index = [self.feature_scoring[algorithm].index[max_index]]
            return max_row_df
        else:
            return self.feature_scoring[algorithm].loc[[self.feature_scoring[algorithm][metric].idxmax()]]

    def show_decomposition_structure(self, decomposition_index: str, decomposition: dict = None):
        """Draw the microservices of one decomposition.

        Args:
            decomposition_index: Decomposition id, looked up across all algorithms when
                ``decomposition`` is not supplied. The id is tried as given and then in
                the normalised form produced by ``reorder_index``.
            decomposition: Optional mapping of microservice id to classes; when given
                (and non-empty) it is drawn directly.

        Returns:
            The axes returned by ``_draw``.

        Raises:
            KeyError: If no decomposition is stored under ``decomposition_index``.
        """
        if not decomposition:
            # Flatten the per-algorithm dicts; on id collisions the later algorithm wins.
            known = {key: val for subdict in self.partitions.values() for key, val in subdict.items()}
            decomposition = known.get(decomposition_index)
            if decomposition is None:
                # Stored ids use the normalised format, so retry with the normalised id.
                try:
                    decomposition = known.get(reorder_index(decomposition_index))
                except (AttributeError, TypeError, ValueError):
                    decomposition = None
            if decomposition is None:
                # Fail with a clear message instead of an AttributeError on None below.
                raise KeyError(f"Unknown decomposition id: {decomposition_index!r}")
        decomposition = collections.OrderedDict(sorted(decomposition.items()))
        return self._draw(decomposition)

    def _get_class_color(self, class_name):
        if self.class_color.get(class_name):
            return self.class_color[class_name]
        color = "#" + "%06x" % random.randint(0, 0xFFFFFF)
        self.class_color[class_name] = color
        return color

    def _normalize_class_name(self, class_name):
        if class_name.endswith('.java'):
            return os.path.splitext(os.path.basename(class_name))[0]
        return class_name.split('.')[-1]

    def _draw(self, decomposition):
        """Draw a decomposition as rows of coloured unit squares.

        Each microservice id is used as the row (y) position and each of its classes
        occupies one square, coloured via ``_get_class_color``. Legend entries
        accumulate in ``self.legends`` across calls.

        Args:
            decomposition: Mapping of microservice id to the classes it contains.

        Returns:
            The matplotlib axes that was drawn on.
        """
        plt.rcParams['figure.figsize'] = [18, 18]
        ax = plt.subplot(211)
        widest = -999999
        for microservice_id, microservice_classes in decomposition.items():
            widest = max(widest, len(microservice_classes))
            for position, raw_name in enumerate(microservice_classes):
                class_name = self._normalize_class_name(raw_name)
                color = self._get_class_color(class_name)
                self.legends[class_name] = mpatch.Patch(color=color, label=class_name)
                ax.add_artist(mpatch.Rectangle((position, microservice_id), 1, 1, color=color, fc = color))

        labels = [f"Microservice {key + 1}" for key in decomposition]

        ax.set_yticks(np.arange(len(labels)))
        ax.set_yticklabels(labels)

        ax.set_xlim((0, widest))
        ax.set_ylim((0, len(decomposition)))

        # Anchor each label at the bottom of its tick so it sits inside its row.
        for tick in ax.get_yticklabels():
            tick.set_verticalalignment("bottom")

        ax.tick_params(size=0, axis='y', which='major', labelsize=20)
        ax.set_xticklabels([])
        ax.grid(axis = 'y', color = 'black')
        ax.legend(handles=self.legends.values(), bbox_to_anchor=(0., 1.07, 1., .102), loc=0,ncols=5, mode="expand", borderaxespad=0., fontsize=10)
        plt.tight_layout(pad=1.5)

        return ax

In [None]:
class SemanticLayer:
    # System prompt given to the tool-calling agent built in __init__
    # (passed as system_prompt= to OpenAIAgent.from_tools).
    SYSTEM_PROMPT = """You are an expert software architect that assists users to explore and understand a decomposition space.
        You have a deep understanding of monolith to microservices migration and microservices quality metrics.
        Your role is to help users to understand the decomposition space to pick the most suitable microservices decomposition according to the user need.
        """

    # Response-synthesis prompt installed on every PandasQueryEngine created in
    # evaluation mode; it contains the placeholders {query_str} and {pandas_output}.
    # NOTE(review): the first sentence reads "Your are"; probably meant "You are".
    # Left untouched here because the text is sent to the model verbatim.
    NEW_PANDAS_RESPONSE_PROMPT = PromptTemplate("""
        Your are an expert software designer and also a translator from Pandas to English.
        Given an input query and a Pandas tabular output, create a textual response based on the output.
        Both the input query and the output are related to the exploration of microservices decomposition alternatives.
        In your response, include all relevant details. Always include the relevant indexes of the Pandas dataframe you used to answer.
        If the answer includes several results, describe them all and always pick the first one to exemplify and give a more detailed answer along its index and decomposition id.
        When possible, use a bullet list to enumerate key points and include that data in tabular format including all metadata available (including indexes) to look for records.
        Do not mention that the response is based on a Pandas structure or output. Instead, focus on the content of the output.
        Do not make up information that is not provided in the Pandas dataframe. If you do not know the answer, respond "I don't know".

        Query: {query_str}

        Pandas Output: {pandas_output}

        Response: """
    )

    def __init__(self, decomposition_space=None, evaluation_mode=False) -> None:
        """Wire the LLMs, the tool-calling agent and the intent router together.

        Args:
            decomposition_space: Object the tool methods delegate to. Required when
                ``evaluation_mode`` is true.
            evaluation_mode: When true, run the agent and the pandas query engines in
                verbose mode and build ``self.query_engine``, a router over the Sobol,
                metrics and partition tables of ``decomposition_space``.

        Raises:
            ValueError: If ``evaluation_mode`` is requested without a decomposition space.
        """
        if evaluation_mode and decomposition_space is None:
            # Fail before any client is created, instead of an AttributeError on None later.
            raise ValueError("evaluation_mode requires a decomposition_space")
        self.llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
        self.tool_llm = Groq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
        self.intent_detector = None
        self.decomposition_space = decomposition_space
        self.evaluation_mode = evaluation_mode
        # Always defined, so callers can test it against None outside evaluation mode.
        self.query_engine = None
        tools = self._get_tools()
        self.agent = OpenAIAgent.from_tools(tools, llm=self.tool_llm, system_prompt=SemanticLayer.SYSTEM_PROMPT, verbose=self.evaluation_mode)
        encoder = HuggingFaceEncoder()
        self.intent_detector = RouteLayer(encoder=encoder, routes=SemanticLayer._configure_routes(), llm=self.tool_llm)
        self.client = GroqV2(api_key=GROQ_API_KEY)
        if self.evaluation_mode:
            Settings.llm = self.llm
            sobol_tool = self._build_pandas_tool(
                decomposition_space.sobol_df,
                "Useful for retrieving data about Sobol sensitivity analysis of each parameter.",
            )
            all_tool = self._build_pandas_tool(
                decomposition_space.all,
                "Useful for retrieving parameters and metrics (non-extreme distribution or, density, modularity, and number of partitions/microservices) of each decomposition.",
            )
            partition_tool = self._build_pandas_tool(
                decomposition_space.partitions_df,
                "Useful for retrieving classes and the number of microservices/partitions of each decomposition. It contains the classes that are included in a microservice of a decomposition.",
            )

            self.query_engine = RouterQueryEngine(
                selector=LLMSingleSelector.from_defaults(),
                query_engine_tools=[
                    sobol_tool,
                    all_tool,
                    partition_tool
                ],
            )

    def _build_pandas_tool(self, df, description):
        """Wrap ``df`` in a PandasQueryEngine using the custom response prompt and expose it as a tool."""
        query_engine = PandasQueryEngine(df=df, synthesize_response=True, verbose=self.evaluation_mode)
        query_engine.update_prompts({"response_synthesis_prompt": self.NEW_PANDAS_RESPONSE_PROMPT})
        return QueryEngineTool.from_defaults(
            query_engine=query_engine,
            description=description,
        )



    def get_decomposition_space(self, algorithm: str) -> plt.plot:
        """ Get the decomposition space graphically.

        Args:
           algorithm (str): The algorithm or tool to get decomposition space from

        Returns:
            plt.plot: The plot of the decomposition space, showing a 2D projection of each decomposition as well as identification of the cluster each decomposition is into
        """
        # Registered as an agent tool in _get_tools. The docstring above is presumably
        # what FunctionTool shows the LLM as the tool description; keep its wording stable.
        # Pure delegation; no labels are passed on.
        return self.decomposition_space.get_decomposition_space(algorithm)

    def get_stable_solutions(self) -> plt.plot:
        """ Get the stable decompositions/solutions graphically.

        Returns:
            plt.plot: The plot of the stable decompositions/solutions space, showing a 2D projection of each decomposition as well as identification of the cluster each decomposition is into
        """
        # Registered as an agent tool in _get_tools. The docstring above is presumably
        # what FunctionTool shows the LLM as the tool description; keep its wording stable.
        # NOTE(review): DecompositionSpace.get_stable_solutions colours points by
        # 'stability', not by cluster as the docstring says; confirm the wording.
        return self.decomposition_space.get_stable_solutions()


    def get_decomposition_by_metric(self, k: int, metric: str, asc: bool, show_plot: bool) -> Union[pd.DataFrame, plt.plot]:
        """ Get the K decompositions that match a preferred metric (ned, density, modularity, stability, or n_partitions)).

          Args:
            k (int): The number of decompositions to retrieve.
            metric (str): The metric to match the decompositions against, one of ned, density, modularity, stability, or number of partitions (n_partitions).
            asc (bool): Whether to prefer lower values (True) or higher values (False).
            show_plot (bool): Whether to return the decompositions that match a metric graphically through a plot

        Returns:
            pd.DataFrame: The K decompositions are ordered by asc param against the metric passed as a parameter.
            plt.plot: The plot of the K decompositions in the 2D projection space
        """
        # The docstring above is kept verbatim: it is presumably what FunctionTool shows
        # the LLM as the tool description.
        decompositions = self.decomposition_space.get_decompositions_by_metric(metric, k, asc)
        if not show_plot:
            # Honour the documented contract: the raw table unless a plot is requested.
            return decompositions
        # The selected rows may come from several algorithms, while a decomposition space
        # is drawn per algorithm: draw one figure per algorithm and highlight its rows
        # (the row index holds the decomposition ids).
        for algorithm, rows in decompositions.groupby('algorithm', sort=False):
            self.decomposition_space.get_decomposition_space(algorithm, labels=rows.index.tolist())
        return plt

    def get_influential_parameters(self, algorithm: str, metric: str) -> pd.DataFrame:
        """ Get the most influential parameters of a decomposition tool for a preferred metric (ned, density, modularity, n_partitions, all).

         Args:
           algorithm (str): The algorithm or tool to get parameters
           metric (str):  The metric to match the decompositions against, one of ned, density, modularity, number of partitions or microservices (n_partitions), or all.

         Returns:
            pd.DataFrame: The most influential parameters for a metric.
        """
        # Registered as an agent tool in _get_tools. The docstring above is presumably
        # what FunctionTool shows the LLM as the tool description; keep its wording stable.
        # Argument order is swapped on purpose: the delegate takes (metric, algorithm).
        return self.decomposition_space.get_influential_parameters(metric, algorithm)

    def show_decomposition_structure(self, decomposition_id: str) -> plt.plot:
        """ Get the microservices of a decomposition obtained by a decomposition id.

        Args:
          decomposition_id (str): The decomposition id to obtain the structure from.

        Returns:
          plt.plot: The microservices architecture, each row represents a microservice, and each column represents a class inside that includes number of microservices (rows) and classes (columns) that compose each microservice distinguished by different colors
        """
        # Registered as an agent tool in _get_tools. The docstring above is presumably
        # what FunctionTool shows the LLM as the tool description; keep its wording stable.
        # Pure delegation; the id is looked up by the decomposition space.
        return self.decomposition_space.show_decomposition_structure(decomposition_id)

    def _get_tools(self):
        """Wrap the public query methods as ``FunctionTool`` objects for the agent.

        Every tool is created with ``return_direct=True``.

        Returns:
            List of tools, in the order the methods are listed below.
        """
        exposed_methods = (
            self.get_decomposition_space,
            self.get_decomposition_by_metric,
            self.get_influential_parameters,
            self.show_decomposition_structure,
            self.get_stable_solutions,
        )
        return [FunctionTool.from_defaults(fn=method, return_direct=True) for method in exposed_methods]

    def _configure_routes():
        return [
            Route(
                name="get_decomposition_space",
                utterances=[
                    "Which decompositions are generated?",
                    "Show me the decomposition space graphically",
                    "Get all decompositions graphically",
                    "Show me all decompositions",
                    "Show the decomposition space",
                    "Show the decomposition space graphically",
                    "Show me a visual representation of the decomposition space, including the 2D projection of each decomposition and its cluster.",
                    "Provide the decomposition space plot to help visualize how decompositions are grouped in the 2D space.",
                    "Give me a plot showing the decomposition space with each decomposition's 2D projection and its cluster."
                ],
                description="Get the decomposition space graphically."
            ),
            Route(
                name="get_decomposition_by_metric",
                utterances=[
                    "Get the X decompositions with less Y",
                    "Get me X decompositions with more Y",
                    "Which is the decomposition with more X?",
                    "Which is the decomposition with less X?",
                    "Which is the decomposition of highest X?",
                    "Which is the decomposition of lowest X?",
                    "Show me the decomposition with more X",
                    "Get a plot of the decomposition X",
                    "Show me the decomposition with more X graphically",
                    "Show me the decomposition with lowest X graphically",
                    "Show me the decomposition with highest X graphically",
                    "Show me the decomposition of lowest X graphically",
                    "Show me the decomposition of highest X graphically",
                    "Provide the K decompositions based on modularity with a plot showing the 2D projection of each.",
                    "Show me a plot comparing decompositions sorted by stability, with higher values preferred.",
                    "Display a plot of the K decompositions ordered by density."
                    "Show me the K decompositions ordered by the cohesion or coupling metric, with higher values preferred.",
                    "Give me the decomposition sorted by modularity, showing the top K results with either ascending or descending values.",
                    "I need to see the decompositions ranked by density, with a plot showing the comparison.",
                    "Provide the top K decompositions based on the ned metric, ordered by highest to lowest value."
                ],
                description="Get the K decompositions that match a preferred metric, graphically or raw."
            ),
            Route(
                name="get_influential_parameters",
                utterances=[
                    "Which is the most influential parameter for X in algorithm Y?"
                    "Which is the most influential parameter of algorithm Y?",
                     "Give me the most influential parameters for the cohesion metric in the decomposition.",
                    "What parameters influence the modularity metric the most in the decomposition of algorithm Y?",
                    "Show me the influential parameters for the density metric across different microservices.",
                    "List the parameters that have the greatest influence on the n_partitions (number of partitions) metric in the decomposition.",
                    "Provide the influential parameters for the stability metric in the decomposition."
                ],
                description="Get the most influential parameter for a metric or all metrics in an algoirthm."
            ),
            Route(
                name="show_decomposition_structure",
                utterances=[
                    "How is structured the decomposition obtained by setting X to Y and Z to W?"
                    "Which are the classes of the microservices of the decomposition with parameters X=Y and Z=W?",
                    "How is the architecture of the decomposition with parameters X=Y and Z=W?",
                    "Show me the structure of the decomposition, including the microservices and the classes assigned to each one.",
                    "Give me a breakdown of how many classes are assigned to each microservice in the decomposition.",
                    "Provide the decomposition structure so I can see how the microservices are organized by classes.",
                    "How are entities like X, Y, and Z distributed across microservices in the decomposition?",
                    "Check if any microservices are specifically handling X in the decomposition."
                ],
                description="Get the decomposition structure."
            ),
            Route(
                name="get_stable_solutions",
                utterances=[
                    "Plot the stable solutions in the decomposition space, with a 2D projection and cluster identification for each solution.",
                    "Visualize the stable solutions in the decomposition space, showing their 2D projection and the clusters they belong to.",
                    "Generate a graphical representation of the stable solutions in the decomposition space, highlighting clusters.",
                    "Display a plot of the stable solutions within the decomposition space, with each solution's cluster identified.",
                    "Show a 2D plot of the stable solutions in the decomposition space, emphasizing cluster identification.",
                    "Provide a graphical visualization of stable solutions, showing their position in the decomposition space and the clusters they belong to.",
                    "Return a plot highlighting the stable solutions within the decomposition space, along with their respective clusters.",
                    "Generate a 2D projection of stable solutions in the decomposition space, identifying the clusters for each.",
                    "Visualize the stable solutions in the decomposition space with a plot that highlights their clusters.",
                    "Show a graphical plot of stable solutions, including a 2D projection and the clusters associated with each solution.",
                    "Show the stable solutions",
                    "Get the stable decompositions"
                ],
                description="Get stable solutions/descompositions"
            )
        ]

    def clear_memory(self):
        """Reset the underlying agent by delegating to ``self.agent.reset()``.

        Presumably this drops the agent's accumulated conversation state so the
        next ``chat`` call starts fresh -- confirm against the agent's API.
        """
        self.agent.reset()

    def chat(self, question):
        """Answer ``question``, render the answer as Markdown and return its text.

        Steps:
          1. ``self.query_engine`` produces the textual answer.
          2. The question concatenated with that answer is passed to
             ``self.intent_detector``; its result exposes the matched route
             via ``.name`` (``None`` when nothing matched).
          3. When a route matched, ``self.agent`` is asked the same text, with
             a hint to run the tool named after the route unless the route is
             ``'misc'``. Its ``.response`` is appended to the output.

        Args:
            question: The user's question (a string).

        Returns:
            The Markdown source that was displayed: the textual answer,
            ``<br>``, then the agent's response (empty when no route matched).
            Returning the text lets callers collect answers programmatically;
            the rich display side effect is unchanged.
        """
        # Fall back to an empty string if the engine yields no text, so the
        # string concatenations below cannot fail on None.
        textual_response = self.query_engine.query(question).response or ""
        routed_text = question + textual_response
        intent = self.intent_detector(routed_text)

        graphical_response = None
        if intent.name is not None:
            # The 'misc' route still goes through the agent, just without a tool hint.
            tool_hint = "" if intent.name == 'misc' else "\nTry to execute tool " + intent.name
            graphical_response = self.agent.chat(routed_text + tool_hint)

        agent_text = graphical_response.response if graphical_response else ''
        answer_markdown = f"{textual_response}<br>{agent_text}"
        display(Markdown(answer_markdown))
        return answer_markdown

In [None]:
# Instantiate the decomposition space for the project configured at the top of
# the notebook (PROJECT_PATH / PROJECT_NAME).
decomposition_space = DecompositionSpace(PROJECT_PATH, PROJECT_NAME)

In [None]:
# Chat layer over the decomposition space, used for the interactive query below.
layer = SemanticLayer(decomposition_space)

In [None]:
# Example question; chat() renders its answer in the cell output as Markdown.
layer.chat("How many different kind of decompositions there are?")

# Evaluation

In [None]:
import nest_asyncio

# nest_asyncio.apply() patches asyncio to allow nested event loops; the notebook
# kernel already runs one (presumably needed by the async evaluation calls
# below -- confirm).
nest_asyncio.apply()

In [None]:
# Set the global llama_index embedding model to a HuggingFace model.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [None]:
# Prompt asking the LLM to propose candidate questions about the decompositions.
# Placeholders: {app_description}, {context_str}, {k}, {query_str}.
DEFAULT_TEXT_QA_PROMPT = """\
You are an expert software developer that needs to decompose an objective-oriented monolithic application into microservices, while taking scalability and maintenance factors into consideration.
For this monolithic application, you have several decomposition alternatives at your disposal, and you have to choose the decomposition that best fit your needs.
Since the decompositions are generated by an automated tool, each decomposition includes information about the parameters used by the tool to obtain the decomposition,
such as: the class partitioning and several microservices quality metrics.

Monolithic application: {app_description}
---------------------
{context_str}
---------------------
Your task is to generate {k} specific questions about the decompositions that should help you inform your selection of the best decomposition.

The documentation is as follows:
{query_str}

Please ensure that the questions are based on the documentation provided above and are not compound, and explore different analysis perspectives with respect to the decompositions.
The questions should be formatted as a JSON list.
"""

# Pre-fill the question count and the application description, then wrap the
# result in a new template (presumably leaving {context_str}/{query_str} for the
# generator to fill -- confirm against PromptTemplate.format).
# NOTE(review): the description is hard-coded for the Cargo project and is not
# derived from PROJECT_NAME.
text_question_template = PromptTemplate(
    PromptTemplate(DEFAULT_TEXT_QA_PROMPT).format(
        k=50,
        app_description="Cargo Tracking. The main focus of Cargo Tracking is to move a Cargo (identified by a TrackingId) between two Locations through a RouteSpecification. Once a Cargo becomes available, it is associated with one of the Itineraries (lists of CarrierMovements), selected from existing Voyages. HandlingEvents then trace the progress of the Cargo on the Itinerary. The Delivery of a Cargo informs about its state, estimated arrival time, and being on track.",
    )
)
# The same pre-filled template is passed for both generator prompts.
text_qa_template = text_question_template

question_gen_query = "Your task is to setup questions about the decompositions that should help you inform your selection of the best decomposition."

# Single background document describing the two quality factors of interest.
documentation = [
    Document(
        text="""
Factors:
- Scalability: Microservices are independent. Therefore, their sizing is easily accomplished. For instance, if a microservice is receiving several requests and consequently needs computational resources, it can be scaled individually using a container manager. Scalability then measures the ability of a system to cope with a load increase without sacrificing performance.
- Maintenance: Independence, reduced size, and limited context are characteristics that belong to microservices, which provide ease of maintenance. Therefore, it can be inferred that modification is straightforward, that is, it is possible to modify a microservice directly without major concerns about the impact that will be caused on the other microservices that make up the application, since that there is a low degree of coupling between them. In case of introducing errors to a microservice, they can be fixed more quickly.
 """
    )
]

data_generator = DatasetGenerator.from_documents(
    documentation,
    text_question_template=text_question_template,
    text_qa_template=text_qa_template,
    question_gen_query=question_gen_query,
)
questions = data_generator.generate_questions_from_nodes()
questions

Manual filtering of relevant questions

In [None]:
# Hand-curated evaluation questions. This assignment replaces whatever
# `questions` held before this cell runs.
questions = [
    "What is the number of microservices in each decomposition?",
    "What is the average size of each microservice of each decomposition in terms of number of classes?",
    "What is the largest microservice in each decomposition in terms of number of classes?",
    "What is the smallest microservice in each decomposition in terms of number of classes?",
    "What is the cohesion metric for each microservice in each decomposition?",
    "What is the coupling metric between microservices in each decomposition?",
    "What is the complexity metric for each microservice in the decomposition?",
    "How many dependencies exist between microservices in each decomposition?",
    "What is the average number of interfaces per microservice in each decomposition?",
    "Which decomposition has the highest scalability potential?",
    "Which decomposition has the lowest maintenance complexity?",
    "How many Cargo-related classes are in each microservice of each decomposition?",
    "How many Location-related classes are in each microservice of each decomposition?",
    "How many RouteSpecification-related classes are in each microservice of each decomposition?",
    "How many Itinerary-related classes are in each microservice of each decomposition?",
    "How many CarrierMovement-related classes are in each microservice of each decomposition?",
    "How many HandlingEvent-related classes are in each microservice of each decomposition?",
    "How many Delivery-related classes are in each microservice of each decomposition?",
    "Which decomposition has the most balanced distribution of classes across microservices?",
    "Which decomposition has the least balanced distribution of classes across microservices?",
    "Which decomposition has the highest degree of independence between microservices?",
    "Which decomposition has the lowest degree of independence between microservices?",
    "What is the average number of configuration files per microservice in each decomposition?",
    "Which decomposition has the highest degree of reusability across microservices?",
    "Which decomposition has the lowest degree of reusability across microservices?",
    "Which decomposition has the highest degree of alignment with business capabilities?",
    "Which decomposition has the lowest degree of alignment with business capabilities?"
]

In [None]:
# Run every curated question through a fresh chat layer and collect the
# (query, response) pairs for the relevancy evaluation.
queries = []
responses = []

# NOTE(review): the meaning of the second positional argument (True) is not
# visible from this cell -- confirm against SemanticLayer's constructor.
layer = SemanticLayer(decomposition_space, True)

for query in questions:
    answer = None
    try:
        answer = layer.chat(query)
    except Exception:
        # Best-effort run: record the failure (with traceback) and continue.
        # Catching Exception rather than using a bare `except` keeps
        # KeyboardInterrupt usable for stopping the loop.
        logging.exception("chat() failed for query %r", query)
    finally:
        # Always reset the agent so one question cannot leak into the next.
        layer.clear_memory()

    if answer is None:
        logging.warning("No response text collected for query %r", query)

    queries.append(query)
    # The evaluator needs a string; "None" is the placeholder for missing answers.
    responses.append("None" if answer is None else str(answer))


In [None]:
from llama_index.core.evaluation import (
    AnswerRelevancyEvaluator
)
import tqdm

# One judge per metric; only answer relevancy is evaluated here.
judges = {"answer_relevancy": AnswerRelevancyEvaluator()}

# Evaluation results per metric, in the same order as `queries`.
evals = {
    "answer_relevancy": [],
}

# zip() has no length, so pass the total explicitly to get a real progress bar.
for query, response in tqdm.tqdm(zip(queries, responses), total=len(queries)):
    answer_relevancy_result = judges["answer_relevancy"].evaluate(
        query=query,
        response=response,
    )
    evals["answer_relevancy"].append(answer_relevancy_result)

Some evaluator outputs are flagged as invalid because they do not match the expected output format, so for those we recover the score from the `Final Result` line of the feedback text.

In [None]:
import re

# Results the evaluator flagged as invalid.
filtered_results = [
    candidate for candidate in evals["answer_relevancy"] if candidate.invalid_result
]

# (query, raw score text) for every invalid result whose feedback contains a
# "Final Result: [...] <number>" line (plain or bold).
extracted_scores = []
for candidate in filtered_results:
    found = re.search(r"(?:\n\n\*\*Final Result:\*\*\s*|\n\nFinal Result:\s*)\[[^\]]+\]\s*(\d+(?:\.\d+)?)", candidate.feedback)
    if found is not None:
        extracted_scores.append((candidate.query, found.group(1)))

# First recovered score per query, halved (presumably to mirror the evaluator's
# own score normalisation -- confirm).
recovered_by_query = {}
for query_text, raw_score in extracted_scores:
    recovered_by_query.setdefault(query_text, float(raw_score) / 2)

# Patch invalid results in place; queries with no recovered score get None.
for candidate in filtered_results:
    candidate.score = recovered_by_query.get(candidate.query)

final_results = list(evals["answer_relevancy"])
final_results

In [None]:
# `final_results` is a plain list of evaluation results, so it cannot be indexed
# by column name. Gather the scores into a float Series first; results without a
# score (None) become NaN and are skipped by mean().
pd.Series(
    [result.score for result in final_results],
    dtype="float64",
    name="Answer relevancy score",
).mean()

In [None]:
# `final_results` is a plain list (no .to_csv), so tabulate the fields of each
# evaluation result before writing them next to the project data.
pd.DataFrame(
    {
        "Query": [result.query for result in final_results],
        "Response": [result.response for result in final_results],
        "Answer relevancy score": [result.score for result in final_results],
        "Feedback": [result.feedback for result in final_results],
    }
).to_csv(f"{PROJECT_PATH}/results.csv")
