# TODOs

In [None]:
from IPython.display import display, Markdown

display(Markdown('TODO.md'))

# Design Pattern Recognition with Software Metrics

## Library/Package Imports
All required modules should be in the next cell to avoid scattered imports

In [None]:
# Ignore missing imports warnings in vs code
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from typing import Callable
import ipywidgets as widgets
from IPython.display import display, HTML
from typing import Optional, Dict, List
import numpy as np
from enum import Enum, auto
from constants import ClassMetricVectorConstants, get_label_column, get_metric_columns

In [None]:
# Common utility functions
def generate_subplot(df: pd.DataFrame, plot_func: Callable[[pd.DataFrame, str], go.Figure], subplot_width: int = 600, subplot_height: int = 2400) -> go.Figure:
    metric_columns = get_metric_columns()
    subplots = make_subplots(
        len(metric_columns), subplot_titles=metric_columns)
    for i, metric in enumerate(metric_columns):
        figure = plot_func(df, metric)
        subplots.add_trace(figure, row=i+1, col=1)
    subplots['layout'].update(height=subplot_height, width=subplot_width)
    return subplots


def generate_selectable_graph_for_metrics(df: pd.DataFrame, initial_plot_func: Callable[[], go.Figure], update_func: Callable[[go.Figure, pd.DataFrame, str], None], y_label: Optional[str] = None):
    metric_dropdown = widgets.Dropdown(options=get_metric_columns())
    fig = go.FigureWidget(initial_plot_func())

    def on_metric_changed(change):
        metric = change['new']
        with fig.batch_update():
            figure = fig.data[0]
            update_func(figure, df, metric)
            figure['name'] = metric
            label = y_label if y_label else ' '
            fig.update_layout(title=metric, bargap=0.5,
                              xaxis_title=metric, yaxis_title=label)

    metric_dropdown.observe(on_metric_changed, names='value')
    display(widgets.VBox([metric_dropdown, fig]))

## Generation of metrics

If the metrics are not yet generated, the following steps are required:

1. Make sure that `source_files.zip` is located in the current directory. The archive contains the actual zipped source code of the projects in [P-MArT](https://www.ptidej.net/tools/designpatterns/) and `pmart.xml` with descriptions of the micro architectures
2. Create a new virtual Python environment with `python -m venv .` in the current directory if not yet done
3. Activate the virtual environment ([refer here for the actual command to run](https://docs.python.org/3/library/venv.html#how-venvs-work))
4. Execute `python3 preprocess_source_files.py` to extract the source files from `source_files.zip` and move the source files described in `pmart.xml` into `dataset` directory. For more information run `python3 preprocess_source_files.py -h`.
    - Source files are structured as `<dataset_dir>/<design_pattern/micro_architecture_<id>`
    - Each micro architecture directory contains the following files:
        - `roles.csv`: Roles, entity names and role kind as described in `pmart.xml`
        - `projects.txt`: From which project the source files come from
        - The source files to be evaluated
5. 
    - **OLD**: Execute `python3 generate_source_file_metrics.py` to generate `metrics.csv`. For more information run `python3 generate_source_file_metrics.py`.
    - **NEW**: Execute `docker build --file docker/sourcefileparser.dockerfile . -t sourcefilerparser:latest` in the `project` directory to build the tool and run `docker run -v ./:/home/app/volume  -e DATASET_PATH=./dataset -e OUTPUT_CSV=./m.csv sourcefilerparser:latest` for metric generation

**NOTES**: 
- As the projects in this dataset are old and not all projects listed in P-MaRT are not accessible, some source files and their entries in `metrics` may be missing.
- The tool for generating the metrics was originally written with a Java Parser implemented Python only. This lead to parsing issues in some source files. As a result, the tool was rewritten as a Java project with a native parser. The original Python script is included for completeness.

## Overview about `metrics.csv`

In order to detect applied Gang Of Four design patterns in source code with machine learning strategies, we first need to transform the source file into a numerical representation that can be understood by a machine learning model.
This approach aims to solve this by generating numerical characteristics for each source file in the context of the regarded micro architecture. As there are several methods to define what metrics to include in the evaluation, the metrics as described [in this paper](../sources/JSEA-DP-2014.pdf):

- NOF: Number of fields
- NSF: Number of static fields
- NOM: Number of methods
- NSM: Number of static methods
- NOAM: Number of abstract methods
- NORM: Number of overridden methods
- NOPC: Number of private constrcutors
- NOOF: Number of object fields
- NCOF: Number of other classes with field of own type


In addition to these metrics, the following Chidamber & Kemerer object-oriented metrics were added to quantify the relation, coupling and cohesion between participants in a design pattern:

- FAN_IN: Number of input dependencies
- FAN_OUT: Number of output dependencies
- CBO: Coupling between objects
- NOC: Number of inheriting children
- RFC: Response for a class (number of unique method invocations in a class)
- TCC: Tight class cohesion (via direct connections between visible methods, two methods or their invocation trees access the same class variable)
- LCC: Low class cohesion

## Outlier Detection and Removal

As the dataset may contain a varied implementation of datasets, outlier detection and removal may be required to reduce the noise in the dataset. `sklearn` provides the some automatic and unsupervised approaches out of the box. The following are considered

**NOTE**: This list is subject to change

* Isolation Forest
* Local Outlier Factor

In [None]:
# Required imports for this section
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

### Isolation Forest

In [None]:
def apply_isolation_forest(df: pd.DataFrame):
    df_filtered = df.copy()
    isolation_forest = IsolationForest(contamination=0.1)
    df_filtered['outlier'] = isolation_forest.fit_predict(
        df_filtered[get_metric_columns()])
    df_filtered = df_filtered[df_filtered['outlier'] == 1]
    return df_filtered.drop(columns=['outlier'])

### Local Outlier Factor

In [None]:
def apply_local_outlier_factor(df: pd.DataFrame) -> pd.DataFrame:
    threshold = 0

    df_copy = df.copy()
    lof = LocalOutlierFactor(contamination=0.5)
    df_copy['outlier_score'] = lof.fit_predict(df_copy[get_metric_columns()])
    df_copy = df_copy[df_copy['outlier_score'] > threshold]
    return df_copy.drop(columns=('outlier_score'))

## Explorative Data Analysis of the Dataset

In [None]:
df = pd.read_csv('./metrics.csv')
df = df.dropna()
#df = apply_isolation_forest(df)
print(f'{df.shape[0]} rows were imported')

In [None]:
df[ClassMetricVectorConstants.ROLE] = df[ClassMetricVectorConstants.ROLE].str.lower().str.strip()
df[ClassMetricVectorConstants.ROLE_KIND] = df[ClassMetricVectorConstants.ROLE_KIND].str.lower().str.strip()

In [None]:
# Check if columns in dataframe have expected types
df.dtypes

### Filter Dataframe entries by micro architecture

In [None]:
micro_arches = df[ClassMetricVectorConstants.MICRO_ARCHITECTURE].unique().tolist()

def view(micro_arch=''):
    cols = [ClassMetricVectorConstants.ROLE_KIND, ClassMetricVectorConstants.ENTITY] + get_metric_columns()
    display(df[df[ClassMetricVectorConstants.MICRO_ARCHITECTURE] == micro_arch]
            [cols], clear=True)


w = widgets.Dropdown(options=micro_arches)
widgets.interactive(view, micro_arch=w)

### Corelation Between Columns
For each column we caclulate pairwaise the coefficient of corelation with other columns. The value of the coefficient can be interpreteted as:

- between -1.0 and 0: Negative correlation; a increase in one column expects a decrease in the other; the lower the bigger the impact
- equals 0: No correlation
- between 0 and 1: Postive correlation; a increase in one column causes an increase the other; the higher the bigger the impact

In [None]:
df_corr = df[get_metric_columns()].copy()
corr = df_corr.corr()
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x=corr.columns,
        y=corr.index,
        z=np.array(corr),
        text=corr.values,
        texttemplate='%{text:.2f}'
    )
)
fig.show()


### Distribution of roles

In [None]:
temp = df.groupby([ClassMetricVectorConstants.ROLE]).size()
temp = temp.sort_values(ascending=False).reset_index()
px.bar(temp, x=ClassMetricVectorConstants.ROLE, y=0).update_layout(yaxis_title='count')

### Distribution of design patterns

In [None]:
df_binned_by_role = df.copy()
df_binned_by_role = df_binned_by_role.drop_duplicates(
    [ClassMetricVectorConstants.MICRO_ARCHITECTURE, ClassMetricVectorConstants.DESIGN_PATTERN]).reset_index()
df_binned_by_role = df_binned_by_role[ClassMetricVectorConstants.DESIGN_PATTERN].value_counts(
).reset_index()

fig = px.histogram(df_binned_by_role, x=ClassMetricVectorConstants.DESIGN_PATTERN, y='count')
fig.update_layout(xaxis_title='Design Pattern',
                  yaxis_title='Count of Design Pattern')

### Distribution for metrics

In [None]:
def initial_histogram():
    return go.Histogram(
        histfunc='count',
    )


def update_histogram(figure: go.Figure, df: pd.DataFrame, metric: str):
    figure['x'] = df[metric]


generate_selectable_graph_for_metrics(
    df, initial_histogram, update_histogram, 'count')

### Box Plots for metrics

In [None]:
def initial_histogram():
    return go.Box(
    )


def update_histogram(figure: go.Figure, df: pd.DataFrame, metric: str):
    figure['x'] = df[metric]


generate_selectable_graph_for_metrics(df, initial_histogram, update_histogram)

## Model Training

As design patterns can be considered as small scale appliances of software architecture, they consist of different entities with different relationships and roles to fulfill in the regarded design pattern. In order to detect design patterns, we first need to detect what kind of role a given Java class or entity it most likely corresponds to. To achieve this, machine learning model capable of classifying multiple labels should be considered. The extracted software metrics are the numerical inputs and the most likely roles in a design pattern are the result. 
As this falls in the area of supervised machine learning, initially the following models/techniques are to be considered:

**NOTE:** This list is subject to change 

* Support Vector Machines
* Tree Classifiers
* Ensemble Classifiers (e.g Random Forest Classifier)
* Custom Convoluted Network

In order to optimize the given results of a given model, first RandomGridSearch is applied to determine a range of values or selection for the hyperparameters while GridSearch is used to determine the most optimal available value or selection for the regarded hyperparameter.

In [76]:
# Required import for machine learning
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import hpsklearn
import hyperopt
from dataclasses import dataclass, field
import joblib
from sklearn.model_selection import cross_val_score, KFold
import numpy as np
from sklearn.base import clone
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from collections import defaultdict
import joblib

In [None]:
class Dataset:
    train: pd.DataFrame
    test: pd.DataFrame
    label_col: List[str]
    feature_cols: List[str]
    #roleKindEncoder: LabelEncoder
    roleEncoder: LabelEncoder
    dataset: pd.DataFrame
    
    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        self.label_col = get_label_column()
        self.feature_cols = get_metric_columns()
        self.train = train_df
        self.test = test_df
        self.roleEncoder = LabelEncoder()
        self.roleEncoder.fit(self.train[ClassMetricVectorConstants.ROLE])
        train_df[ClassMetricVectorConstants.ROLE] = self.roleEncoder.transform(train_df[ClassMetricVectorConstants.ROLE])
        test_df[ClassMetricVectorConstants.ROLE] = self.roleEncoder.transform(test_df[ClassMetricVectorConstants.ROLE])

    


    @classmethod
    def top_k_design_patterns(cls, df: pd.DataFrame, k: int) -> "Dataset":
        train, test = split_by_micro_arch(df, k)
        return cls(train, test)
        

    def get_X_train(self):
        return self.train[self.feature_cols]
    
    def get_Y_train(self):
        return self.train[self.label_col].values.ravel()

    def get_X_test(self):
        return self.test[self.feature_cols]

    def get_Y_test(self):
        return self.test[self.label_col].values.ravel()


def get_top_k_labels(df: pd.DataFrame, k: int):
    df_binned_by_role = df.copy()
    df_binned_by_role = df_binned_by_role.drop_duplicates(
        [ClassMetricVectorConstants.MICRO_ARCHITECTURE, ClassMetricVectorConstants.DESIGN_PATTERN])
    df_binned_by_role = df_binned_by_role[ClassMetricVectorConstants.DESIGN_PATTERN].value_counts(
    ).sort_values(ascending=False).head(k)
    return df_binned_by_role.index.to_list()


def split_by_micro_arch(df: pd.DataFrame, k: int):
    train_data_size = 0.4
    dp_map = defaultdict(list)
    top_patterns = get_top_k_labels(df, k)
    df_top_k = df[df[ClassMetricVectorConstants.DESIGN_PATTERN].isin(top_patterns)].reset_index()
    micro_arches = df_top_k[ClassMetricVectorConstants.MICRO_ARCHITECTURE].unique().tolist()
    for m in micro_arches:
        micro_rows = df_top_k[df_top_k[ClassMetricVectorConstants.MICRO_ARCHITECTURE] == m].copy().reset_index(drop=True)
        dp_key = micro_rows[ClassMetricVectorConstants.DESIGN_PATTERN].iloc[0]
        dp_map[dp_key].append(micro_rows)
    [dfs.sort(key=lambda l: -len(l)) for dfs in dp_map.values()]
    train_data = []
    test_data = []
    for micro_arches in dp_map.values():
        list_size = len(micro_arches)
        train_size = round(list_size * train_data_size)

        train_data.append(pd.concat(micro_arches[:train_size].copy()).reset_index(drop=True))
        test_data.append(pd.concat(micro_arches[train_size:].copy()).reset_index(drop=True))

    train = pd.concat(train_data, ignore_index=True)
    test = pd.concat(test_data, ignore_index=True)

    return train, test

def scoring(target, pred):
        return -f1_score(target, pred, average='micro')

In [None]:
dataset = Dataset.top_k_design_patterns(df, 4)

### Support Vector Machines

In [None]:
def apply_svm(dataset: Dataset):
    X_train = dataset.get_X_train()
    y_train = dataset.get_Y_train()
    
    X_test = dataset.get_X_test()
    y_test = dataset.get_Y_test()

    standard_scaler = StandardScaler()
    X_train = standard_scaler.fit_transform(X_train)
    X_test = standard_scaler.fit_transform(X_test)

    svm_classifier = SVC(kernel='rbf', gamma=0.1, C=1.75)
    svm_classifier.fit(X_train, y_train)

    pred = svm_classifier.predict(X_test)
    return f1_score(y_test, pred, average='micro')

apply_svm(dataset)

### Random Forest Classifier

In [None]:
def apply_random_forest(dataset: Dataset):
    X_train = dataset.get_X_train()
    y_train = dataset.get_Y_train()

    X_test = dataset.get_X_test()
    y_test = dataset.get_Y_test()

    standard_scaler = StandardScaler()
    X_train = standard_scaler.fit_transform(X_train)
    X_test = standard_scaler.fit_transform(X_test)

    random_forest_classifier = RandomForestClassifier(
        max_depth=30, random_state=1)
    random_forest_classifier.fit(X_train, y_train)

    pred = random_forest_classifier.predict(X_test)
    return  f1_score(y_test, pred, average='micro')


apply_random_forest(dataset)

### Get Best Possible Classifier with hyperopt-sklearn

In [None]:
def apply_hyperopt(dataset: Dataset, evals: int = 10):
    
    
    X_train = dataset.get_X_train()
    y_train = dataset.get_Y_train()


    X_test = dataset.get_X_test()
    y_test = dataset.get_Y_test()


    standard_scaler = StandardScaler()
    X_train = standard_scaler.fit_transform(X_train)
    X_test = standard_scaler.fit_transform(X_test)
    
    chosen_classifiers = [
        hpsklearn.random_forest_classifier('random_forest'),
        hpsklearn.k_neighbors_classifier('knn'),
        hpsklearn.svc('svm')
    ]

    p = 1 / len(chosen_classifiers)
    classifiers = hyperopt.hp.pchoice('cls', [(p, c) for c in chosen_classifiers])

    hyper_estimator = hpsklearn.HyperoptEstimator(
        classifier=classifiers,
        preprocessing=[],
        max_evals=evals,
        algo=hyperopt.tpe.suggest,
        trial_timeout=20,
        loss_fn=scoring,
        
        
    )

    hyper_estimator.fit(X_train, y_train)
    best_model = hyper_estimator.best_model()['learner']
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    return hyper_estimator.score(X_test, y_test), best_model


In [31]:
def get_best_iteration(df: pd.DataFrame, top_k: int, max_evals: int, k_split: int):
    dataset = Dataset.top_k_design_patterns(df, top_k)
    score, estimator = apply_hyperopt(dataset, evals=max_evals)
    #unfitted_estimator = clone(estimator)
    cross_score = cross_val_score(estimator, dataset.get_X_train(), dataset.get_Y_train(), cv=k_split, scoring='f1_macro')
    joblib.dump(estimator, 'estimater.joblib')
    return f'HyperOpt-Score: {score} Mean Cross Validation Score: {cross_score}'
    
get_best_iteration(df, 4, 50, 5)

100%|██████████| 1/1 [00:00<00:00,  2.05trial/s, best loss: -0.0]
100%|██████████| 2/2 [00:00<00:00,  1.02trial/s, best loss: -0.4]
100%|██████████| 3/3 [00:00<00:00,  2.07trial/s, best loss: -0.4]
100%|██████████| 4/4 [00:00<00:00,  1.02trial/s, best loss: -0.4]
100%|██████████| 5/5 [00:00<00:00,  2.06trial/s, best loss: -0.4]
100%|██████████| 6/6 [00:00<00:00,  2.10trial/s, best loss: -0.4]
100%|██████████| 7/7 [00:00<00:00,  2.10trial/s, best loss: -0.4]
100%|██████████| 8/8 [00:00<00:00,  2.09trial/s, best loss: -0.4]
100%|██████████| 9/9 [00:00<00:00,  1.02trial/s, best loss: -0.4]
100%|██████████| 10/10 [00:00<00:00,  2.06trial/s, best loss: -0.4]
100%|██████████| 11/11 [00:00<00:00,  2.06trial/s, best loss: -0.4]
100%|██████████| 12/12 [00:04<00:00,  4.01s/trial, best loss: -0.4]
100%|██████████| 13/13 [00:00<00:00,  2.04trial/s, best loss: -0.4]
100%|██████████| 14/14 [00:00<00:00,  2.07trial/s, best loss: -0.4]
100%|██████████| 15/15 [00:00<00:00,  1.02trial/s, best loss: -0.4

Process Process-124:



 97%|█████████▋| 29/30 [00:00<?, ?trial/s, best loss=?]

Traceback (most recent call last):

  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()

  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)

  File "/home/memi/Dokumente/master_thesis/project/.venv/lib/python3.10/site-packages/hpsklearn/estimator/_cost_fn.py", line 211, in _cost_fn
    learner.fit(XEXfit, yfit)

  File "/home/memi/Dokumente/master_thesis/project/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)

  File "/home/memi/Dokumente/master_thesis/project/.venv/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 478, in fit
    trees = [

  File "/home/memi/Dokumente/master_thesis/project/.venv/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 479, in <listcomp>
    self._make_estimator(append=False, random_state=random_state)

  File "/home/memi/Dokumente/master_thesi




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00        11
           3       1.00      0.25      0.40         4
           4       0.00      0.00      0.00        14
           5       0.00      0.00      0.00        12
           6       0.57      0.33      0.42        12
           7       0.26      0.75      0.39         8
           8       0.25      0.12      0.17         8
           9       0.00      0.00      0.00         5
          10       0.33      0.11      0.17         9
          11       0.00      0.00      0.00         3
          12       0.03      0.33      0.06         3

    accuracy                           0.15        95
   macro avg       0.19      0.15      0.12        95
weighted avg       0.19      0.15      0.13        95



'HyperOpt-Score: 0.14736842105263157 Mean Cross Validation Score: [0.17592593 0.21223545 0.29604236 0.24490741 0.21132713]'

In [59]:
import re

s = "adaptee       0.02      0.33      0.04         3\n         adapter       0.00      0.00      0.00         3\n          client       0.00      0.00      0.00        11\n         command       0.00      0.00      0.00         4\n concreteCommand       0.43      0.93      0.59        14\nconcreteObserver       0.00      0.00      0.00        12\n concreteSubject       0.00      0.00      0.00        12\n         invoker       0.10      0.25      0.14         8\n        observer       0.00      0.00      0.00         8\n        receiver       0.00      0.00      0.00         5\n       singleton       0.00      0.00      0.00         9\n         subject       0.00      0.00      0.00         3\n          target       0.00      0.00      0.00         3\n\n        accuracy                           0.17        95\n       macro avg       0.04      0.12      0.06        95\n    weighted avg       0.07      0.17      0.10        95\n"
tab = ""
for r in s.split("\n"):
    t = [i for i in re.split(r'\s', r) if i.strip()]
    t = t[:4]
    if t:
        tab= tab + " & ".join(t) + "\\\\"

print(tab)


adaptee & 0.02 & 0.33 & 0.04\\adapter & 0.00 & 0.00 & 0.00\\client & 0.00 & 0.00 & 0.00\\command & 0.00 & 0.00 & 0.00\\concreteCommand & 0.43 & 0.93 & 0.59\\concreteObserver & 0.00 & 0.00 & 0.00\\concreteSubject & 0.00 & 0.00 & 0.00\\invoker & 0.10 & 0.25 & 0.14\\observer & 0.00 & 0.00 & 0.00\\receiver & 0.00 & 0.00 & 0.00\\singleton & 0.00 & 0.00 & 0.00\\subject & 0.00 & 0.00 & 0.00\\target & 0.00 & 0.00 & 0.00\\accuracy & 0.17 & 95\\macro & avg & 0.04 & 0.12\\weighted & avg & 0.07 & 0.17\\


In [101]:
@dataclass
class DpRole:
    role_name: str
    mutliple_occurences: bool = field(default=False)

#Singelton-, Adapter-, Command- und Observer-Entwurfsmuster 
design_patterns_to_detect = {
    'Singleton': [DpRole('singleton')],
    'Adapter': [DpRole('target'), DpRole('client', True), DpRole('adaptee', True), DpRole('adapter', True)],
    'Command': [DpRole('command'), DpRole('concreteCommand', True), DpRole('client'), DpRole('invoker'), DpRole('receiver')],
    'Observer': [DpRole('subject'), DpRole('observer'), DpRole('concreteSubject'), DpRole('concreteObserver', True)]
}


dataset = Dataset.top_k_design_patterns(df, 4)
random_micro_arch = dataset.test.groupby([ClassMetricVectorConstants.DESIGN_PATTERN]).sample(4)[ClassMetricVectorConstants.MICRO_ARCHITECTURE].to_list()

vectors = []
for m in random_micro_arch:
    v = dataset.test[dataset.test[ClassMetricVectorConstants.MICRO_ARCHITECTURE] == m].copy().reset_index()
    vectors.append(v)

model = joblib.load('estimater.joblib')

def match_pattern(role_encoder: LabelEncoder, roles_df: pd.DataFrame, dps: dict):
    # X = roles_df[get_metric_columns()]
    # y = model.predict(X)
    # roles = role_encoder.inverse_transform(y.ravel())
    # scores = defaultdict(int)
    # for dp, dp_roles in dps.items():
    #     single_roles = set()
    #     role_freq = defaultdict(int)
    #     for role in roles:
    #         dp_role = None
    #         for d in dp_roles:
    #             if d.role_name == role:
    #                 dp_role = d
    #                 break
    #         if dp_role and not dp_role.mutliple_occurences and dp_role.role_name not in single_roles:
    #             role_freq[dp_role.role_name] += 1
    #             single_roles.add(dp_role.role_name)
    #         elif dp_role and dp_role.role_name not in single_roles and dp_role.mutliple_occurences:
    #             role_freq[dp_role.role_name] += 1
    #         else:
    #             role_freq[role] = 0
    #     score = sum([freq for freq in role_freq.values()])/len(roles)
    #     scores[dp] = score
    # return scores

    X = roles_df[get_metric_columns()]
    X = StandardScaler().fit_transform(X)
    y = model.predict(X)
    roles = role_encoder.inverse_transform(y.ravel())
    scores = defaultdict(float)

    predicted_role_freq = defaultdict(int)
    for role in roles:
        predicted_role_freq[role] += 1

    for dp, dp_roles in dps.items():
        matched_roles = defaultdict(int)

        dp_roles_dict = {d.role_name: d for d in dp_roles}

        for role in predicted_role_freq:
            if role in dp_roles_dict:
                dp_role = dp_roles_dict[role]
                if (dp_role.mutliple_occurences or predicted_role_freq[role] == 1) and matched_roles[role] < predicted_role_freq[role]:
                    matched_roles[role] += 1

        total_possible_matches = sum([1 for d in dp_roles if d.mutliple_occurences or d.role_name in predicted_role_freq])
        score = sum(matched_roles.values()) / total_possible_matches if total_possible_matches > 0 else 0
        scores[dp] = score

    return scores

labels = []
preds = []
for v in vectors:
    label = v[ClassMetricVectorConstants.DESIGN_PATTERN][0]
    labels.append(label)
    predictions = match_pattern(dataset.roleEncoder, v, design_patterns_to_detect)
    p = max(predictions.items(), key=lambda x: x[1])
    preds.append(p[0])

print(labels)
print(preds)
print('Entire Process')
print(classification_report(labels, preds))

print('Model')
predictions = model.predict(dataset.get_X_test())
predictions = dataset.roleEncoder.inverse_transform(predictions)
real_labels = dataset.roleEncoder.inverse_transform(dataset.get_Y_test())
print(classification_report(real_labels, predictions))

    

print("Params for Random Forest")
print(model.get_params())

['Adapter', 'Adapter', 'Adapter', 'Adapter', 'Command', 'Command', 'Command', 'Command', 'Observer', 'Observer', 'Observer', 'Observer', 'Singleton', 'Singleton', 'Singleton', 'Singleton']
['Adapter', 'Command', 'Adapter', 'Command', 'Observer', 'Observer', 'Singleton', 'Command', 'Observer', 'Adapter', 'Observer', 'Command', 'Command', 'Command', 'Command', 'Command']
Entire Process
              precision    recall  f1-score   support

     Adapter       0.67      0.50      0.57         4
     Command       0.12      0.25      0.17         4
    Observer       0.50      0.50      0.50         4
   Singleton       0.00      0.00      0.00         4

    accuracy                           0.31        16
   macro avg       0.32      0.31      0.31        16
weighted avg       0.32      0.31      0.31        16

Model
                  precision    recall  f1-score   support

         adaptee       0.02      0.33      0.04         3
         adapter       0.00      0.00      0.00        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
