In [None]:
# Runtime dependencies for this notebook (one install per line).
!pip install matplotlib~=3.6.3
!pip install networkx~=2.5
!pip install numpy~=1.22.4
!pip install pandas~=1.3.5
!pip install scipy~=1.9.3
!pip install tqdm~=4.64.1
!pip install umap-learn~=0.5.3
!pip install python-louvain~=0.16
!pip install fuzzy-c-means
!pip install hmmlearn~=0.2.8
!pip install scikit-learn~=1.1.3
!pip install seaborn~=0.12.2
!pip install karateclub~=1.3.3
# Required by the graph section (`from sknetwork.embedding import ...`).
# NOTE(review): no version was recorded for this package; pin it once the working version is known.
!pip install scikit-network

## utils

In [None]:
import numpy as np
from sklearn.decomposition import PCA, FastICA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, BisectingKMeans
from sklearn.manifold import SpectralEmbedding, Isomap, MDS
from fcmeans import FCM
from sklearn.cluster import SpectralClustering
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.neighbors import NearestNeighbors
from umap import UMAP


def k_mean(k, data: np.ndarray):
    """Cluster ``data`` into ``k`` groups with KMeans (at most 1000 iterations).

    Returns the label array produced by ``fit_predict``.
    """
    return KMeans(k, max_iter=1000).fit_predict(data)


def fuzzy_c_means(k, data: np.ndarray):
    """Fit a fuzzy c-means model with ``k`` clusters and return its prediction for every row of ``data``."""
    fcm = FCM(n_clusters=k, n_jobs=8)
    fcm.fit(data)
    assignments = fcm.predict(data)
    return assignments


def gaussian_mixture(k: int, data: np.ndarray):
    """Fit a ``k``-component Gaussian mixture (up to 3000 iterations) and return the component label of each row."""
    mixture = GaussianMixture(k, max_iter=3000)
    return mixture.fit_predict(data)


def hierarchical_clustering(k: int, data: np.ndarray):
    """Agglomerative (Ward linkage) clustering of ``data`` into ``k`` clusters.

    Args:
        k: number of clusters to extract.
        data: samples to cluster, one row per sample.

    Returns:
        The label array produced by ``fit_predict``.
    """
    # Ward linkage only works with Euclidean distances, which is already the
    # estimator's default. The explicit `affinity` keyword is therefore omitted:
    # it is deprecated since scikit-learn 1.2 and removed in 1.4, so leaving it
    # out keeps the call valid on both the pinned and newer releases.
    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    return model.fit_predict(data)


def birch(k: int, data: np.ndarray):
    """Run BIRCH with ``k`` final clusters and return the label of each row of ``data``."""
    return Birch(n_clusters=k).fit_predict(data)


def spectral_clustering(k: int, data: np.ndarray):
    """Spectral clustering into ``k`` clusters using the 'discretize' label assignment on all available cores."""
    clusterer = SpectralClustering(
        n_clusters=k,
        assign_labels='discretize',
        n_jobs=-1,
    )
    return clusterer.fit_predict(data)


def dbscan(k: int, data: np.ndarray):
    """Run DBSCAN with an ``eps`` derived from the data's nearest-neighbour distances.

    ``k`` is used as the neighbour count for the distance estimate, not as a
    cluster count. For every row the mean distance to its returned neighbours
    (first column excluded) is computed, and the 0.98 quantile of those means
    becomes ``eps``. ``min_samples`` is fixed to 1.
    """
    knn = NearestNeighbors(n_neighbors=k).fit(data)
    neighbour_distances = knn.kneighbors(data)[0]
    # The first column is dropped -- presumably the query point itself at distance 0.
    mean_distance_per_row = neighbour_distances[:, 1:].mean(axis=1)
    eps = np.quantile(mean_distance_per_row, q=0.98)
    return DBSCAN(eps=eps, min_samples=1, n_jobs=-1).fit_predict(data)


def bisecting_kmeans(k: int, data: np.ndarray):
    """Cluster ``data`` into ``k`` groups with BisectingKMeans and return the labels."""
    return BisectingKMeans(n_clusters=k).fit_predict(data)


# Registry of clustering routines, keyed by display name. Every value is a
# function called as ``fn(k, data)`` that returns one label per row.
# `spectral_clustering` is defined above but is not registered here.
clustering_algorithms = {
    "k_mean": k_mean,
    "fuzzy_c_means": fuzzy_c_means,
    "gaussian_mixture": gaussian_mixture,
    "hierarchical_clustering": hierarchical_clustering,
    "birch": birch,
    "dbscan": dbscan,
    "bisecting_kmeans": bisecting_kmeans
}

# Registry of dimensionality-reduction factories, keyed by display name.
# Each non-None value takes the target dimension ``k`` and returns a fresh
# estimator; a None value means "use the data as is". The commented-out
# entries are alternatives that are currently disabled.
dim_reduction_algorithms = {
    "MDS": lambda k: MDS(n_components=k, n_jobs=-1, max_iter=300),
    # "PCA": lambda k: PCA(n_components=k),
    # "FastICA": lambda k: FastICA(n_components=k, max_iter=200),
    "without_reduction": None,
    # "Isomap": lambda k: Isomap(n_components=k, n_jobs=-1, max_iter=200),
    # "SpectralEmbedding": lambda k: SpectralEmbedding(n_components=k, n_jobs=-1),
    # "LLE": lambda k: LocallyLinearEmbedding(n_components=k, n_jobs=-1),
    # "UMAP": lambda k: UMAP(n_neighbors=100, n_components=k, n_epochs=1000, init='spectral', low_memory=False, verbose=False)
}

# Registry of anomaly detectors, keyed by display name. Unlike the reduction
# registry these are already-constructed estimator instances (shared by every
# caller), and the None entry stands for "no anomaly filtering".
anomaly_detection_algorithms = {
    "without_anomaly": None,
    "OneClassSVM": OneClassSVM(kernel="rbf", nu=0.01, gamma='scale'),
    "IsolationForest": IsolationForest(random_state=0, n_jobs=-1, n_estimators=500, max_samples=256),
    "DBSCAN": DBSCAN(eps=5, min_samples=5, n_jobs=-1)
}


In [None]:
def find_best_algo(scores_mapping: "Dict[Any, List[float]]") -> "Tuple[Any, float, float, str]":
    """Pick the best-scoring key of ``scores_mapping`` using ANOVA followed by a paired t-test.

    A one-way ANOVA is run over all score lists. If its p-value is below
    ``P_VALUE_THR`` the keys are ranked by mean score (highest first), the top
    key is returned, and a paired t-test compares the two highest-ranked keys.
    Otherwise a randomly chosen key is returned.

    Args:
        scores_mapping: candidate -> list of scores. At least two candidates
            are required when the ANOVA is significant, and the two top
            candidates must have score lists of equal length for the paired test.

    Returns:
        ``(best_key, anova_p_value, t_test_p_value, msg)``. ``t_test_p_value`` is
        ``-1`` when no t-test was run; ``msg`` is non-empty when one of the tests
        could not tell the candidates apart.
    """
    # This cell precedes the cell that imports typing / random / scipy.stats, so
    # on a fresh kernel the original annotations raised NameError at definition
    # time. String annotations plus function-scope imports make the definition
    # self-contained. `np` comes from the first code cell; `P_VALUE_THR` is only
    # looked up when the function is called.
    import random
    from scipy.stats import f_oneway, ttest_rel

    _, p_value = f_oneway(*list(scores_mapping.values()))
    # Default answer when the candidates cannot be told apart: a random one.
    best_algo = random.choice(list(scores_mapping.keys()))
    t_test_p_value = -1
    print(f"annova value: {p_value}")
    msg = ""
    if p_value < P_VALUE_THR:
        sorted_scores = sorted(
            scores_mapping,
            key=lambda key: np.mean(scores_mapping[key]),
            reverse=True
        )
        candidate1, candidate2 = sorted_scores[:2]
        _, t_test_p_value = ttest_rel(
            scores_mapping[candidate1],
            scores_mapping[candidate2]
        )
        print(f"t_test value: {t_test_p_value}")
        best_algo = sorted_scores[0]
        if t_test_p_value >= P_VALUE_THR:
            msg = f"followed by t-test: algorithms {candidate1}, {candidate2} are the same"
            print(msg)
    else:
        msg = f"followed by annova: algorithms {scores_mapping.keys()} are the same"
        print(msg)
    return best_algo, p_value, t_test_p_value, msg


In [None]:
import os
import pickle
import random
from collections import defaultdict
from pathlib import Path
from typing import List, Tuple, Any, Dict
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, mutual_info_score
from tqdm import tqdm

from scipy.stats import f_oneway
from scipy.stats import ttest_rel

# Significance threshold compared against the ANOVA and t-test p-values in find_best_algo.
P_VALUE_THR = 0.05


def get_silhouette_scores(
        X_cvs: List[np.ndarray],
        clustering_algo_name: str, reduction_algo_name: str,
        dim_num: int, k_clusters: int,
        cache_path=None
) -> List[float]:
    """Compute one silhouette score per cross-validation sample.

    Each sample is optionally reduced with ``reduction_algo_name`` to
    ``dim_num`` dimensions, clustered with ``clustering_algo_name`` using
    ``k_clusters``, and scored on the rows whose label is not ``-1``.

    Args:
        X_cvs: the cross-validation samples.
        clustering_algo_name: key into ``clustering_algorithms``.
        reduction_algo_name: key into ``dim_reduction_algorithms``.
        dim_num: target dimension passed to the reduction factory.
        k_clusters: first argument passed to the clustering function.
        cache_path: directory for cached reductions. When omitted, the
            notebook-level ``CACHE_PATH`` is used if it has been defined.

    Returns:
        The scores gathered so far; processing stops at the first sample that
        raises (the error is printed), so the list may be shorter than ``X_cvs``.
    """
    if cache_path is None:
        # The original call omitted the wrapper's required cache-directory
        # argument, so every sample failed with TypeError and the result was
        # always empty. Fall back to the notebook-level setting when present.
        cache_path = globals().get("CACHE_PATH")
    scores = []
    for cv_id, cv_data in enumerate(X_cvs):
        try:
            cv_data = reduction_algo_wrapper(reduction_algo_name, dim_num, cv_data, cv_id, cache_path)
            print(f"doing {clustering_algo_name} for {cv_data.shape} by {reduction_algo_name}")
            labels = clustering_algorithms[clustering_algo_name](k_clusters, cv_data)
            # Rows labelled -1 are left out of the score.
            kept = labels != -1
            scores.append(silhouette_score(cv_data[kept], labels[kept]))
        except Exception as e:
            # Best effort: report the failure and return what was collected so far.
            print(e)
            break
    return scores


def reduction_algo_wrapper(reduction_algo_name: str, dim_num: int, cv_data: np.ndarray, cv_id: int, CACHE_PATH) -> np.ndarray:
    """Reduce ``cv_data`` to ``dim_num`` dimensions, caching the result on disk.

    The factory is looked up in ``dim_reduction_algorithms``; a ``None`` entry
    returns ``cv_data`` untouched. Otherwise the result is read from, or
    computed and written to, ``<CACHE_PATH>/<name>-<dim_num>-<cv_id>.pkl``.
    The cache key does not depend on the data itself.
    """
    factory = dim_reduction_algorithms[reduction_algo_name]
    if factory is None:
        return cv_data

    cache_file = os.path.join(CACHE_PATH, f"{reduction_algo_name}-{dim_num}-{cv_id}.pkl")
    if os.path.isfile(cache_file):
        # NOTE: pickle is only safe on files this notebook wrote itself.
        with open(cache_file, "rb") as cached:
            return pickle.load(cached)

    reduced = factory(dim_num).fit_transform(cv_data)
    with open(cache_file, "wb") as cached:
        pickle.dump(reduced, cached)
    return reduced


def generate_cvs(X: np.ndarray, y: pd.DataFrame, num_of_cvs, cv_size) -> Tuple[List[np.ndarray], List[pd.DataFrame]]:
    """Draw ``num_of_cvs`` random samples of ``cv_size`` rows from ``X`` and ``y``.

    Row indices come from ``np.random.randint``, so a row can appear more than
    once in a sample. ``X`` and ``y`` are indexed with the same positions.

    Returns:
        ``(X_cvs, y_cvs)``: parallel lists of the sampled arrays and frames.
    """
    row_draws = [
        np.random.randint(X.shape[0], size=cv_size)
        for _ in range(num_of_cvs)
    ]
    X_cvs = [X[rows, :] for rows in row_draws]
    y_cvs = [y.iloc[rows] for rows in row_draws]
    return X_cvs, y_cvs


def find_best_algo(scores_mapping: Dict[Any, List[float]]) -> Tuple[Any, float, float, str]:
    """Select the best key of ``scores_mapping`` with ANOVA plus a paired t-test.

    Returns ``(best_key, anova_p_value, t_test_p_value, msg)``. When the ANOVA
    p-value is not below ``P_VALUE_THR`` a random key is returned and the
    t-test value stays ``-1``. Otherwise the key with the highest mean score
    wins and the two top-ranked keys are compared with a paired t-test.
    """
    anova_outcome = f_oneway(*scores_mapping.values())
    p_value = anova_outcome[1]
    # Drawn unconditionally so the random stream advances the same way on every call.
    best_algo = random.choice(list(scores_mapping.keys()))
    t_test_p_value = -1
    msg = ""
    print(f"annova value: {p_value}")

    if not p_value < P_VALUE_THR:
        msg = f"followed by annova: algorithms {scores_mapping.keys()} are the same"
        print(msg)
        return best_algo, p_value, t_test_p_value, msg

    def mean_score(name):
        return np.mean(scores_mapping[name])

    ranking = sorted(scores_mapping, key=mean_score, reverse=True)
    candidate1, candidate2 = ranking[:2]
    paired_outcome = ttest_rel(scores_mapping[candidate1], scores_mapping[candidate2])
    t_test_p_value = paired_outcome[1]
    print(f"t_test value: {t_test_p_value}")
    best_algo = ranking[0]
    if t_test_p_value >= P_VALUE_THR:
        msg = f"followed by t-test: algorithms {candidate1}, {candidate2} are the same"
        print(msg)
    return best_algo, p_value, t_test_p_value, msg


def external_var_to_anomalies(X_cvs, y_cvs, external_vars):
    """Average mutual information between anomaly labels and each external variable.

    Every non-None detector in ``anomaly_detection_algorithms`` is fitted on
    each sample of ``X_cvs``; its labels are compared with each column named in
    ``external_vars`` of the matching ``y_cvs`` frame.

    Returns:
        A DataFrame with columns ``algo_name``, ``external_var`` and ``MI``
        (the mean score over the samples).
    """
    records = []
    for detector_name, detector in tqdm(anomaly_detection_algorithms.items()):
        if detector is not None:
            mi_by_var = defaultdict(list)
            for sample, sample_y in zip(X_cvs, y_cvs):
                detected = detector.fit_predict(sample)
                for var_name in external_vars:
                    mi = mutual_info_score(detected, sample_y[var_name])
                    mi_by_var[var_name].append(mi)

            records.extend(
                {
                    "algo_name": detector_name,
                    "external_var": var_name,
                    "MI": np.mean(var_scores)
                }
                for var_name, var_scores in mi_by_var.items()
            )

    return pd.DataFrame(records)


In [None]:
import json

import pandas as pd
import scipy.cluster.hierarchy as sch
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.metrics import silhouette_samples
import seaborn as sns

# Apply seaborn's default theme to all figures created after this point.
sns.set_theme()


def hierarchical_clustering_vis(X, ):
    """Draw a Ward-linkage dendrogram for 100 randomly drawn rows of ``X``.

    Rows are drawn with ``np.random.randint``, so the same row can be picked
    more than once.
    """
    sampled_rows = np.random.randint(X.shape[0], size=100)
    ward_links = sch.linkage(X[sampled_rows, :], method='ward')
    sch.dendrogram(ward_links)
    plt.title("hierarchical clustering - dendrogram")
    # NOTE(review): plt.plot() without arguments draws nothing; plt.show() may have been intended -- confirm.
    plt.plot()


def elbow_method(X: np.ndarray, labels: np.ndarray, n_clusters: int):
    """Save a per-cluster silhouette plot to ``elbow.png``.

    For each cluster id in ``range(n_clusters)`` the sorted silhouette values
    of its members are drawn as a horizontal band; bands are stacked with a
    gap of 10 between them and annotated with the cluster id.
    """
    per_sample_scores = silhouette_samples(X, labels)
    plt.figure(figsize=(15, 8))

    band_start = 10
    for cluster_id in range(n_clusters):
        band_values = np.sort(per_sample_scores[labels == cluster_id])
        band_height = band_values.shape[0]
        band_end = band_start + band_height

        shade = cm.nipy_spectral(float(cluster_id) / n_clusters)
        plt.fill_betweenx(
            np.arange(band_start, band_end),
            0,
            band_values,
            facecolor=shade,
            edgecolor=shade,
            alpha=0.7,
        )
        # TODO: legend instead of text
        plt.text(-0.05, band_start + 0.5 * band_height, str(cluster_id))
        # Leave a gap of 10 before the next band.
        band_start = band_end + 10

    plt.title("The silhouette plot for the various clusters.")
    plt.xlabel("The silhouette coefficient values")
    plt.ylabel("Cluster label")

    plt.yticks([])
    plt.savefig("elbow.png")


# def reduce_dimension(X):
#     models = [TSNE(n_components=2), Isomap(n_components=2), MDS(n_components=2), SpectralEmbedding(n_components=2)]
#     fig, axs = plt.subplots(nrows=len(models), ncols=1, figsize=(8, 8))
#     fig.tight_layout()
#     for i, model in tqdm(enumerate(models)):
#         X_embedded_tsne = model.fit_transform(X)
#         axs[i].scatter(X_embedded_tsne[:, 0], X_embedded_tsne[:, 1], s=40, cmap='viridis')
#         axs[i].set_title(f"{model} dimensionality reduction")
#     plt.show()

def anomaly_external_var_to_mi(df):
    """Save a grouped bar chart of ``MI`` per ``external_var``, coloured by ``algo_name``.

    ``df`` must provide the columns ``external_var``, ``MI`` and ``algo_name``.
    The figure is written to ``anomaly_external_var_to_mi.png``.
    """
    bar_options = dict(
        kind="bar",
        x="external_var",
        y="MI",
        hue="algo_name",
        errorbar="sd",
        palette="dark",
        alpha=.6,
        height=6,
    )
    grid = sns.catplot(data=df, **bar_options)
    grid.despine(left=True)
    grid.set_axis_labels("External Variable", "Mutual Information")
    grid.legend.set_title("Anomaly MI Per External Var")
    plt.savefig("anomaly_external_var_to_mi.png")


def plot_silhouette():
    """Plot silhouette scores per clustering algorithm, with and without anomaly filtering.

    Reads the two JSON reports (algorithm name -> list of scores), builds one
    long-format table and saves a grouped bar chart to ``static_silhouettes.png``.
    """
    # (report file, value of the "With Anomaly Filtering" column)
    report_sources = (
        ("./reports/without_anomaly_algo/silout_per_clustreing.json", "No"),
        ("./reports/with_anomaly_algo/sillout_pre_clustering.json", "Yes"),
    )
    loaded_reports = []
    for report_path, filtering_flag in report_sources:
        with open(report_path, "r") as report_file:
            loaded_reports.append((json.load(report_file), filtering_flag))

    rows = [
        {
            "Clustering Algorithm": algo_key.replace("_", " ").title(),
            "score": score,
            "With Anomaly Filtering": filtering_flag
        }
        for report, filtering_flag in loaded_reports
        for algo_key, algo_scores in report.items()
        for score in algo_scores
    ]
    score_table = pd.DataFrame(rows)

    plt.figure(figsize=(15, 8))
    axes = sns.barplot(
        data=score_table, x="Clustering Algorithm", y="score",
        hue="With Anomaly Filtering", errorbar="sd", palette="dark", alpha=.6,
    )
    axes.set_ylabel("Silhouette Scores")
    axes.set_xlabel("")
    axes.set_title("Silhouette Scores By Algorithm")
    plt.savefig("static_silhouettes.png")


## dynamic

In [None]:
import json
import os
import random
from collections import defaultdict
from typing import Dict, Any, List

import numpy as np
import pandas as pd
from hmmlearn.hmm import CategoricalHMM
from sklearn.metrics import normalized_mutual_info_score, mutual_info_score, silhouette_score
from tqdm import tqdm

from pathlib import Path

# Directory holding pickled dimensionality-reduction results; created on cell execution.
CACHE_PATH = "cache-dynamic-new/"
Path(CACHE_PATH).mkdir(parents=True, exist_ok=True)
# Label columns separated from the sensor features when the dataset is loaded.
EXTERNAL_VARS = ["gas_type", "concentration"]
# Search grid: target dimensions and cluster counts tried by main().
DIMENTIONS_OPTIONS = [2, 10]
NUM_CLUSTERS_OPTIONS = [2, 6, 12, 20]
# Number of cross-validation folds generated below.
NUM_OF_CVS = 3
# CV_SIZE = 2600
DATA_PATH = "data/driftdataset"

# Each fold is four random indices in 0..9 (seeded for repeatability). The
# indices select entries of the lists returned by load_dynamic_dataset().
# Draws are independent, so an index may repeat within a fold; the consumers
# test membership (`i in in_index`), so a repeated index selects its entry once.
# NOTE(review): assumes the dataset directory holds at least 10 files -- confirm.
random.seed(10)
cvs = [[
    random.randint(0, 9),
    random.randint(0, 9),
    random.randint(0, 9),
    random.randint(0, 9)
]
    for i in range(NUM_OF_CVS)
]


def load_dynamic_dataset():
    """Load every file under ``DATA_PATH`` into a feature array and a label frame.

    Each non-blank line has the form ``<gas_type>;<concentration> <id>:<value> ...``.
    The ``<id>:<value>`` pairs become float columns named ``sensor_<id>``; tokens
    that do not split into exactly two parts on ``:`` are ignored.

    Returns:
        ``(X_cvs, y_cvs)``: parallel lists with one entry per file, in sorted
        file-name order. ``X_cvs[i]`` is the array of sensor values and
        ``y_cvs[i]`` a DataFrame with the ``EXTERNAL_VARS`` columns, where
        ``concentration`` is converted to ``int``.
    """
    X_cvs, y_cvs = [], []
    # os.listdir returns names in arbitrary order, while the fold definitions
    # refer to files by list position; sorting keeps those positions stable
    # across runs and machines.
    for filename in sorted(os.listdir(DATA_PATH)):
        df_rows = []
        with open(os.path.join(DATA_PATH, filename), "r") as file:
            for line in file:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                parts = line.split(";")
                tokens = parts[1].split(" ")
                curr_row = {"gas_type": parts[0], "concentration": tokens[0]}
                for sensor_value in tokens[1:]:
                    sensor_value = sensor_value.split(":")
                    if len(sensor_value) == 2:
                        curr_row[f"sensor_{sensor_value[0]}"] = float(sensor_value[1])
                df_rows.append(curr_row)
        df = pd.DataFrame(df_rows)
        X = df.drop(EXTERNAL_VARS, axis=1).values
        # Work on a copy so the assignment below does not write into a slice of
        # `df` (which triggers pandas' SettingWithCopyWarning).
        y = df[EXTERNAL_VARS].copy()
        y["concentration"] = y["concentration"].apply(lambda val: int(float(val)))
        X_cvs.append(X)
        y_cvs.append(y)
    return X_cvs, y_cvs


def update_scores(labels: np.ndarray, cv_data, cv_y: pd.DataFrame, lengths, scores: Dict[str, List[float]]):
    """Score one clustering result and append the values to ``scores``.

    The cluster labels are treated as an observation sequence (split into
    sub-sequences by ``lengths``). For ``gas_type`` and then ``concentration``
    a CategoricalHMM with as many states as the column has distinct values is
    fitted, and the normalized mutual information between its decoded states
    and the column is computed.

    Appends to ``scores["mi_concentration_scores"]``,
    ``scores["mi_gas_type_scores"]`` and ``scores["weighted_scores"]`` and
    returns ``scores``.
    """
    sil_score = silhouette_score(cv_data, labels)
    observations = labels.reshape(-1, 1)

    def hmm_nmi(column):
        # Fit an HMM sized to the column's cardinality and compare its decoded states with the column.
        hmm = CategoricalHMM(n_components=cv_y[column].nunique()).fit(observations, lengths=lengths)
        decoded = hmm.predict(observations)
        return normalized_mutual_info_score(cv_y[column].values, decoded)

    gas_type_nmi = hmm_nmi("gas_type")
    concentration_nmi = hmm_nmi("concentration")

    scores["mi_concentration_scores"].append(concentration_nmi)
    scores["mi_gas_type_scores"].append(gas_type_nmi)
    # The silhouette is shifted from [-1, 1] to [0, 1] before being combined.
    # NOTE(review): three terms are summed but the total is divided by 2 -- confirm the divisor is intended.
    rescaled_silhouette = (sil_score + 1) / 2
    scores["weighted_scores"].append(
        (concentration_nmi + gas_type_nmi + rescaled_silhouette) / 2
    )
    return scores


def main():
    """Grid-search reduction / dimension / cluster-count settings for every clustering algorithm.

    For each clustering algorithm and each reduction algorithm, every
    (dimension, cluster count) pair is scored on all folds in ``cvs`` and the
    pair with the highest mean weighted score is kept. ``find_best_algo`` then
    chooses between the reduction algorithms, and the per-algorithm winners are
    passed to ``best_config_by`` (for three score keys) and to
    ``find_best_external_var_per_clustering``.
    """
    X_cvs, y_cvs = load_dynamic_dataset()
    best_config_by_clustering = dict()
    for clustering_algo_name in clustering_algorithms.keys():
        # Best configuration found so far for each reduction algorithm.
        dim_reduction_meta: Dict[str, Dict[str, Any]] = dict()
        for reduction_algo_name in dim_reduction_algorithms.keys():
            max_score = float("-inf")
            for dim_num in DIMENTIONS_OPTIONS:
                for k_clusters in NUM_CLUSTERS_OPTIONS:
                    scores = defaultdict(list)
                    for cv_id, in_index in enumerate(cvs):
                        # Build the fold from the batches whose position appears in in_index;
                        # `lengths` records each batch's row count for the HMM.
                        cv_data = np.concatenate([X_cvs[i] for i in range(len(X_cvs)) if i in in_index])
                        cv_y = pd.concat([y_cvs[i] for i in range(len(y_cvs)) if i in in_index])
                        lengths = [X_cvs[i].shape[0] for i in range(len(X_cvs)) if i in in_index]
                        try:
                            cv_data = reduction_algo_wrapper(reduction_algo_name, dim_num, cv_data, cv_id, CACHE_PATH)
                            print(f"doing {clustering_algo_name} for {cv_data.shape} by {reduction_algo_name}")
                            labels = clustering_algorithms[clustering_algo_name](k_clusters, cv_data)
                            scores = update_scores(labels, cv_data, cv_y, lengths, scores)
                        except Exception as e:
                            # A failing fold aborts the remaining folds of this setting;
                            # the scores gathered so far are still averaged below.
                            print(e)
                            break
                    # NOTE(review): if the first fold fails the list is empty and the mean is NaN,
                    # so the comparison below is False and the setting is never recorded -- confirm this is acceptable.
                    avg_score = np.mean(scores["weighted_scores"])
                    if avg_score > max_score:
                        max_score = avg_score
                        dim_reduction_meta[reduction_algo_name] = {
                            **scores,
                            "avg_score": avg_score,
                            "dim_num": dim_num,
                            "cluster_num": k_clusters,
                            "reduction_algo_name": reduction_algo_name
                        }
        # Choose between the reduction algorithms using their per-fold weighted scores.
        best_algo_name, p_value, t_test_p_value, msg = find_best_algo({
            key: value["weighted_scores"] for key, value in dim_reduction_meta.items()
        })
        best_config_by_clustering[clustering_algo_name] = {
            **dim_reduction_meta[best_algo_name],
            "annova": p_value,
            "t-test": t_test_p_value,
            "msg": msg
        }
        print(f"picking for {clustering_algo_name}: {best_config_by_clustering[clustering_algo_name]}")
    print(best_config_by_clustering)
    # Rank the clustering algorithms under each score and write one JSON report per score.
    best_config_by("weighted_scores", best_config_by_clustering)
    best_config_by("mi_gas_type_scores", best_config_by_clustering)
    best_config_by("mi_concentration_scores", best_config_by_clustering)
    find_best_external_var_per_clustering(X_cvs, y_cvs, best_config_by_clustering)


def find_best_external_var_per_clustering(X_cvs: List[np.ndarray], y_cvs: List[pd.DataFrame],
                                          best_config_by_clustering: Dict[str, Dict[str, Any]]):
    """Find, per clustering algorithm, the external variable its clusters explain best.

    For every algorithm in ``best_config_by_clustering`` and every variable in
    ``EXTERNAL_VARS``, each fold in ``cvs`` is reduced with the algorithm's best
    configuration and clustered with as many clusters as the variable has
    distinct values. A CategoricalHMM with the same number of states is fitted
    on the labels, and the mutual information between its decoded states and
    the variable is recorded (``-1`` when the fold raises). ``find_best_algo``
    then picks the variable.

    The summary is printed, written to
    ``best_external_var_per_clustering_dynamic.json`` and returned.
    """
    summary = dict()
    for clustering_algo_name, best_config in best_config_by_clustering.items():
        print(f"-----------{clustering_algo_name}-------")
        mi_by_var = dict()
        for external_var_name in EXTERNAL_VARS:
            cluster_fn = clustering_algorithms[clustering_algo_name]
            fold_scores = []
            for cv_id, in_index in enumerate(cvs):
                # Assemble the fold from the batches whose position is listed in in_index.
                cv_data = np.concatenate([batch for i, batch in enumerate(X_cvs) if i in in_index])
                cv_y = pd.concat([frame for i, frame in enumerate(y_cvs) if i in in_index])
                lengths = [batch.shape[0] for i, batch in enumerate(X_cvs) if i in in_index]
                try:
                    cv_data = reduction_algo_wrapper(
                        best_config["reduction_algo_name"],
                        best_config["dim_num"],
                        cv_data, cv_id, CACHE_PATH
                    )
                    n_values = cv_y[external_var_name].nunique()
                    observations = cluster_fn(n_values, cv_data).reshape(-1, 1)
                    hmm = CategoricalHMM(n_components=n_values).fit(observations, lengths=lengths)
                    decoded = hmm.predict(observations)
                    fold_scores.append(mutual_info_score(decoded, cv_y[external_var_name].values))
                except Exception as e:
                    # A failed fold is kept as the sentinel score -1.
                    print(e)
                    fold_scores.append(-1)
            mi_by_var[external_var_name] = fold_scores

        best_var, p_value, t_test_p_value, msg = find_best_algo(mi_by_var)
        summary[clustering_algo_name] = dict(
            scores=mi_by_var[best_var],
            best_var=best_var,
            anova=p_value,
            t_test=t_test_p_value,
            msg=msg,
        )
    print(summary)
    with open("best_external_var_per_clustering_dynamic.json", "w") as report:
        json.dump(summary, report)
    return summary


def best_config_by(key: str, best_config_by_clustering: Dict):
    """Rank the clustering algorithms by the score list stored under ``key``.

    ``find_best_algo`` is applied to ``{algorithm: metadata[key]}``. The winner,
    its configuration, the test statistics and the complete input are printed
    and written to ``final_result_<key>.json``.
    """
    per_algo_scores = {
        algo_name: metadata[key]
        for algo_name, metadata in best_config_by_clustering.items()
    }
    winner, p_value, t_test_p_value, msg = find_best_algo(per_algo_scores)
    result = dict(
        best_algo_name=winner,
        annova=p_value,
        t_test_p_value=t_test_p_value,
        config=best_config_by_clustering[winner],
        msg=msg,
        all_config=best_config_by_clustering,
    )
    print(f"---------final_result_{key}--------")
    print(result)
    with open(f"final_result_{key}.json", "w") as report:
        json.dump(result, report)


def external_var_to_anomalies(X_cvs, y_cvs, external_vars):
    """Mutual information between HMM-decoded anomaly labels and each external variable.

    For every non-None detector in ``anomaly_detection_algorithms`` and every
    fold in ``cvs``, the detector labels the fold's rows. For each external
    variable a CategoricalHMM with as many states as the variable has distinct
    values is fitted on those labels, and the mutual information between its
    decoded states and the variable is recorded.

    Args:
        X_cvs: list of per-batch feature arrays.
        y_cvs: list of per-batch label frames, parallel to ``X_cvs``.
        external_vars: names of the label columns to compare against.

    Returns:
        A DataFrame with columns ``algo_name``, ``external_var`` and ``MI``
        (the mean over the folds).
    """
    results = list()
    for anomaly_algo_name, anomaly_algo in tqdm(anomaly_detection_algorithms.items()):
        if anomaly_algo is None:
            continue
        scores = defaultdict(list)
        for in_index in cvs:
            # Assemble the fold from the batches whose position is listed in in_index.
            cv_data = np.concatenate([X_cvs[i] for i in range(len(X_cvs)) if i in in_index])
            cv_y = pd.concat([y_cvs[i] for i in range(len(y_cvs)) if i in in_index])
            lengths = [X_cvs[i].shape[0] for i in range(len(X_cvs)) if i in in_index]
            anomaly_labels = anomaly_algo.fit_predict(cv_data)
            # The detectors mark outliers with -1, but CategoricalHMM only accepts
            # non-negative integer symbols. Re-encode the labels as 0..n-1; mutual
            # information does not depend on the label values themselves.
            _, encoded_labels = np.unique(anomaly_labels, return_inverse=True)
            observations = encoded_labels.reshape(-1, 1)
            for external_var_name in external_vars:
                model = CategoricalHMM(n_components=cv_y[external_var_name].nunique()).fit(
                    observations,
                    lengths=lengths
                )
                # Keep the decoded states in their own variable: the original
                # overwrote `labels`, so every variable after the first was
                # fitted on the previous variable's hidden states.
                hidden_states = model.predict(observations)
                scores[external_var_name].append(
                    mutual_info_score(
                        hidden_states,
                        cv_y[external_var_name]
                    )
                )

        for external_var_name, external_var_scores in scores.items():
            results.append({
                "algo_name": anomaly_algo_name,
                "external_var": external_var_name,
                "MI": np.mean(external_var_scores)
            })

    return pd.DataFrame(results)


def main2():
    """Load the dataset, score the anomaly detectors against ``EXTERNAL_VARS``, then plot and print the table."""
    batches_X, batches_y = load_dynamic_dataset()
    mi_table = external_var_to_anomalies(batches_X, batches_y, EXTERNAL_VARS)
    anomaly_external_var_to_mi(mi_table)
    print(mi_table)



# Entry point of this section: only the anomaly-vs-external-variable analysis is run.
main2()
# main3()


## graph

In [None]:
import json
import os
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import networkx as nx
from community import community_louvain
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, normalized_mutual_info_score, mutual_info_score
from tqdm import tqdm
from karateclub.graph_embedding import Graph2Vec

from sklearn.metrics import silhouette_score
from sknetwork.embedding import LouvainEmbedding, LouvainNE

# Settings for the graph section. When the whole notebook runs in one kernel,
# DATA_PATH, DIMENTIONS_OPTIONS, NUM_CLUSTERS_OPTIONS and NUM_OF_CVS replace
# the values assigned in the "dynamic" section.
DATA_PATH = "data/deezer_ego_nets"
DIMENTIONS_OPTIONS = [10, 50, 100]
NUM_CLUSTERS_OPTIONS = [2, 4, 8, 12, 16, 20]
# Number and size of the random samples drawn by generate_cvs in main_flow.
NUM_OF_CVS = 5
CV_SIZE = 2000
# NOTE(review): not referenced in the visible code; find_best_algo reads P_VALUE_THR instead.
p_value_thr = 0.05


def preprocess_data():
    """Read ``deezer_edges.json`` from ``DATA_PATH`` and build one undirected graph per entry.

    The file maps a graph id to a list of ``[u, v]`` edges; the returned list
    follows the order of the entries in the file.
    """
    with open(os.path.join(DATA_PATH, "deezer_edges.json")) as f:
        edges_by_graph: Dict = json.load(f)

    graphs = []
    for edge_list in edges_by_graph.values():
        ego_net = nx.Graph()
        ego_net.add_edges_from((u, v) for u, v in edge_list)
        graphs.append(ego_net)
    return graphs
    # adj_matrix = np.zeros((n, n))
    # for i in range(n):
    #     for _, edge in graph_dict[str(i)]:
    #         adj_matrix[i, edge] = 1
    # data = PCA(n_components=0.98).fit_transform(adj_matrix)
    # with open("data.pkl", "wb") as file:
    #     pickle.dump(data, file)
    # return data


def main_flow():
    """Search the cluster count for every clustering algorithm on the pickled graph features.

    Loads ``data.pkl`` and the ``target`` column, draws random samples with
    ``generate_cvs``, scores every (algorithm, cluster count) pair on every
    sample, lets ``find_best_algo`` pick the cluster count per algorithm, and
    finally lets it pick between the algorithms. Results are only printed.
    """
    # NOTE: pickle is only safe on files produced by this project.
    with open("data.pkl", "rb") as file:
        data = pickle.load(file)
    print(data.shape)
    target = pd.read_csv(os.path.join(DATA_PATH, "deezer_target.csv"))["target"]
    X_cvs, y_cvs = generate_cvs(data, target, NUM_OF_CVS, CV_SIZE)
    best_config_by_clustering = dict()
    for clustering_algo_name in clustering_algorithms.keys():
        scores_by_k = dict()
        for k_clusters in NUM_CLUSTERS_OPTIONS:
            silhouette_scores = []
            mi_scores = []
            scores = []
            for cv_id, (cv_data, cv_y) in enumerate(zip(X_cvs, y_cvs)):
                try:
                    labels = clustering_algorithms[clustering_algo_name](k_clusters, cv_data)
                    # Rows labelled -1 are excluded from the silhouette only.
                    sil_score = silhouette_score(cv_data[labels != -1], labels[labels != -1])
                    silhouette_scores.append(sil_score)
                    sil_score = (sil_score + 1) / 2  # normalize between 0 and 1
                    mi_score = normalized_mutual_info_score(labels, cv_y.values)
                    # Combined score: mean of the rescaled silhouette and the normalized MI.
                    scores.append((sil_score + mi_score) / 2)
                    # The stored MI is the unnormalized variant.
                    mi_scores.append(mutual_info_score(labels, cv_y.values))
                except Exception as e:
                    # A failing sample aborts the remaining samples for this cluster count,
                    # so the score lists can be shorter than NUM_OF_CVS.
                    print(e)
                    break
            scores_by_k[k_clusters] = {
                "scores": scores,
                "mi_scores": mi_scores,
                "silhouette_scores": silhouette_scores
            }
        best_k_clusters, p_value, t_test_p_value, msg = find_best_algo({
            key: value["scores"] for key, value in scores_by_k.items()
        })
        best_config_by_clustering[clustering_algo_name] = {
            **scores_by_k[best_k_clusters],
            "best_k_clusters": best_k_clusters,
            "annova": p_value,
            "t_test_p_value": t_test_p_value
        }
        print(f"picking for {clustering_algo_name}: {best_k_clusters}")
    print(best_config_by_clustering)
    # with open("best_config_by_clustering.json") as file:
    #     json.dump(best_config_by_clustering, file)
    clustering_scores = dict()
    for clustering_algo_name, metadata in best_config_by_clustering.items():
        clustering_scores[clustering_algo_name] = metadata["scores"]
    # NOTE(review): the keys here are clustering algorithm names, so `best_k_clusters`
    # (and the label printed below) actually holds the winning algorithm name.
    best_k_clusters, p_value, t_test_p_value, msg = find_best_algo(clustering_scores)
    print(f"best_k_clusters: {best_k_clusters}")
    print(f"annova: {p_value}")
    print(f"t test: {t_test_p_value}")


def main():
    """Embed every ego-net with LouvainNE and evaluate hierarchical clustering of the embeddings.

    Each graph's adjacency matrix is embedded with 10 components and the node
    embeddings are averaged into one vector per graph. The stacked vectors are
    clustered with ``hierarchical_clustering`` for several cluster counts; the
    silhouette score and the normalized mutual information with the ``target``
    column are printed for each count.
    """
    graphs = preprocess_data()
    target = pd.read_csv(os.path.join(DATA_PATH, "deezer_target.csv"))["target"].values
    louvain = LouvainNE(n_components=10)
    results = []
    for graph in tqdm(graphs):
        node_embeddings = louvain.fit_transform(nx.adjacency_matrix(graph))
        # One fixed-length vector per graph: the mean over its rows.
        results.append(node_embeddings.mean(axis=0))
    data = np.vstack(results)
    # The message used to say "k means" although hierarchical clustering is what runs below.
    print("doing hierarchical clustering")
    scores = []
    for k in tqdm([2, 6, 12, 20]):
        labels = hierarchical_clustering(k, data)
        scores.append({
            "silhouette_score": silhouette_score(data, labels),
            "mutual_info_score": normalized_mutual_info_score(labels, target)
        })
    print(scores)


# Entry point of this section: runs the LouvainNE embedding experiment; the pickled-feature flow is disabled.
main()
# main_flow()