In [None]:
from glob import glob
import pandas as pd
from os import path
import seaborn as sns

In [None]:
pd.read_csv("benchmark_exp/eval/uni/SRCNN.csv").file.str.removesuffix(".csv")

In [None]:
# Collect one evaluation table per algorithm; the algorithm name is taken from
# the CSV file name and the dataset name from the table's `file` column.
dfs = []
# glob() returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the concatenated table reproducible across machines.
for f in sorted(glob("benchmark_exp/eval/uni/*.csv")):
    print(f)
    new_df = pd.read_csv(f)
    new_df['dataset'] = new_df.file.str.removesuffix(".csv")
    # splitext removes only the final extension, so an algorithm name that
    # itself contains a dot is kept whole instead of being cut at the first dot.
    new_df['algorithm'] = path.splitext(path.basename(f))[0]
    dfs.append(new_df)

In [None]:
all_results = pd.concat(dfs, ignore_index=True).set_index(["dataset", "algorithm"])

In [None]:
pd.concat(dfs, ignore_index=True).groupby("algorithm")['dataset'].value_counts().unstack("algorithm").dropna().shape

In [None]:
all_results.unstack("algorithm")

In [None]:
all_results.shape

In [None]:
# drop extra rows that are in SRCNN
all_results = all_results.unstack("algorithm").dropna(subset=[("file", 'Sub_KNN')]).stack("algorithm", future_stack=True)

In [None]:
pwd

In [None]:
files = all_results.file.unique()

def load_series(files, data_dir="benchmark_exp/TSB-AD/TSB-AD-U/"):
    """Read the raw series CSV behind each evaluated file.

    Parameters
    ----------
    files : iterable
        File names as found in the ``file`` column of the evaluation tables.
        Entries that are not strings (for example NaN placeholders) or that do
        not end in ".csv" are skipped.
    data_dir : str, optional
        Directory containing the series CSV files.

    Returns
    -------
    dict
        Maps each file name without its ".csv" suffix to the loaded DataFrame.
    """
    suffix = ".csv"
    results = {}
    for f in files:
        # Non-string entries have no .endswith(); other names are extra,
        # non-series files. Both are ignored.
        if not isinstance(f, str) or not f.endswith(suffix):
            continue
        # Strip only the suffix so the key matches the `dataset` index, which
        # is built with removesuffix(".csv") rather than a split on the first dot.
        results[f[:-len(suffix)]] = pd.read_csv(path.join(data_dir, f))
    return results

res = load_series(files)

In [None]:
periodicity = pd.read_csv("series_length_debugging_iqr.csv")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.ticker import MultipleLocator
import numpy as np


def plot_detection(signal, label, scores=None, train=None, ax=None, linewidth=1, window_length=None):
    """Plot a series with its labelled anomaly regions and, optionally, scores.

    Parameters
    ----------
    signal : pandas.Series
        Series drawn in black. It is not modified.
    label : array-like
        One label per sample of ``signal``; handed to ``get_anomaly_regions``
        to find the spans shaded in red (assumed to be 0/1 -- TODO confirm).
    scores : array-like, optional
        One score per sample, drawn in blue on a secondary y-axis. When
        omitted, no secondary axis is created.
    train : pandas.Series, optional
        Series drawn in grey. If ``signal`` starts before ``train`` ends, the
        plotted signal index is shifted by ``train.index.max()``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. A new 40x5 inch, 300 dpi figure is created if omitted.
    linewidth : float, optional
        Line width used for all curves.
    window_length : number, optional
        If given, minor x ticks are placed at multiples of this value.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the signal.
    """
    if train is not None and signal.index.min() < train.index.max():
        # Shift a copy: assigning to signal.index directly would re-index the
        # caller's series as a side effect of plotting.
        signal = signal.copy()
        signal.index = signal.index + train.index.max()
    if scores is not None:
        # Only wrap real scores. pd.Series(None, index=...) is an all-NaN
        # series, which would make the "scores is not None" check below always
        # true and add an empty score axis and legend entry to every plot.
        scores = pd.Series(scores, index=signal.index)
    label = pd.Series(np.array(label), index=signal.index)

    if ax is None:
        plt.figure(figsize=(40, 5), dpi=300)
        signal_ax = plt.gca()
    else:
        signal_ax = ax
    signal_ax.set_ylabel("signal")
    signal_line, = signal_ax.plot(signal, label='signal', c='k', linewidth=linewidth)
    legend_handles = [signal_line]
    legend_ax = signal_ax
    if train is not None:
        signal_ax.plot(train, label='train', c='grey', linewidth=linewidth)
    if scores is not None:
        scores_ax = plt.twinx(signal_ax)
        scores_line, = scores_ax.plot(scores, label='scores', c='b', alpha=0.5, linewidth=linewidth)
        scores_ax.set_ylabel("scores")
        legend_handles.append(scores_line)
        # Attach the legend to the twin axes (created last) so the score line
        # is not painted over it.
        legend_ax = scores_ax

    # Remember the data limits for the shaded regions, then pad the view by 10%.
    ylims = signal_ax.get_ylim()
    yrange = ylims[1] - ylims[0]
    signal_ax.set_ylim(ylims[0] - 0.1 * yrange, ylims[1] + 0.1 * yrange)

    # get_anomaly_regions reports sample positions. With a numeric index the
    # curve is drawn at the index values, so translate positions to those
    # values; otherwise shaded regions are misplaced once the index is shifted.
    index_values = signal.index.to_numpy()
    if index_values.dtype.kind not in "iuf":
        index_values = None  # non-numeric index: keep raw positions
    # Regions narrower than 1/1000 of the series are widened to stay visible.
    thin_thresh = len(label) / 1e3
    for start, end in get_anomaly_regions(label):
        if index_values is not None:
            start, end = index_values[start], index_values[end]
        width = np.maximum(end - start, thin_thresh)
        signal_ax.add_patch(patches.Rectangle((start, ylims[0]), width, yrange, facecolor='red', alpha=0.4))

    red_patch = patches.Patch(color='red', label='anomaly', alpha=0.3)
    # Use an explicit axes instead of plt.legend(), which targets whatever axes
    # happens to be current and may differ from the one passed in via `ax`.
    legend_ax.legend(handles=legend_handles + [red_patch])
    if window_length is not None:
        locator = MultipleLocator(window_length)
        # Raise the tick limit so long series with short windows are accepted.
        locator.MAXTICKS = 2000
        signal_ax.xaxis.set_minor_locator(locator)
    return signal_ax

def get_anomaly_regions(labels):
    """Locate contiguous runs of anomalous samples.

    Parameters
    ----------
    labels : array-like
        One flag per sample; non-zero / True marks an anomalous sample.

    Returns
    -------
    list of (int, int)
        ``(start, end)`` positions of each run, both inclusive, in order of
        appearance. A run of a single sample has ``start == end``.
    """
    # Normalise to 0/1 integers: np.diff on booleans is an XOR and never
    # yields -1, and unsigned integers would wrap around instead.
    flags = np.asarray(labels).astype(bool).astype(np.int8)
    # Padding with a zero on both sides turns runs that touch either edge
    # (including a fully anomalous series) into ordinary rising/falling steps.
    padded = np.concatenate([[0], flags, [0]])
    steps = np.diff(padded)
    starts = np.where(steps == 1)[0]
    # A falling step sits one position after the last anomalous sample.
    ends = np.where(steps == -1)[0] - 1
    return list(zip(starts.tolist(), ends.tolist()))

    

In [None]:
plot_detection(res['001_NAB_data_Traffic_4_624_2087']['Data'], res['001_NAB_data_Traffic_4_624_2087']['Label'])

In [None]:
all_results['AUC-PR'].unstack("algorithm").plot(kind="box", rot=90)

In [None]:
auc_pr_ranks = all_results['AUC-PR'].unstack("algorithm").rank(axis=1, ascending=False)
auc_pr_ranks

In [None]:
rank_order = auc_pr_ranks.median().sort_values().index
sns.boxplot(auc_pr_ranks, orient='h', order=rank_order)

In [None]:
from TSB_AD.cd_plot.cd_plot_code import cd_evaluation
import matplotlib.pyplot as plt

plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['AUC-PR'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("AUC-PR")

In [None]:
seasonal_decompose = pd.read_csv("results_seasonal_decompose.csv", index_col=0)
seasonal_decompose['algorithm'] = "seasonal decompose"
seasonal_decompose = seasonal_decompose.rename(columns={'name': 'dataset'})

In [None]:
seasonal_decompose

In [None]:
seasonal_decompose

In [None]:
all_results_sd = pd.concat([all_results, seasonal_decompose.set_index(["dataset", "algorithm"])])

In [None]:
sd_subset = all_results_sd['AUC-PR'].unstack("algorithm").dropna(subset="seasonal decompose")

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(sd_subset, maximize_metric=True, ax=plt.gca())
plt.title("AUC-PR")

In [None]:
sd_subset_pretty = sd_subset.T.drop(index=["Random2", "PCA", "KNN", "LOF", "Donut", "FITS", "HBOS", "SRCNN"]).rename(index=lambda x: x.replace("Sub_", "Window "))
sd_subset_pretty

In [None]:
comparison = sd_subset_pretty.T[['SR', 'seasonal decompose', 'Decompose']]

In [None]:
(comparison['SR'] - comparison['seasonal decompose']).sort_values()

In [None]:
(comparison['Decompose'] - comparison['seasonal decompose']).sort_values()

In [None]:
periodicity

In [None]:
subset_merged = sd_subset_pretty.T.merge(periodicity[['name', 'mine_sig', 'score']], left_on="dataset", right_on="name")
subset_merged['score'] = subset_merged['score'].fillna(0)
subset_merged['mine_sig'] = subset_merged['mine_sig'].astype(float)
subset_merged = subset_merged.set_index("name").drop(columns="mine_sig")

In [None]:
subset_merged

In [None]:
cm = sns.clustermap(subset_merged.T, figsize=(15, 20), standard_scale=1, cbar_pos=(0.98, 0.85, 0.01, 0.1), method="ward", dendrogram_ratio=0.1)
cm.ax_heatmap.set_xticklabels(());

In [None]:
cm = sns.clustermap(sd_subset_pretty, figsize=(15, 20), standard_scale=1, cbar_pos=(0.98, 0.85, 0.01, 0.1), method="ward", dendrogram_ratio=0.1)
cm.ax_heatmap.set_xticklabels(());

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['PA-F1'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("Adjusted F1")

In [None]:
ranks = all_results.drop(columns=["file", "Time"]).unstack("dataset").rank(ascending=False).stack("dataset").unstack("algorithm").mean().unstack().T

In [None]:

pd.set_option("display.precision", 1)
ranks.sort_values("AUC-PR")

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['AUC-ROC'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("AUC-ROC")

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['VUS-ROC'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("VUS-ROC")

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['VUS-PR'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("VUS-PR")

In [None]:
import numpy as np

In [None]:
plt.figure(figsize=(16,  4* 2), dpi=300)
_ = cd_evaluation(all_results['PA-F1'].unstack("algorithm"), maximize_metric=True, ax=plt.gca())
plt.title("PA-F1")

In [None]:
from benchmark_exp.HP_list import Optimal_Uni_algo_HP_dict

In [None]:
algs = all_results.index.levels[1]

In [None]:
[x for x in Optimal_Uni_algo_HP_dict.keys() if x not in algs]

In [None]:
vus_pr_ranks = all_results['VUS-PR'].unstack("algorithm").rank(axis=1, ascending=False)
vus_rank_order = vus_pr_ranks.median().sort_values().index
sns.boxplot(vus_pr_ranks, orient='h', order=vus_rank_order)

In [None]:
#sns.clustermap(all_results['VUS-PR'].unstack("algorithm"), figsize=(20, 20))

In [None]:
# sns.clustermap(all_results['VUS-PR'].unstack("algorithm").rank(axis=1, ascending=False), figsize=(15, 20))

In [None]:
# Algorithms x datasets table of AUC-ROC. Kept under its own name: storing it
# as `auc_pr` would let later cells that read `auc_pr` silently use ROC values.
auc_roc = all_results['AUC-ROC'].unstack("algorithm").T
cm = sns.clustermap(auc_roc, figsize=(15, 20), standard_scale=1, cbar_pos=(0.9, 0.8, 0.01, 0.1))
cm.ax_heatmap.set_xticklabels(());

In [None]:
vus_pr = all_results['VUS-PR'].unstack("algorithm").T
cm = sns.clustermap(vus_pr, figsize=(15, 20), standard_scale=1, cbar_pos=(0.9, 0.8, 0.01, 0.1))
cm.ax_heatmap.set_xticklabels(());

In [None]:
# auc_pr = all_results['PA-F1'].unstack("algorithm").T
# cm = sns.clustermap(auc_pr, figsize=(15, 20), standard_scale=1, cbar_pos=(0.9, 0.8, 0.01, 0.1))
# cm.ax_heatmap.set_xticklabels(());

In [None]:
auc_pr = all_results['AUC-PR'].unstack("algorithm").T
auc_pr_sub = auc_pr.drop(index=["Random2", "PCA", "KNN", "LOF", "Donut", "FITS", "HBOS", "SRCNN"]).rename(index=lambda x: x.replace("Sub_", "Window "))

In [None]:
auc_pr_sub

In [None]:
cm = sns.clustermap(auc_pr_sub, figsize=(15, 20), cbar_pos=(0.98, 0.85, 0.01, 0.1), method="ward", dendrogram_ratio=0.1)
cm.ax_heatmap.set_xticklabels(());

In [None]:
for col in ['season_iqr', 'trend_iqr', 'res_iqr']:
    periodicity.loc[np.array(~np.isfinite(periodicity[col])), col] = 0

In [None]:
periodicity[['season_iqr', 'trend_iqr', 'res_iqr']].max(axis=1).median()

In [None]:
periodicity['weird'] = (periodicity[['season_iqr', 'trend_iqr', 'res_iqr']].sum(axis=1) > 1.5).astype(float)

In [None]:
periodicity

In [None]:
#auc_pr_sub_score = auc_pr_sub.T.merge(periodicity[['name', 'r2', 'season_iqr', 'trend_iqr', 'res_iqr', 'weird']].fillna(0), left_on="dataset", right_on="name").set_index("name").T
auc_pr_sub_score = auc_pr_sub.T.merge(periodicity[['name',  'weird']].fillna(0), left_on="dataset", right_on="name").set_index("name").T

In [None]:
cm = sns.clustermap(auc_pr_sub_score, figsize=(15, 20), cbar_pos=(0.98, 0.85, 0.01, 0.1), method="ward", dendrogram_ratio=0.1, vmax=1)
cm.ax_heatmap.set_xticklabels(());

In [None]:
threshold = auc_pr_sub.quantile(0.8)

In [None]:
(auc_pr_sub > threshold).T.mean().sort_values()

In [None]:
good = (auc_pr_sub > threshold).T[['Window PCA', 'SR']].T.max()
good.mean()

In [None]:
(auc_pr_sub > threshold).loc[:, ~good].T.mean().sort_values()

In [None]:
good2 = (auc_pr_sub > threshold).T[['Window PCA', 'SR', 'Window KMeansAD']].T.max()
good2.mean()

In [None]:
(auc_pr_sub > threshold).loc[:, ~good2].T.mean().sort_values()

In [None]:
(auc_pr_sub > threshold).T[['Window PCA', 'SR', 'Window KMeansAD', 'Window LOF']].T.max().mean()

In [None]:
bla = (auc_pr_sub > threshold).T[['Window KMeansAD', 'Window PCA', 'SR']].T.max()

In [None]:
(auc_pr_sub > threshold).loc[:, ~bla].T.mean().sort_values()

In [None]:
from scipy.cluster.hierarchy import fcluster
import numpy as np

In [None]:
clusters = fcluster(cm.dendrogram_col.calculated_linkage, criterion='maxclust', t=20)

In [None]:
# Imported here because this cell runs before any other cell that imports it
# when the notebook is executed top to bottom.
from sklearn.metrics.cluster import adjusted_rand_score

# `clusters` is cut from the column dendrogram of the `auc_pr_sub_score`
# clustermap (when run top to bottom), so the names are taken from that same
# frame to guarantee they line up one-to-one with the cluster ids.
datasets = [x.split("_")[1] for x in auc_pr_sub_score.columns]
adjusted_rand_score(datasets, clusters)

In [None]:
from TSB_AD.utils.slidingWindows import find_length_rank

In [None]:
auc_pr = all_results['AUC-PR'].unstack("algorithm").T
cm = sns.clustermap(auc_pr_sub_score, figsize=(15, 20), cbar_pos=(0.9, 0.85, 0.01, 0.1), col_colors=plt.cm.tab20(clusters / clusters.max()), method="ward")
cm.ax_heatmap.set_xticklabels(());

In [None]:
pd.Series(clusters).value_counts()

In [None]:
clusters == 6

In [None]:
def get_score(method, dataset):
    """Load the saved score array of one method on one dataset.

    Parameters
    ----------
    method : str
        Name of the sub-directory under ``benchmark_exp/score/uni``.
    dataset : str
        File name inside that directory, without the ".npy" extension.

    Returns
    -------
    The object returned by ``np.load`` for that file.
    """
    # Use the `dataset` argument; reading a global loop variable here would
    # load whatever dataset the last loop happened to leave behind.
    return np.load(f"benchmark_exp/score/uni/{method}/{dataset}.npy")

In [None]:
from TSB_AD.utils.slidingWindows import find_length_rank

# For every dataset in cluster 3, plot the series once per method with that
# method's saved scores overlaid.
for d in auc_pr.columns[clusters == 3]:
    window_length = find_length_rank(res[d]['Data'])
    for method in ("Sub_KNN", "Random2"):
        method_scores = get_score(method, d)
        plot_detection(res[d]['Data'], res[d]['Label'], scores=method_scores, window_length=window_length)
        # Build the title from the method actually loaded so the label cannot
        # drift from the plotted scores (the second plot used to be titled
        # "Sub_IForest" while showing "Random2" scores).
        plt.title(f"{method}: {d} ({window_length})")

In [None]:
from sklearn.cluster import SpectralCoclustering, SpectralBiclustering


In [None]:
from numpy.linalg import svd

In [None]:
# Min-max scale every column of the score table to [0, 1] before biclustering.
auc_pr_scaled = (auc_pr - auc_pr.min()) / (auc_pr.max() - auc_pr.min())
# Fixed seed: the estimator is randomised, so without it the bicluster labels
# change on every run.
sco = SpectralBiclustering(n_clusters=(5, 8), random_state=0)
sco.fit(auc_pr_scaled)

In [None]:
u, s, vt = svd(auc_pr_scaled)

In [None]:
u.shape

In [None]:
vt.shape

In [None]:
from sklearn.decomposition import PCA
X_pca = PCA().fit_transform(auc_pr_scaled)

In [None]:
for i, a in enumerate(auc_pr.index):
    plt.text(X_pca[i,0],  X_pca[i, 1], a)
plt.scatter(X_pca[:,0], X_pca[:, 1])

In [None]:
X_pca = PCA().fit_transform(auc_pr_scaled.T)
datasets= [x.split("_")[1] for x in auc_pr.columns]
plt.scatter(X_pca[:,0], X_pca[:, 1], c=pd.Series(datasets, dtype="category").cat.codes, cmap=plt.cm.tab20)

In [None]:
from umap import UMAP

In [None]:
# Fixed seed so the 2-D embedding is the same on every run.
datasets_umap = UMAP(spread=2, random_state=0).fit_transform(auc_pr_scaled.T)

In [None]:
plt.scatter(datasets_umap[:,0], datasets_umap[:, 1], c=pd.Series(datasets, dtype="category").cat.codes, cmap=plt.cm.tab20)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Fixed seed so the 2-D embedding is the same on every run.
datasets_tsne = TSNE(perplexity=100, random_state=0).fit_transform(auc_pr_scaled.T)

In [None]:
tsne_df = pd.concat([pd.DataFrame(datasets_tsne), pd.Series(datasets, name="dataset")], axis=1)

In [None]:
tsne_df

In [None]:
fig = sns.scatterplot(data=tsne_df, x=0, y=1, hue="dataset", ax=plt.gca(), palette=plt.cm.tab20(range(18)))
sns.move_legend(fig, (1, -.5))

In [None]:
# Row bicluster labels in sorted order. The ordering is computed inline
# because `row_inds` is only assigned in a later cell.
sco.row_labels_[np.argsort(sco.row_labels_)]

In [None]:
row_inds = np.argsort(sco.row_labels_)

In [None]:
col_inds = np.argsort(sco.column_labels_)

In [None]:
plt.figure(figsize=(10, 10), dpi=300)
plt.imshow(np.array(auc_pr_scaled)[row_inds][:, col_inds], aspect="auto")
plt.yticks(range(len(auc_pr_scaled)), auc_pr.index[row_inds]);

In [None]:
np.array(auc_pr_scaled).shape

In [None]:
u,s,vt = svd(auc_pr_scaled)
row_inds = np.argsort(u[:, 2])
col_inds = np.argsort(vt[:, 0])
plt.figure(figsize=(10, 10), dpi=300)
plt.imshow(np.array(auc_pr_scaled)[row_inds][:, col_inds], aspect="auto")
plt.yticks(range(len(auc_pr_scaled)), auc_pr.index[row_inds]);

In [None]:
# Show the score table grouped by the spectral bicluster labels.
# `clustermap` has no `row_clusters` argument and `col_cluster` must be a
# boolean, so the labels cannot be passed in directly. Instead the rows and
# columns are ordered by label and hierarchical clustering is switched off.
# NOTE(review): assumes grouping by bicluster label was the intent -- confirm.
row_order = np.argsort(sco.row_labels_, kind="stable")
col_order = np.argsort(sco.column_labels_, kind="stable")
cm = sns.clustermap(auc_pr.iloc[row_order, col_order], figsize=(15, 20), standard_scale=1, cbar_pos=(0.9, 0.8, 0.01, 0.1), row_cluster=False, col_cluster=False)
cm.ax_heatmap.set_xticklabels(());

In [None]:
from sklearn.metrics.cluster import contingency_matrix, adjusted_rand_score

In [None]:
datasets= [x.split("_")[1] for x in auc_pr.columns]
adjusted_rand_score(datasets, clusters)

In [None]:
datasets= [x.split("_")[1] for x in auc_pr.columns]
plt.imshow(contingency_matrix(datasets, clusters))
plt.yticks(range(len(np.unique(datasets))), np.unique(datasets));

In [None]:
np.bincount(clusters)

In [None]:
for d in auc_pr.columns[clusters == 1]:
    plot_detection(res[d]['Data'], res[d]['Label'])
    plt.title(d)

In [None]:
!ls

In [None]:
for d in auc_pr.columns[clusters == 2]:
    scores = np.load(f"benchmark_exp/score/uni/MatrixProfile/{d}.npy")
    plot_detection(res[d]['Data'], res[d]['Label'], scores)
    plt.title(d)

In [None]:
for d in auc_pr.columns[clusters == 3]:
    plot_detection(res[d]['Data'], res[d]['Label'])
    plt.title(d)

In [None]:
for d in auc_pr.columns[clusters == 4]:
    plot_detection(res[d]['Data'], res[d]['Label'])
    plt.title(d)

In [None]:
for d in auc_pr.columns[clusters == 5]:
    plot_detection(res[d]['Data'], res[d]['Label'])
    plt.title(d)

In [None]:
def plot_subset(datasets):
    """Draw a bar chart counting dataset names by their second "_" token.

    Each name in ``datasets`` is split on "_" and the token at position 1 is
    used as the group; the bars show how many names fall into each group.
    """
    groups = pd.Series([name.split("_")[1] for name in datasets])
    counts = groups.value_counts()
    counts.plot(kind="bar")

In [None]:
plot_subset(auc_pr.columns[clusters == 1])

In [None]:
plot_subset(auc_pr.columns[clusters == 2])

In [None]:
plot_subset(auc_pr.columns[clusters == 3])

In [None]:
plot_subset(auc_pr.columns[clusters == 4])

In [None]:
plot_subset(auc_pr.columns[clusters == 5])

In [None]:
bla = all_results['AUC-PR'].unstack("algorithm").rank(axis=1, ascending=False).mean().sort_values().reset_index()

In [None]:
bla

In [None]:
sns.clustermap(all_results['AUC-PR'].unstack("algorithm").rank(axis=1, ascending=False).T, figsize=(15, 20))

In [None]:
vus_scores = all_results['VUS-PR'].unstack("algorithm")

In [None]:
vus_scores.T.apply(pd.Series.idxmax)

In [None]:
keep_datasets = vus_scores.T.apply(pd.Series.idxmax).isin(["Sub_PCA", "POLY"])

In [None]:
_ = cd_evaluation(vus_scores[keep_datasets].drop(columns=["Sub_PCA", "POLY"]), maximize_metric=True)

In [None]:
all_results['VUS-PR'].unstack("algorithm")

In [None]:
sns.clustermap(all_results['VUS-PR'].unstack("algorithm")[['POLY', "Sub_PCA", "Sub_IForest"]].T.rank(), figsize=(20, 20))

In [None]:
sns.clustermap(all_results['VUS-PR'].unstack("algorithm")[['POLY', "Sub_PCA", "Sub_IForest"]].T, figsize=(20, 20))