In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings


from GaussianMixtureSelect import GaussianMixtureSelect
from KMeansSelect import KMeansSelect
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import BayesianGaussianMixture

from kdquantile import KDQuantileDiscretizer, KDDiscretizer

warnings.filterwarnings('ignore')

In [None]:
def gen_data(N, seed):
    """Generate five sorted 1-D synthetic mixture datasets with ground truth.

    Datasets (component sizes are ``int(fraction * N)``, so they are truncated
    and the total may be slightly below ``N``):

    * a: 55% Normal(1, 0.75), 30% Normal(4, 1), 15% Uniform(0, 20)
    * b: 45% Normal(1, 0.5),  45% Normal(4, 1), 10% Uniform(0, 20)
    * c: 67% Normal(1, 0.5),  33% Normal(4, 1)
    * d: 80% Exponential(scale=1), 20% 10 + Exponential(scale=4)
    * e: 50% Exponential(scale=8), 50% 100 - Exponential(scale=5)

    Parameters
    ----------
    N : int
        Nominal number of samples per dataset.
    seed : int
        Seed passed to ``np.random.seed``; note this reseeds NumPy's *global*
        random state as a side effect.

    Returns
    -------
    (x_list, true_ks, true_labels) : tuple
        ``x_list`` holds the five datasets, each sorted ascending.
        ``true_ks`` holds the number of generating components per dataset.
        ``true_labels`` holds, per dataset, an integer array aligned with the
        sorted values giving the index of the component each sample was drawn
        from.
    """
    def _sorted_with_labels(components):
        # Sort the pooled samples and apply the *same* permutation to the
        # component labels so labels[i] still describes values[i].
        # (Previously only the values were sorted, which silently turned the
        # "ground truth" into a quantile split of the sorted data.)
        values = np.concatenate(components)
        labels = np.repeat(
            np.arange(len(components)), [len(comp) for comp in components])
        order = np.argsort(values, kind="stable")
        return values[order], labels[order]

    np.random.seed(seed)
    # The order of the random draws below is kept fixed so that a given seed
    # keeps producing the same sample values.
    a1 = np.random.normal(1, 0.75, size=int(0.55*N))
    a2 = np.random.normal(4, 1, size=int(0.3*N))
    a3 = np.random.uniform(0, 20, size=int(0.15*N))
    a, a_labels = _sorted_with_labels([a1, a2, a3])

    b1 = np.random.normal(1, 0.5, size=int(0.45*N))
    b2 = np.random.normal(4, 1, size=int(0.45*N))
    b3 = np.random.uniform(0, 20, size=int(0.1*N))
    b, b_labels = _sorted_with_labels([b1, b2, b3])

    c1 = np.random.normal(1, 0.5, size=int(0.67*N))
    c2 = np.random.normal(4, 1, size=int(0.33*N))
    c, c_labels = _sorted_with_labels([c1, c2])

    dd1 = np.random.exponential(1, size=int(0.8*N))
    dd2 = 10+np.random.exponential(4, size=int(0.2*N))
    d, d_labels = _sorted_with_labels([dd1, dd2])

    e1 = np.random.exponential(8, size=int(0.5*N))
    e2 = 100 - np.random.exponential(5, size=int(0.5*N))
    e, e_labels = _sorted_with_labels([e1, e2])

    x_list = [a, b, c, d, e]
    true_ks = [3, 3, 2, 2, 2]
    true_labels = [a_labels, b_labels, c_labels, d_labels, e_labels]
    return (x_list, true_ks, true_labels)

In [None]:
# Demo of KDQuantileDiscretizer on each synthetic dataset.
# Columns = datasets; rows = raw histogram, KD-quantile transform,
# discretized output, and per-bin predicted probabilities.
x_list, true_ks, true_labels = gen_data(N=500, seed=1)
n_datasets = len(x_list)
fig, axes = plt.subplots(
    nrows=4, ncols=n_datasets, sharex='col', sharey='row', figsize=(7,5))
for curix in range(n_datasets):
    method = KDQuantileDiscretizer(enable_predict_proba=True)
    # The estimator is fed a column vector of shape (n_samples, 1).
    curx = x_list[curix].reshape(-1, 1)
    method.fit(curx)
    curp = method.kdqt_.transform(curx)
    cury = method.transform(curx)
    curz = method.predict_proba(curx)

    ax_hist, ax_quantile, ax_bins, ax_proba = axes[:, curix]
    ax_hist.hist(curx, 30)
    ax_quantile.scatter(curx, curp, s=3)
    ax_bins.scatter(curx, cury, s=3)
    # One scatter series per bin reported by the fitted discretizer.
    n_bins = method.kdd_.n_bins_[0]
    for bin_ix in range(n_bins):
        ax_proba.scatter(curx, curz[:, bin_ix], s=3)

# Row captions live on the left-most column only.
row_labels = [
    'original data',
    'KD-quantile \n transform',
    'predicted \n discretized',
    'predicted \n probabilities',
]
for row_ax, row_label in zip(axes[:, 0], row_labels):
    row_ax.set_ylabel(row_label)
fig.patch.set_facecolor('white');

In [None]:
# Comparison figure: one row per method, one column per synthetic dataset.
# Each panel is a stacked histogram of the data coloured by assigned
# cluster/bin, annotated with the number of clusters found
# (green = equals the true K, red = differs, black = ground-truth row).
(x_list, true_ks, true_labels) = gen_data(N=500, seed=1)
method_titles = [
    "Ground-truth",
    "KMeans", 
    "GMM",
    "Bayesian GMM",
    "MeanShift", 
    "HDBSCAN", 
    "KDE Local Minima", 
    "KDQuantile",
]
n_methods = len(method_titles)
max_k = 8  # upper bound on the number of clusters/components searched

fig, big_axes = plt.subplots(figsize=(8, 8), nrows=n_methods, ncols=1, sharey=True)
plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95, hspace=0.45)
# One full-width, frameless axes per method row; it only carries the row title.
for row, big_ax in enumerate(big_axes, start=1):
    big_ax.set_title(method_titles[row-1], fontsize=10)
    # tick_params expects booleans here; the strings 'off' are truthy.
    big_ax.tick_params(
        labelcolor=(1.,1.,1., 0.0), 
        top=False, bottom=False, left=False, right=False)
    big_ax.spines["top"].set_visible(False)
    big_ax.spines["right"].set_visible(False)
    big_ax.set_frame_on(False)
    big_ax.set_xticks([])
    big_ax.set_yticks([])

for curix, curx in enumerate(x_list):
    true_k = true_ks[curix]
    true_label = true_labels[curix]
    mycurx = curx.reshape(-1, 1) # sklearn expects shape (N,1) not (N,)
    # Fresh estimators for every dataset; None stands in for the ground truth.
    methods = list(zip(
        method_titles, [
            None, 
            KMeansSelect(max_clusters=max_k),
            GaussianMixtureSelect(max_components=max_k, criteria="BIC"),
            BayesianGaussianMixture(n_components=max_k),
            MeanShift(bandwidth=estimate_bandwidth(mycurx, quantile=0.2, n_samples=500), bin_seeding=True),
            HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=0.5),
            KDDiscretizer(), 
            KDQuantileDiscretizer()
        ]
    ))
    for mix, (method_name, method) in enumerate(methods):
        # Small panel laid over the row's big axes, at (row=mix, col=curix).
        ax = fig.add_subplot(
            n_methods, len(x_list), mix*len(x_list)+curix+1)
        if method_name in ("KDQuantile", "KDE Local Minima"):
            # My methods have the KBinsDiscretizer fit-transform API,
            # not the sklearn.cluster fit-predict API.
            method.fit(mycurx)
            cury = method.transform(mycurx).flatten()
        elif method_name == "HDBSCAN":
            # NOTE(review): HDBSCAN marks noise points with label -1, which
            # np.unique below counts as its own cluster -- confirm intended.
            cury = method.fit_predict(mycurx)
        elif method_name == "Ground-truth":
            cury = true_label
        else:
            method.fit(mycurx)
            cury = method.predict(mycurx)

        k_list = list(np.unique(cury))
        # Stack clusters in order of increasing mean so colours read left to right.
        k_list_sorted = sorted(k_list, key=lambda k: np.mean(curx[cury == k]))
        curxclustered = [curx[cury==k] for k in k_list_sorted]
        ax.hist(curxclustered, bins=20, stacked=True, edgecolor="white", linewidth=0.01)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.tick_params(axis='y', which='major', labelsize=8, rotation=90, direction="in", top=False, right=False, pad=0)
        ax.tick_params(axis='x', which='major', labelsize=8, direction="in")

        if method_name == "Ground-truth":
            tcolor = "black"
            k_symbol = "$K$"
        else:
            tcolor = "green" if len(k_list) == true_k else "red"
            # Raw string: "\h" is not a valid escape in a normal string literal.
            k_symbol = r"$\hat{K}$"
        # Annotation is placed in data coordinates, relative to the axis limits.
        ax.text(
            0.63*ax.get_xlim()[1],
            0.75*ax.get_ylim()[1],
            "%s=%i" % (k_symbol, len(k_list)),
            color=tcolor, fontsize=8)
        # Only the bottom row keeps its x tick marks.
        if mix != len(methods) - 1:
            ax.set_xticks([])
fig.patch.set_facecolor('white');
fig.savefig("discretizing-synthetic-N500.pdf")

In [None]:
# Benchmark: for every sample size N, every random replicate and every
# synthetic dataset, fit each method and record whether it recovered the true
# number of clusters and its adjusted Rand index against the true labels.
# NOTE: max_k is read from the notebook namespace (set in the comparison-figure cell).
n_random = 20
N_list = [100, 200, 500, 1000, 2000, 5000]
method_titles = [
    "KMeans", 
    "GMM", 
    "Bayesian GMM",
    "MeanShift", 
    "HDBSCAN", 
    "KDQuantile",
]
n_methods = len(method_titles)
res_list = []
# One row per (replicate, N, dataset, method):
#   [rix, N, dataset index, method name, K correct?, ARI]
for N in N_list:
    print(N)
    for rix in range(n_random):
        # The replicate index doubles as the data-generation seed.
        (x_list, true_ks, true_labels) = gen_data(N=N, seed=rix)
        for curix, curx in enumerate(x_list):
            true_k = true_ks[curix]
            true_label = true_labels[curix]
            mycurx = curx.reshape(-1, 1) # sklearn expects shape (N,1) not (N,)
            # Fresh estimators per dataset, paired positionally with method_titles.
            methods = list(zip(
                method_titles, [
                    KMeansSelect(max_clusters=max_k),
                    GaussianMixtureSelect(max_components=max_k, criteria="BIC"),
                    BayesianGaussianMixture(n_components=max_k),
                    MeanShift(bandwidth=estimate_bandwidth(mycurx, quantile=0.2, n_samples=500), bin_seeding=True),
                    HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=0.5),
                    KDQuantileDiscretizer(),
                ]
            ))
            for method_name, method in methods:
                if method_name == "KDQuantile":
                    # My method has the KBinsDiscretizer fit-transform API,
                    # not the sklearn.cluster fit-predict API.
                    method.fit(mycurx)
                    cury = method.transform(mycurx).flatten()
                elif method_name == "HDBSCAN":
                    cury = method.fit_predict(mycurx)
                else:
                    method.fit(mycurx)
                    cury = method.predict(mycurx)
                # Estimated K = number of distinct labels the method produced.
                k_list = list(np.unique(cury))
                ari = adjusted_rand_score(true_label, cury)
                res_list.append([rix, N, curix, method_name, len(k_list) == true_k, ari])

In [None]:
def _plot_metric_panels(summary, metric, show_legend=False, ylabel=None, n_datasets=5):
    """Draw one line plot of `metric` versus N per dataset, one line per method.

    Reads `method_titles` and `N_list` from the notebook namespace for the
    hue/style order and the x ticks.

    Parameters
    ----------
    summary : DataFrame with columns "N", "dataset", "method" and `metric`.
    metric : name of the column plotted on the y axis.
    show_legend : if True, draw the legend on the last panel only.
    ylabel : optional y-axis label applied to every panel; when None the
        label chosen by seaborn is kept.
    n_datasets : number of panels (datasets are indexed 0..n_datasets-1).

    Returns
    -------
    (fig, axes)
    """
    fig, axes = plt.subplots(figsize=(8., 2.), ncols=n_datasets, sharey=True)
    axes = np.atleast_1d(axes)
    fig.tight_layout()
    fig.subplots_adjust(top=0.98, right=0.99, wspace=0.08)
    for dataset, ax in enumerate(axes):
        sns.lineplot(
            data=summary[summary.dataset == dataset],
            x="N", y=metric,
            hue="method",
            hue_order=method_titles,
            style="method",
            style_order=method_titles,
            ax=ax, markers=True, alpha=0.8,
            legend=(show_legend and dataset == n_datasets - 1))
        # NOTE(review): set_xscale('log') runs after set_xticks and may replace
        # the fixed tick positions with the log-scale defaults -- confirm the
        # ticks in the saved figure are the intended ones before reordering.
        ax.set_xticks(N_list)
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        ax.set_xscale('log')
        ax.tick_params(axis='y', which='major', labelsize=8, direction="in", top=False, right=False, pad=0.12)
        ax.tick_params(axis='x', which='major', labelsize=8, direction="in")
    return fig, axes


# a: one row per (replicate, N, dataset, method) from the benchmark cell.
a = pd.DataFrame(data=res_list, columns=["rix", "N", "dataset", "method", "correctK", "ARI"])
# b: metrics averaged over replicates (mean of the boolean correctK = accuracy).
# Only the two metric columns are averaged; the replicate index is not a metric.
b = a.groupby(["N", "dataset", "method"])[["correctK", "ARI"]].mean().reset_index()
b = b[(b.method != "Ground-truth") &  (b.method != "KDE Local Minima")]

fig, axes = _plot_metric_panels(b, "ARI")
fig.savefig("discretizing-synthetic-ARI.pdf");

# Raw string: "\h" is not a valid escape in a normal string literal.
fig, axes = _plot_metric_panels(
    b, "correctK", show_legend=True, ylabel=r"Accuracy ($\hat{K} = K$)")
sns.move_legend(axes[-1], "upper right", bbox_to_anchor=(1.0, -0.3), title="")
fig.savefig("discretizing-synthetic-accuracy.pdf");