In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from munkres import Munkres
from scipy.special import comb
from PIL import Image
import random
from sklearn.neighbors import NearestNeighbors

In [12]:
def make_cost_matrix(c1, c2):
    """
    """
    uc1 = np.unique(c1)
    uc2 = np.unique(c2)
    l1 = uc1.size
    l2 = uc2.size
    assert(l1 == l2 and np.all(uc1 == uc2))

    m = np.ones([l1, l2])
    for i in range(l1):
        it_i = np.nonzero(c1 == uc1[i])[0]
        for j in range(l2):
            it_j = np.nonzero(c2 == uc2[j])[0]
            m_ij = np.intersect1d(it_j, it_i)
            m[i,j] =  -m_ij.size
    return m

def translate_clustering(clt, mapper):
    return np.array([ mapper[i] for i in clt ])



def map_label(pred, gt):
    """entry point"""

    num_labels = len(np.unique(gt))

    # cm = confusion_matrix(gt, pred, labels=range(num_labels)) # gets the confusion matrix

    cost_matrix = make_cost_matrix(pred, gt)

    m = Munkres()
    indexes = m.compute(cost_matrix)
    mapper = { old: new for (old, new) in indexes }


    new_labels = translate_clustering(pred, mapper)
    new_cm = confusion_matrix(gt, new_labels, labels=range(num_labels))
    return new_labels, mapper


In [13]:
test_dir = "test_out"
num_test = len(os.listdir(test_dir)) // 2

classes = [dir for dir in os.listdir("test_out/test_0")]
classes.sort()

data = []
labels = []
fnames = []
idx = 3
for idx in range(num_test):
    data_tmp = []
    label_tmp = []
    fnames_tmp = []
    for i, dir in enumerate(classes):
        _dir = os.path.join(f"{test_dir}/test_{idx}")
        for fname in os.listdir(os.path.join(_dir, dir)):
            data_tmp.append(np.load(os.path.join(_dir, dir, fname)))
            label_tmp.append(i)
            fnames_tmp.append(os.path.join(_dir, dir, fname))
    data.append(np.vstack(data_tmp))
    labels.append(np.array(label_tmp))
    fnames.append(fnames_tmp)




In [14]:
show_data = {}


for idx in range(num_test):
    ks = [1, 3, 5, 7, 10]
    
    for k in ks:
        knn = NearestNeighbors(n_neighbors=k+1, algorithm='brute').fit(data[idx])
        distances, indices = knn.kneighbors(data[idx])
        nn_class = labels[idx][indices]
        acc = []
        total_acc = 0
        
        for i in range(len(classes)):
            cur = 0
            mask = (labels[idx] == i)
            nn_class_msk = nn_class[mask]
            for i in range(nn_class_msk.shape[0]):
                max_num = 0
                for j in range(k):
                    if ((nn_class[i, 1:] == nn_class[i, 1+j]).sum() > max_num) or (k > 1 and (nn_class[i, 1:] == nn_class[i, 1+j]).sum() == max_num and nn_class[i, 1+j] != nn_class[i, 0]):
                        max_num = (nn_class[i, 1:] == nn_class[i, 1+j]).sum()
                        max_class = nn_class[i, 1+j]
                
                if max_class == nn_class[i, 0]:
                    cur += 1
            
            acc.append(cur / nn_class_msk.shape[0])
            total_acc += cur
        show_data[f"Top{k}_{idx}"] =  acc + [total_acc / nn_class.shape[0]] 
        


df = pd.DataFrame(data=show_data, index=classes + ["total"])
print(df)


                       Top1_0    Top3_0    Top5_0    Top7_0   Top10_0  \
1080Lines            0.922374  0.899543  0.899543  0.908676  0.913242   
1400Ripples          0.915385  0.900000  0.907692  0.915385  0.923077   
Air_Compressor       0.836735  0.806122  0.823129  0.840136  0.816327   
Blip                 0.836806  0.812500  0.833333  0.850694  0.822917   
Blip_Low_Frequency   0.880952  0.853175  0.865079  0.880952  0.876984   
Chirp                0.857143  0.785714  0.785714  0.857143  1.000000   
Extremely_Loud       0.836237  0.811847  0.832753  0.850174  0.825784   
Fast_Scattering      0.835052  0.807560  0.828179  0.845361  0.821306   
Helix                0.850000  0.800000  0.750000  0.850000  0.950000   
Koi_Fish             0.837370  0.813149  0.833910  0.851211  0.823529   
Light_Modulation     0.923077  0.871795  0.897436  0.910256  0.935897   
Low_Frequency_Burst  0.833898  0.806780  0.820339  0.837288  0.813559   
Low_Frequency_Lines  0.930481  0.903743  0.914439  

In [15]:
show_data = {}
for idx in range(num_test):
    kmeans = KMeans(n_clusters=len(classes), random_state=0, n_init="auto").fit(data[idx])
    acc = []
    total_mis = 0
    clusters = []
    new_cluster, cluster_map = map_label(kmeans.labels_, labels[idx])
    # cluster_map = {classes[old] : new for old, new in cluster_map.items()}
    # print(cluster_map)
    for i in range(len(classes)):
        mask = (labels[idx] == i)
        cluster = new_cluster[mask]
        cur_acc = (cluster == i).mean()
        acc.append(f"{cur_acc: .2f}")
        clusters.append(cluster)
        # if classes[i] == "Violin_Mode" or i == 0:
        #     print(kmeans.labels_[mask])

    cluster_class = classes
    # cluster_class = [""] * len(classes)
    # for k,v in cluster_map.items():
    #     cluster_class[v] += f"/{k}"

    # for i in range(len(cluster_class)):
    #     if cluster_class[i] == "":
    #         cluster_class[i] = "None"


    show_data[f"ACC_{idx}"] = acc + [(new_cluster == labels[idx]).mean()]

    # print(cluster_map)


df = pd.DataFrame(data=show_data, index=classes + ["total"])
print(df)


                        ACC_0     ACC_1     ACC_2
1080Lines                0.83      0.86      0.83
1400Ripples              0.00      0.79      0.00
Air_Compressor           0.99      0.98      0.96
Blip                     0.91      0.46      0.89
Blip_Low_Frequency       0.92      0.00      0.92
Chirp                    0.00      0.00      0.00
Extremely_Loud           0.20      0.26      0.25
Fast_Scattering          0.70      0.64      0.53
Helix                    0.00      0.30      0.15
Koi_Fish                 0.53      0.48      0.50
Light_Modulation         0.47      0.47      0.42
Low_Frequency_Burst      0.49      0.51      0.54
Low_Frequency_Lines      0.56      0.56      0.56
No_Glitch                0.65      0.52      0.60
Paired_Doves             0.42      0.42      0.32
Power_Line               0.99      0.98      0.97
Repeating_Blips          0.08      0.78      0.07
Scattered_Light          0.91      0.91      0.59
Scratchy                 0.00      0.00      0.77


In [16]:
# class_idx = 0
# mis_class = 0
# mask = (labels == class_idx)
# fname = np.array(fnames)[mask][clusters[class_idx] == mis_class]
# print(fname.shape)
# random.shuffle(fname)
# data = [np.array(Image.open(fname[i][:-4].replace("test", "test_im"))) for i in range(4)]
# data = np.vstack(data)
# fig = plt.figure(figsize=(30, 30)) 
# fig.add_subplot(1, 2, 1)
# plt.imshow(data)
# plt.title(f"{class_idx}-{mis_class}")




# class_idx = 0
# mis_class = 21
# mask = (labels == class_idx)
# fname = np.array(fnames)[mask][clusters[class_idx] == mis_class]
# print(fname.shape)
# random.shuffle(fname)

# data = [np.array(Image.open(fname[i][:-4].replace("test", "test_im"))) for i in range(4)]
# data = np.vstack(data)
# fig.add_subplot(1, 2, 2)
# plt.imshow(data)
# plt.title(f"{class_idx}-{mis_class}")

In [17]:
# for i, cluster in enumerate(clusters):
#     fig, ax = plt.subplots(figsize=(20, 10))
#     counts, bins, patches = ax.hist(cluster, list(range(len(classes))))
#     ax.set_title(classes[i])
#     ax.set_xlabel("Cluster")
#     ax.set_ylabel("Number")
#     ax.set_xticks(bins + 0.5)
#     ax.set_xticklabels(classes, rotation=60, ha="right")






In [18]:
def confusion(actual, pred):

    tp_plus_fp = comb(np.bincount(actual), 2).sum()
    tp_plus_fn = comb(np.bincount(pred), 2).sum()
    A = np.c_[(actual, pred)]
    tp = sum(comb(np.bincount(A[A[:, 0] == i, 1]), 2).sum()
             for i in set(actual))
    fp = tp_plus_fp - tp
    fn = tp_plus_fn - tp
    tn = comb(len(A), 2) - tp - fp - fn
    return tp, tn, fp, fn

In [19]:
show_data = {}
for idx in range(num_test):
    kmeans = KMeans(n_clusters=len(classes), random_state=0, n_init="auto").fit(data[idx])
    tp, tn, fp, fn = confusion(labels[idx], kmeans.labels_)
    total_prec = tp / (tp + fp)
    total_rec = tp / (tp + fn)
    total_acc = (tp + tn) / (tp + tn + fp + fn)
    total_ari = adjusted_rand_score(labels[idx], kmeans.labels_)
    total_nmi = normalized_mutual_info_score(labels[idx], kmeans.labels_)

    show_data[f"{idx}"] = [f"{total_prec : .2f}", f"{total_rec : .2f}", f"{total_acc : .2f}", f"{total_ari : .2f}", f"{total_nmi : .2f}"]











df = pd.DataFrame(data=show_data, index=["precision", "recall", "accuracy", "adjusted RI", "nmi"])
print(df)

                 0      1      2
precision     0.57   0.54   0.53
recall        0.45   0.49   0.45
accuracy      0.94   0.94   0.94
adjusted RI   0.47   0.48   0.45
nmi           0.68   0.69   0.68


In [20]:
from sklearn.manifold import TSNE
x_embedded = TSNE(n_components=3, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(data)




AttributeError: 'list' object has no attribute 'shape'

In [None]:
import matplotlib._color_data as mcd
palette = list(mcd.XKCD_COLORS.values())[::10]



for i in range(len(classes)):
    mask = (labels == i)
    x_show = x_embedded[mask, :][:500, :]
    plt.figure()
    plt.scatter(x_show[:, 0], x_show[:, 1], color=palette[3 * i])
    plt.title(classes[i])


plt.figure()
for i in range(len(classes)):
    mask = (labels == i)
    x_show = x_embedded[mask, :][:50, :]
    
    plt.scatter(x_show[:, 0], x_show[:, 1], color=palette[3 * i])

plt.figure()
for i in range(len(classes)):
    mask = (labels == i)
    x_show = x_embedded[mask, :][:50, :].mean(0, keepdims=True)
    
    plt.scatter(x_show[:, 0], x_show[:, 1], color=palette[3 * i], label=classes[i])
    plt.legend(bbox_to_anchor=(1.1, 1.05))



In [None]:
x_embedded = TSNE(n_components=3, learning_rate='auto',
                  init='random', perplexity=3).fit_transform(data)




In [None]:
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from plotly.graph_objs import *

# Configure Plotly to be rendered inline in the notebook.
plotly.offline.init_notebook_mode()







for i in range(len(classes)):
    mask = (labels == i)
    x_show = x_embedded[mask, :][:500, :]
    # Configure the trace.
    trace = go.Scatter3d(
        x=x_show[:, 0],  
        y=x_show[:, 1], 
        z=x_show[:, 2], 
        mode='markers',
        marker={
            'size': 10,
            'opacity': 0.8,
        },
    )

    # Configure the layout.
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        scene=Scene(
                xaxis=XAxis(title=classes[i]),
                yaxis=YAxis(title=classes[i]),
                zaxis=ZAxis(title=classes[i])
            )
    )

    data = [trace]

    plot_figure = go.Figure(data=data, layout=layout)

    # Render the plot.
    plotly.offline.iplot(plot_figure)





In [None]:
data = []
for i in range(len(classes)):
    mask = (labels == i)
    x_show = x_embedded[mask, :][:100, :]
    cluster = kmeans.labels_[mask]
    # mask = cluster == 14
    # x_show = x_show[mask]
    # Configure the trace.
    trace = go.Scatter3d(
        x=x_show[:, 0],  
        y=x_show[:, 1], 
        z=x_show[:, 2], 
        mode='markers',
        marker={
            'size': 10,
            'opacity': 0.8,
        },
    )

    # Configure the layout.
    layout = go.Layout(
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        scene=Scene(
                xaxis=XAxis(title=classes[i]),
                yaxis=YAxis(title=classes[i]),
                zaxis=ZAxis(title=classes[i])
            )
    )

    data.append(trace)

plot_figure = go.Figure(data=data, layout=layout)

# Render the plot.
plotly.offline.iplot(plot_figure)

