In [1]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


import altair as alt
import hdbscan

import umap


In [2]:
# Dataset identifiers, one per line. Each name is interpolated into the
# per-dataset metric CSV path and reused as the dictionary key for that
# dataset's results, so order and spelling matter (later cells index by position).
datasets = [
    'boston',
    'breastcancercoimbra',
    'breastcancerwisconsinprognostic',
    'covertype',
    'dermatology',
    'drybean',
    'echocardiogram',
    'ecoli',
    'extyaleb',
    'glassidentification',
    'heartdisease',
    'hepatitis',
    'housing',
    'iris',
    'mnist64',
    'olive',
    'weather',
    'wine',
    'world12d',
]

In [3]:
# Names of the quality metrics, in a fixed order.
# NOTE(review): presumably these match the metric column headers of the
# per-dataset CSV files -- confirm; nothing in the visible cells reads this list.
metric_type = [
    'DTM_KL1', 'DTM_KL01', 'DTM_KL001',
    'RMSE', 'Sammon', 'Spearman',
    'Trustworthiness', 'Continuity',
    'Steadiness', 'Cohesiveness',
]

In [4]:
# Per-dataset results, keyed by dataset name:
#   raw             -> DataFrame of metric columns + 'method' + both cluster label columns
#   normalized_data -> ndarray, every metric column rescaled to unit L2 norm
#   scaled_data     -> ndarray, every metric column standardised (zero mean, unit variance)
normalized_data = {}
scaled_data = {}
raw = {}

for dataset in datasets:
    metrics_df = pd.read_csv(f'./metric_raw/{dataset}_metrics.csv', index_col='num')

    # Only the metric columns are fed to the scalers / clusterers; 'method'
    # is a label column. drop() returns a new frame, so the loaded table is
    # never mutated in place.
    features = metrics_df.drop(columns=['method'])

    scaled = StandardScaler().fit_transform(features)
    # axis=0 normalises each column (feature) rather than each row (sample).
    np_normalized = preprocessing.normalize(features, axis=0)

    # random_state and n_init are pinned so the k-means labels are identical
    # on every Restart & Run All and across scikit-learn versions (the
    # default n_init differs between releases).
    kmeans_result = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(np_normalized)
    hdbscan_result = hdbscan.HDBSCAN(min_cluster_size=3).fit_predict(np_normalized)

    # Same column layout as before: metrics, then 'method', then cluster labels.
    labelled = features.assign(
        method=metrics_df['method'],
        cluster_kmeans=kmeans_result,
        cluster_hdbscan=hdbscan_result,
    )

    raw[dataset] = labelled
    normalized_data[dataset] = np_normalized
    scaled_data[dataset] = scaled

    # cluster label -> list of row ids ('num') belonging to that cluster.
    # Labels are cast to plain int so the dicts can be passed to json.dump.
    cluster_kmeans = {
        int(label): members.index.tolist()
        for label, members in labelled.groupby('cluster_kmeans')
    }
    cluster_hdbscan = {
        int(label): members.index.tolist()
        for label, members in labelled.groupby('cluster_hdbscan')
    }

    # with open(f'hdbscan/clustering_{dataset}.json', 'w') as f:
    #     json.dump(cluster_hdbscan, f)

    # with open(f'kmeans/clustering_{dataset}.json', 'w') as f:
    #     json.dump(cluster_kmeans, f)

    # labelled.to_json(f'./metric/{dataset}_metrics.json', orient='index')
    

In [82]:
# NOTE(review): UMAP is stochastic and no random_state is passed, so the
# layouts differ between runs. Passing random_state makes them reproducible
# at the cost of single-threaded execution -- decide which is preferred.
reducer = umap.UMAP(n_neighbors=3, min_dist=0.1, n_components=2)


def _umap_method_scatter(frame, title):
    """Return a 200x200 scatter of columns 'x'/'y' from `frame`, coloured by 'method'."""
    return (
        alt.Chart(frame)
        .mark_circle(size=10)
        .encode(x='x', y='y', color='method')
        .properties(title=title, width=200, height=200)
    )


def _umap_chart_row(dataset):
    """Build one row for `dataset`: normalized embedding | standardized embedding.

    The 2-D coordinates are attached to the dataset's table by position
    (`assign`) instead of merging a RangeIndex frame onto the 'num' index.
    The embedding rows are in the same order as the table rows, so this stays
    correct even when 'num' does not run 0..n-1, where the inner merge would
    silently drop or misalign points.
    """
    raw_d = raw[dataset]
    panels = []
    for label, features in (
        ('normalized', normalized_data[dataset]),
        ('standard', scaled_data[dataset]),
    ):
        embedding = reducer.fit_transform(features)
        located = raw_d.assign(x=embedding[:, 0], y=embedding[:, 1])
        panels.append(_umap_method_scatter(located, dataset + ' ' + label))
    return panels[0] | panels[1]


# Stack one row per dataset, top to bottom, in `datasets` order.
charts = None
for dataset in datasets:
    res_chart = _umap_chart_row(dataset)
    if charts is None:
        charts = res_chart
    else:
        charts &= res_chart

In [None]:
# Caption the stacked UMAP grid with the reducer settings, then leave the
# chart as the cell's last expression so the notebook renders it.
# Expects `charts` to be the grid produced by the UMAP cell.
umap_title = 'UMAP(n_neighbors = 3, min_dist = 0.1, n_components=2)'
charts.title = umap_title
charts

In [5]:
from sklearn.manifold import TSNE

In [None]:
def _tsne_method_scatter(frame, title):
    """Return a 200x200 scatter of columns 'x'/'y' from `frame`, coloured by 'method'."""
    return (
        alt.Chart(frame)
        .mark_circle(size=10)
        .encode(x='x', y='y', color='method')
        .properties(title=title, width=200, height=200)
    )


def _tsne_chart_row(dataset, perplexity, title_n, title_s):
    """Build one row for `dataset` at `perplexity`: normalized | standardized.

    `title_n` / `title_s` are the panel titles for the normalized and the
    standardized embedding respectively.

    Coordinates are attached to the dataset's table by position (`assign`)
    rather than by merging a RangeIndex frame onto the 'num' index, so points
    cannot be dropped or misaligned when 'num' is not exactly 0..n-1.
    """
    # Fixed seed so the layout is the same on every run.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    raw_d = raw[dataset]

    emb_n = tsne.fit_transform(normalized_data[dataset])
    emb_s = tsne.fit_transform(scaled_data[dataset])

    res_n = raw_d.assign(x=emb_n[:, 0], y=emb_n[:, 1])
    res_s = raw_d.assign(x=emb_s[:, 0], y=emb_s[:, 1])
    return _tsne_method_scatter(res_n, title_n) | _tsne_method_scatter(res_s, title_s)


# dataset name -> grid of t-SNE scatters (one row per perplexity value).
chartss = {}

# Every dataset gets an entry. The previous `datasets[1:]` slice skipped the
# first dataset, so chartss[datasets[0]] raised KeyError.
for dataset in datasets:
    # A dedicated name is used for the grid so the UMAP `charts` object from
    # the earlier cell is not overwritten by this cell.
    tsne_grid = None
    for perplexity in [5, 10, 15, 20, 30, 40, 50]:
        suffix = 'perplexity : ' + str(perplexity)
        if tsne_grid is None:
            # Only the first row carries the n / s prefixes, which act as
            # column headers (left = normalized, right = standardized).
            tsne_grid = _tsne_chart_row(dataset, perplexity, 'n /  ' + suffix, 's /' + suffix)
        else:
            tsne_grid &= _tsne_chart_row(dataset, perplexity, suffix, suffix)

    tsne_grid.title = 't-SNE /' + dataset
    chartss[dataset] = tsne_grid




# for dataset in datasets[1:]:
#     res_n = tsne.fit_transform(normalized_data[dataset])
#     res_s = tsne.fit_transform(scaled_data[dataset])

#     res_n = pd.DataFrame(res_n)
#     res_s = pd.DataFrame(res_s)

#     res_n.columns = ['x', 'y']
#     res_s.columns = ['x', 'y']
#     res_n.index.name = 'num'
#     res_s.index.name = 'num'

#     raw_d = raw[dataset]
#     res_n = raw_d.merge(res_n, left_index=True, right_index=True)
#     res_s = raw_d.merge(res_s, left_index=True, right_index=True)

#     res_chart = alt.Chart(res_n).mark_circle(size=10).encode(
#         x='x',
#         y='y',
#         color='method',
#     ).properties(title=dataset+' normalized', width=200, height=200) | alt.Chart(res_s).mark_circle(size=10).encode(
#         x='x',
#         y='y',
#         color='method',
#         ).properties(title=dataset+' standard', width=200, height=200)

#     charts &= res_chart

In [38]:
# Pick a dataset by its position in `datasets`, echo the position and the
# name (one per line), then render that dataset's t-SNE grid as the cell output.
i = 18
print(i, datasets[i], sep='\n')
chartss[datasets[i]]

18
world12d
