## Blobs: varying the number of dimensions

In [1]:
import sys  
sys.path.insert(0, '../../')

In [2]:
import optuna
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

import logging
import sys

In [3]:
seed = 0
n_samples = 300
n_features = 3
n_centers = 2
datasets = []

feature_grid = [2, 3, 4, 5, 10, 15, 20]

In [4]:
experiment = []
rock_results = []
other_results = []

In [5]:
def getBlobDensities(n_centers):
  return np.round([3/n_centers * x for x in range(1, n_centers + 1)], decimals=2)

In [7]:
for f in feature_grid:
    step = 0
    for s in range(seed, seed+10):
        dataset = make_blobs(n_samples=n_samples, centers=n_centers, n_features=f, 
                             cluster_std=getBlobDensities(n_centers), random_state=s)
        run = {}
        run['step'] = step
        run['n_samples'] = n_samples
        run['n_centers'] = n_centers
        run['features'] = f
        scaler = StandardScaler()
        data = scaler.fit_transform(dataset[0])
        kmeans = KMeans(n_clusters=n_centers, random_state=seed).fit(data).labels_
        eps, min_pts = (0.2, n_features * 2)
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        run['eps'] = n_features * 2
        run['min_pts'] = min_pts
        spectral = SpectralClustering(n_clusters=2, random_state=seed).fit(data).labels_
        bandwidth = estimate_bandwidth(data)

        run['bandwidth'] = bandwidth
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_

        rock = ROCK(tmax=15).fit(data).labels_

        gt = dataset[1]
        rock_results.append(AMI(rock, gt))
        other_results.append(np.max([AMI(kmeans, gt), AMI(dbscan, gt), AMI(spectral, gt), AMI(mean_shift, gt)]))

        run['ROCK'] = AMI(rock, gt)
        run['K_MEANS'] = AMI(kmeans, gt)
        run['DBSCAN'] = AMI(dbscan, gt)
        run['SPECTRAL'] = AMI(spectral, gt)
        run['MEAN_SHIFT'] = AMI(mean_shift, gt)

        step += 1

        experiment.append(run)

In [8]:
pd.DataFrame(experiment).to_csv('../../results/analysis/den_blobs_analysis_dim.csv')

In [9]:
pd.DataFrame(experiment)

Unnamed: 0,step,n_samples,n_centers,features,eps,min_pts,bandwidth,ROCK,K_MEANS,DBSCAN,SPECTRAL,MEAN_SHIFT
0,0,300,2,2,6,6,1.245013,0.005039,0.298242,0.239683,0.299603,0.000000
1,1,300,2,2,6,6,0.896702,0.971025,0.948730,0.590690,0.948730,0.957038
2,2,300,2,2,6,6,1.175592,0.721544,0.585288,0.450097,0.545164,0.757600
3,3,300,2,2,6,6,1.160315,0.583578,0.566263,0.385952,0.483559,0.586298
4,4,300,2,2,6,6,1.256347,0.000000,0.332521,0.197635,0.268162,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
65,5,300,2,20,6,6,3.891290,1.000000,1.000000,0.000000,1.000000,0.985680
66,6,300,2,20,6,6,4.246561,1.000000,1.000000,0.000000,1.000000,0.934729
67,7,300,2,20,6,6,4.530091,1.000000,1.000000,0.000000,1.000000,0.985680
68,8,300,2,20,6,6,4.217402,1.000000,1.000000,0.000000,1.000000,0.864755
