# Dataset Optimization - Two Moons

To reproduce this experiment from scratch, first delete (or move away) the database `two_moons.db` stored in the `results/optimization` folder. Otherwise, because the study is created with `load_if_exists=True`, the TPE optimization will load the 100 trials already stored in the existing `two_moons.db` and continue from there, running trials 101 through 200.

In [1]:
# Put the directory two levels above the working directory at the front of
# the module search path -- presumably so the project-local `rock` module
# imported below can be resolved; confirm against the repository layout.
import sys  
sys.path.insert(0, '../..')

In [2]:
# IPython autoreload, mode 2: re-import all modules before each cell is
# executed, so edits to imported source files take effect without a restart.
%load_ext autoreload
%autoreload 2

In [3]:
import optuna
from optuna.samplers import TPESampler
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

import logging
import sys

In [9]:
def objective(trial):
    """Score one two-moons configuration by ROCK's AMI margin over the baselines.

    The trial suggests a sample count and a noise level. Ten datasets are
    generated with ``make_moons`` using random states ``seed .. seed + 9``;
    each one is standardised and clustered with k-means, DBSCAN, spectral
    clustering, mean shift and ROCK. Every clustering is scored against the
    generated labels with adjusted mutual information (AMI).

    Side effect: one record per dataset (its settings plus the five AMI
    scores) is appended to the module-level ``experiment`` list.

    Args:
        trial: Optuna trial used to sample ``n_samples`` and ``jitter``.

    Returns:
        The mean ROCK AMI minus the mean, over the ten datasets, of the best
        AMI achieved by any of the four baseline algorithms.
    """
    seed = 0

    # Sampled as integers and then scaled:
    # n_samples in {100, 200, ..., 1600}, jitter in {0.01, 0.02, ..., 0.20}.
    n_samples = trial.suggest_int('n_samples', 1, 16) * 100
    jitter = trial.suggest_int('jitter', 1, 20) * 0.01

    # Same configuration, ten different random states.
    datasets = [
        make_moons(n_samples=n_samples, noise=jitter,
                   shuffle=True, random_state=s)
        for s in range(seed, seed + 10)
    ]

    rock_results = []
    other_results = []

    for step, (points, gt) in enumerate(datasets):
        data = StandardScaler().fit_transform(points)

        # Baselines with fixed hyper-parameters.
        kmeans = KMeans(n_clusters=2, random_state=seed).fit(data).labels_
        eps, min_pts = (0.2, 4)
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        spectral = SpectralClustering(n_clusters=2, random_state=seed).fit(data).labels_
        bandwidth = estimate_bandwidth(data)
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_

        rock = ROCK(tmax=15).fit(data).labels_

        # Score every clustering exactly once and reuse the values for both
        # the aggregates and the per-run record.
        rock_ami = AMI(rock, gt)
        kmeans_ami = AMI(kmeans, gt)
        dbscan_ami = AMI(dbscan, gt)
        spectral_ami = AMI(spectral, gt)
        mean_shift_ami = AMI(mean_shift, gt)

        rock_results.append(rock_ami)
        # ROCK is compared against the strongest baseline on this dataset.
        other_results.append(
            np.max([kmeans_ami, dbscan_ami, spectral_ami, mean_shift_ami]))

        # Key order determines the column order of the exported table.
        experiment.append({
            'step': step,
            'n_samples': n_samples,
            'jitter': jitter,
            'k': 2,
            'eps': eps,
            'min_pts': min_pts,
            'bandwidth': bandwidth,
            'ROCK': rock_ami,
            'K_MEANS': kmeans_ami,
            'DBSCAN': dbscan_ami,
            'SPECTRAL': spectral_ami,
            'MEAN_SHIFT': mean_shift_ami,
        })

    return np.mean(rock_results) - np.mean(other_results)

In [10]:
# Module-level accumulator: objective() appends one dict per clustered
# dataset to this list; it is turned into a DataFrame and written to CSV
# at the end of the notebook.
experiment = []

In [11]:
# Used both as the Optuna study name and as the stem of the SQLite file name.
study_name = 'two_moons'

In [12]:
# The study is stored in a SQLite database named after the study.
storage_name = f'sqlite:///../../results/optimization/{study_name}.db'

# TPE sampler with a fixed seed.
sampler = TPESampler(seed=42)

# load_if_exists=True: if the database already holds a study with this name
# it is loaded and continued instead of raising an error.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    sampler=sampler,
    load_if_exists=True,
    direction="maximize",
)

[I 2021-06-01 13:04:20,341] A new study created in RDB with name: two_moons


In [13]:
# Run 100 trials of objective(); the study was created with
# direction="maximize", so higher objective values are better.
study.optimize(objective, n_trials=100)

[I 2021-06-01 13:04:54,058] Trial 0 finished with value: 0.17165110355273971 and parameters: {'n_samples': 6, 'jitter': 20}. Best is trial 0 with value: 0.17165110355273971.
[I 2021-06-01 13:05:53,324] Trial 1 finished with value: 0.1707221500535816 and parameters: {'n_samples': 12, 'jitter': 12}. Best is trial 0 with value: 0.17165110355273971.
[I 2021-06-01 13:06:04,871] Trial 2 finished with value: -0.1777327860237462 and parameters: {'n_samples': 3, 'jitter': 4}. Best is trial 0 with value: 0.17165110355273971.
[I 2021-06-01 13:06:09,031] Trial 3 finished with value: 0.100519992181483 and parameters: {'n_samples': 1, 'jitter': 18}. Best is trial 0 with value: 0.17165110355273971.
[I 2021-06-01 13:06:59,006] Trial 4 finished with value: 0.3580777415846451 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 13:07:03,262] Trial 5 finished with value: 0.0

[I 2021-06-01 14:09:44,001] Trial 44 finished with value: 0.3196228543529997 and parameters: {'n_samples': 10, 'jitter': 14}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:10:25,271] Trial 45 finished with value: 0.3462561084025223 and parameters: {'n_samples': 9, 'jitter': 12}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:11:02,361] Trial 46 finished with value: 0.2418578202112659 and parameters: {'n_samples': 8, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:12:05,025] Trial 47 finished with value: 0.24642902178190923 and parameters: {'n_samples': 12, 'jitter': 17}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:12:37,795] Trial 48 finished with value: 0.1671843432233151 and parameters: {'n_samples': 7, 'jitter': 19}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:13:27,554] Trial 49 finished with value: 

[I 2021-06-01 14:50:29,948] Trial 88 finished with value: 0.3580777415846451 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:51:50,215] Trial 89 finished with value: 0.27018874201047866 and parameters: {'n_samples': 12, 'jitter': 14}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:52:43,856] Trial 90 finished with value: 0.2852639926243666 and parameters: {'n_samples': 9, 'jitter': 13}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:53:41,660] Trial 91 finished with value: 0.3580777415846451 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:54:40,393] Trial 92 finished with value: 0.3580777415846451 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846451.
[I 2021-06-01 14:55:56,543] Trial 93 finished with value

In [14]:
# Export the records collected in `experiment` (one row per record) to CSV.
experiment_df = pd.DataFrame(experiment)
experiment_df.to_csv('../../results/optimization/two_moons.csv')