# Subset

In [None]:
# Automatically reload imported modules when their source files change
%load_ext autoreload
%autoreload 2

import os
import sys

from pathlib import Path

# Locate the ``warbler`` directory that sits two levels above the current
# working directory, as a POSIX-style string.
path = (Path.cwd().parent.parent / 'warbler').as_posix()

# Work from inside that directory and add it to the import search path
# (presumably what lets the bare ``constant`` / ``datatype`` imports in the
# following cell resolve).
os.chdir(path)
sys.path.append(path)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots

from constant import PICKLE, SETTINGS
from datatype.dataset import Dataset
from datatype.settings import Settings
from datatype.validation import (
    jaccard_coefficient, 
    rand_index,
    variation_of_information
)
from datatype.voronoi import Builder, VoronoiFCM
from fcmeans import FCM
from sklearn.metrics import (
    adjusted_rand_score, 
    jaccard_score,
    mutual_info_score,
    silhouette_score,
    rand_score
)
from textwrap import dedent

In [None]:
# Apply the 'science' Matplotlib style to the figures drawn in this notebook
# (presumably registered by the ``scienceplots`` import — confirm).
plt.style.use('science')

In [None]:
# Do not truncate long cell values when pandas objects are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the dataset registered under the name 'segment'.
# NOTE(review): the loaded object is used like a pandas DataFrame in the
# cells below (column attributes, boolean indexing, ``.sample``) — confirm.
dataset = Dataset('segment')
dataframe = dataset.load()

In [None]:
# Pair up the two UMAP coordinate columns: after the transpose each row of
# ``x`` holds (umap_x_2d, umap_y_2d) for one record.
x = np.array([dataframe.umap_x_2d, dataframe.umap_y_2d]).T

In [None]:
# Scoring strategies applied to every fitted clustering; each class is
# instantiated once, in the order listed.
strategies = [
    strategy()
    for strategy in (
        CalinskiHarabaszScore,
        DaviesBouldinIndex,
        PartitionCoefficient,
        PartitionEntropyCoefficient,
        SilhouetteScore,
        SumOfSquaredErrors,
        XieBeniIndex,
    )
]

In [None]:
def score(i):
    """Cluster one resampled copy of ``x`` and score the result.

    A sample of the module-level ``x`` is drawn with replacement, a fuzzy
    c-means model with fixed hyperparameters is fitted to it, and every
    entry of the module-level ``strategies`` list is evaluated on that fit.

    Args:
        i: Index of the iteration. It is only used to skip iteration 42;
            it does not influence the sampling or the model.

    Returns:
        ``None`` when ``i`` is 42, otherwise a dict that maps
        ``repr(scorer)`` to the value returned by calling the scorer,
        with one entry per strategy.
    """
    # Iteration 42 is skipped on purpose by the original author; callers
    # are expected to ignore ``None`` results.
    if i == 42:
        return None

    sample = resample(x, replace=True)

    fcm = FCM(m=2.9, max_iter=200, n_clusters=14, random_state=42)
    fcm.fit(sample)

    # Hard assignment: index of the largest entry along axis 1 of ``fcm.u``
    # (presumably the per-sample membership matrix — confirm).
    label = np.argmax(fcm.u, axis=1)

    def evaluate(strategy):
        """Return the (name, value) pair produced by one strategy."""
        scorer = Scorer()
        scorer.strategy = strategy
        scorer.estimator = fcm
        scorer.label = label
        scorer.x = sample

        # The name is taken before the scorer is invoked.
        return repr(scorer), scorer()

    return dict(evaluate(strategy) for strategy in strategies)

In [None]:
# One independent, initially empty list per strategy, keyed by the
# strategy's repr; the per-iteration scores are appended to these lists.
scoring = {name: [] for name in map(repr, strategies)}

In [None]:
# Number of times score() is invoked (once per resampled copy of the data).
iteration = 10

In [None]:
# Evaluate score() once per iteration index on two workers (``n_jobs=2``);
# the tqdm wrapper shows progress as tasks are handed out.
executor = Parallel(n_jobs=2)

results = executor(
    delayed(score)(index)
    for index in tqdm(range(iteration), desc='Processing')
)

In [None]:
# Fold the result of every iteration into the per-strategy score lists.
for local in results:
    # Iterations that returned None produced no scores.
    if local is None:
        continue

    for k, v in local.items():
        # Each value is itself indexed by its own key, i.e. it is expected
        # to be a mapping holding the score under the strategy's name
        # (TODO confirm against the Scorer implementation).
        scoring[k].append(v[k])

In [None]:
# Tabulate the collected scores: one column per key of ``scoring``.
# NOTE(review): this rebinds ``score`` and shadows the score() function
# defined earlier; re-running the Parallel cell after this one would
# dispatch the DataFrame instead of the function. Consider a distinct name.
score = pd.DataFrame.from_dict(scoring)

In [None]:
# Write the score table to 'subset.csv' under TUNING, labelling the index
# column 'id'.
# NOTE(review): TUNING is not among the names imported from ``constant`` in
# the visible import cell — confirm where it is defined.
path = TUNING.joinpath('subset.csv')
score.to_csv(path, index_label='id')

In [None]:
# Column-wise mean of the scores, shown as a one-column table.
score.mean().to_frame()

In [None]:
# Column-wise standard deviation of the scores, shown as a one-column table.
score.std().to_frame()

In [None]:
figsize = (18, 9)

# Draw one line chart per strategy, plotting its score at every iteration.
for strategy in strategies:
    column = repr(strategy)
    title = ylabel = str(strategy)

    figure, ax = plt.subplots(figsize=figsize)

    ax.plot(score[column].tolist(), marker='o')

    ax.set_xlabel('Iteration')
    ax.set_ylabel(ylabel)

    # Iteration indices are whole numbers, so only place ticks on integers.
    ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    ax.set_title(title)
    ax.grid(True)

    plt.show()

In [None]:
# Number of rows to draw from each cluster.
n = 200
# Per-cluster samples, appended in iteration order.
small = []

# NOTE(review): ``unique`` is not defined in the visible cells; presumably the
# distinct values of ``fcm_label_2d`` — confirm.
for label in unique:
    # NOTE(review): ``filename`` is not used in the lines visible here.
    filename = f"sample{label}"
    
    # Rows whose ``fcm_label_2d`` equals the current label.
    subset = dataframe[dataframe.fcm_label_2d == label]
    length = len(subset)

    print(f"{n} samples from a total of {length} for cluster {label}")
    
    # Draws n rows without replacement; pandas raises ValueError when the
    # subset holds fewer than n rows.
    sample = subset.sample(n=n)
    small.append(sample)