 # Clustering Notebook Test

## Generate evenly separable data

In [1]:
import xarray as xr
import numpy as np
def create_equal_class_separation_dataset(num_clusters):
    """
    Returns an `xarray.Dataset` in the format of output from `datacube.load()`,
    which has a clear separation among classes, with all classes having
    the same number of points.

    The dataset has dimensions ('time', 'latitude', 'longitude') with shape
    (10, 10, num_clusters) and a single data variable, 'band0'. The value of
    'band0' at each point is the integer index of that point along the
    longitude axis, so each of the `num_clusters` distinct values occurs
    exactly 100 times.

    Parameters
    ----------
    num_clusters: int
        The number of clusters (distinct values) to generate.
        Must be a positive integer.

    Returns
    -------
    xarray.Dataset
        The synthetic dataset described above.

    Raises
    ------
    ValueError
        If `num_clusters` is less than 1.
    """
    if num_clusters < 1:
        raise ValueError(
            "num_clusters must be a positive integer, got {!r}".format(num_clusters))

    # Number of samples along the time and latitude axes.
    num_steps = 10
    dims = ['time', 'latitude', 'longitude']

    # Derive each coordinate from an integer range so its length is exact.
    # `np.arange` with a fractional step (e.g. `np.arange(0, 1, 1.0/n)`) can
    # return one element too many because of floating point rounding, which
    # would make the longitude coordinate disagree with the data's shape.
    coords = {
        'time': np.arange(num_steps) * 0.1,
        'latitude': np.arange(num_steps) * 0.1,
        'longitude': np.arange(num_clusters) * (1.0 / num_clusters),
    }

    # Repeat the row [0, 1, ..., num_clusters-1] for every (time, latitude) pair.
    band0 = np.tile(np.arange(num_clusters), (num_steps, num_steps, 1))
    bands = {'band0': xr.DataArray(data=band0, coords=coords, dims=dims)}
    return xr.Dataset(data_vars=bands, coords=coords)

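## Take a look at the Mosaic

Define a helper that sizes a figure so that its aspect ratio matches the latitude/longitude extent of a dataset.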

In [2]:
def figure_ratio(ds, fixed_width = 15):
    """
    Compute a `(width, height)` figure size for `ds`.

    The width is always `fixed_width`; the height is scaled so that the
    height-to-width ratio equals the ratio of the number of latitude
    entries to the number of longitude entries in `ds`.

    Parameters
    ----------
    ds:
        An object exposing `latitude` and `longitude` attributes that
        support `len()`.
    fixed_width: number
        The width of the figure.

    Returns
    -------
    tuple
        `(width, height)`.
    """
    num_rows, num_cols = len(ds.latitude), len(ds.longitude)
    # Figure units available to each longitude entry.
    units_per_col = fixed_width / num_cols
    return (fixed_width, num_rows * units_per_col)

## Perform K-Means Clustering

In [3]:
# from utils.data_cube_utilities.dc_clustering import kmeans_cluster_dataset, get_frequency_counts
import utils.data_cube_utilities.dc_clustering as dc_clustering
import importlib
importlib.reload(dc_clustering)
def get_classes_and_freq_counts(num_clusters):
    """
    Generate the synthetic dataset for `num_clusters` classes, cluster it
    with k-means, and count how often each class occurs.

    Parameters
    ----------
    num_clusters: int
        Passed both to `create_equal_class_separation_dataset()` and, as
        `n_clusters`, to `dc_clustering.kmeans_cluster_dataset()`.

    Returns
    -------
    tuple
        `(classification, freq_counts)`: the result of
        `dc_clustering.kmeans_cluster_dataset()` and the result of
        `dc_clustering.get_frequency_counts()` applied to it.
    """
    synthetic_ds = create_equal_class_separation_dataset(num_clusters=num_clusters)
    # Cluster on every data variable present in the dataset.
    band_names = list(synthetic_ds.data_vars)
    cluster_labels = dc_clustering.kmeans_cluster_dataset(
        synthetic_ds, bands=band_names, n_clusters=num_clusters)
    return cluster_labels, dc_clustering.get_frequency_counts(cluster_labels)
# Cluster three synthetic datasets of increasing class count. 12 is included
# deliberately as a cluster count that is not a power of two.
classification_4, freq_counts_4 = get_classes_and_freq_counts(num_clusters=4)
classification_8, freq_counts_8 = get_classes_and_freq_counts(num_clusters=8)
classification_12, freq_counts_12 = get_classes_and_freq_counts(num_clusters=12)

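## Visualize the clusters

Print the number and fraction of data points assigned to each class. Because the data is evenly separable, every class is expected to contain the same number of points.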

In [4]:
# Report the absolute and relative size of each class in the 4-cluster result.
for class_num, (freq, fractional_freq) in freq_counts_4:
    summary = "There were {} data points in class {}, comprising {:.2%} of all data points.".format(
        freq, class_num, fractional_freq)
    print(summary)

There were 100 data points in class 0, comprising 25.00% of all data points.
There were 100 data points in class 1, comprising 25.00% of all data points.
There were 100 data points in class 2, comprising 25.00% of all data points.
There were 100 data points in class 3, comprising 25.00% of all data points.


In [5]:
# Report the absolute and relative size of each class in the 8-cluster result.
for class_num, (freq, fractional_freq) in freq_counts_8:
    summary = "There were {} data points in class {}, comprising {:.2%} of all data points.".format(
        freq, class_num, fractional_freq)
    print(summary)

There were 100 data points in class 0, comprising 12.50% of all data points.
There were 100 data points in class 1, comprising 12.50% of all data points.
There were 100 data points in class 2, comprising 12.50% of all data points.
There were 100 data points in class 3, comprising 12.50% of all data points.
There were 100 data points in class 4, comprising 12.50% of all data points.
There were 100 data points in class 5, comprising 12.50% of all data points.
There were 100 data points in class 6, comprising 12.50% of all data points.
There were 100 data points in class 7, comprising 12.50% of all data points.


In [6]:
# Report the absolute and relative size of each class in the 12-cluster result.
for class_num, (freq, fractional_freq) in freq_counts_12:
    summary = "There were {} data points in class {}, comprising {:.2%} of all data points.".format(
        freq, class_num, fractional_freq)
    print(summary)

There were 100 data points in class 0, comprising 8.33% of all data points.
There were 100 data points in class 1, comprising 8.33% of all data points.
There were 100 data points in class 2, comprising 8.33% of all data points.
There were 100 data points in class 3, comprising 8.33% of all data points.
There were 100 data points in class 4, comprising 8.33% of all data points.
There were 100 data points in class 5, comprising 8.33% of all data points.
There were 100 data points in class 6, comprising 8.33% of all data points.
There were 100 data points in class 7, comprising 8.33% of all data points.
There were 100 data points in class 8, comprising 8.33% of all data points.
There were 100 data points in class 9, comprising 8.33% of all data points.
There were 100 data points in class 10, comprising 8.33% of all data points.
There were 100 data points in class 11, comprising 8.33% of all data points.
