In [None]:
import numpy

import matplotlib.pyplot as plotter
from sklearn.preprocessing import KBinsDiscretizer

from data_generator import generate_uniform_dataset, generate_blobbed_dataset

In [None]:
# Number of samples requested from the dataset generators in the cells below.
data_points: int = 200

In [None]:
# Build the uniform dataset with `data_points` samples.
uniform_dataset = generate_uniform_dataset(
    n_data_points=data_points,
)

In [None]:
# Show Dataset
# Report the number of rows and preview only the first few, instead of
# dumping every row into the cell output.
print(len(uniform_dataset), "rows; first 5:")
print(uniform_dataset[:5])

In [None]:
# Copy the two columns of the uniform dataset into separate coordinate arrays.
arr_x = numpy.array(uniform_dataset[:, 0])
arr_y = numpy.array(uniform_dataset[:, 1])

# Draw on an explicit figure/axes pair and label the plot so it can be read
# on its own; plotter.show() renders the figure without leaving the
# PathCollection repr as the cell output.
fig, ax = plotter.subplots()
ax.scatter(x=arr_x, y=arr_y)
ax.set_title("Uniform dataset")
ax.set_xlabel("x")
ax.set_ylabel("y")
plotter.show()

In [None]:
# Build the blobbed dataset: four blob centres, one `size_blobs` entry per
# centre (presumably relative blob sizes -- confirm against data_generator).
blobbed_dataset = generate_blobbed_dataset(
    n_data_points=data_points,
    n_blobs=4,
    blob_std=0.1,
    blob_centers=[[0.7, 0.2], [0.2, 0.1], [0.3, 0.8], [0.9, 0.9]],
    size_blobs=[2, 3, 5, 7],
)

In [None]:
# Show Dataset
# Report the number of rows and preview only the first few, instead of
# dumping every row into the cell output.
print(len(blobbed_dataset), "rows; first 5:")
print(blobbed_dataset[:5])

In [None]:
# Copy the two columns of the blobbed dataset into separate coordinate arrays.
arr_x = numpy.array(blobbed_dataset[:, 0])
arr_y = numpy.array(blobbed_dataset[:, 1])

# Draw on an explicit figure/axes pair and label the plot so it can be read
# on its own; plotter.show() renders the figure without leaving the
# PathCollection repr as the cell output.
fig, ax = plotter.subplots()
ax.scatter(x=arr_x, y=arr_y)
ax.set_title("Blobbed dataset")
ax.set_xlabel("x")
ax.set_ylabel("y")
plotter.show()

In [None]:
# Names of the three binning strategies passed to KBinsDiscretizer below.
strategy_uniform, strategy_quantile, strategy_kmeans = (
    "uniform",
    "quantile",
    "kmeans",
)

# `subsample` value to pair with each strategy:
sub_sample_quantile = 200_000  # use for quantile
sub_sample_others = None       # use for kmeans or uniform

In [None]:
# Discretize every column of the uniform dataset into 4 ordinal bins using the
# k-means strategy.
# The strategy name and its matching `subsample` value come from the constants
# defined above: the integer subsample is reserved for the quantile strategy
# (some scikit-learn releases reject it for any other strategy), so k-means
# gets `sub_sample_others`.
enc = KBinsDiscretizer(n_bins=4,
                       encode="ordinal",
                       strategy=strategy_kmeans,
                       subsample=sub_sample_others)
enc.fit(uniform_dataset)
gd_data = enc.transform(uniform_dataset)

In [None]:
# Preview the ordinal bin codes: row count plus the first few rows only,
# rather than dumping every encoded row.
print(len(gd_data), "rows; first 5:")
print(gd_data[:5])

In [None]:
# Datasets for the strategy comparison.
# Note: this rebinds `uniform_dataset` to a fresh 100-point dataset, replacing
# the `data_points`-sized one created earlier in the notebook.
uniform_dataset = generate_uniform_dataset(n_data_points=100)

# Two blobs along the bottom edge.
blobbed_dataset_0 = generate_blobbed_dataset(
    n_data_points=data_points,
    n_blobs=2,
    blob_std=0.1,
    blob_centers=[[0.1, 0.1], [0.9, 0.1]],
    size_blobs=[3, 5],
)

# Three wider blobs (blob_std=0.15); the last one has by far the largest
# `size_blobs` entry.
blobbed_dataset_1 = generate_blobbed_dataset(
    n_data_points=data_points,
    n_blobs=3,
    blob_std=0.15,
    blob_centers=[[0.7, 0.2], [0.2, 0.1], [0.3, 0.8]],
    size_blobs=[2, 3, 12],
)

In [None]:
# One plot row per dataset, one plot column per strategy (plus the input column).
datasets = [uniform_dataset, blobbed_dataset_0, blobbed_dataset_1]
strategies = [strategy_uniform, strategy_quantile, strategy_kmeans]

# Number of bins requested from KBinsDiscretizer in the comparison plot.
n_bins = 2

In [None]:
# Comparison grid: one row per dataset; the first column shows the raw points
# and every further column shows the bins produced by one strategy.
n_rows = len(datasets)
n_cols = len(strategies) + 1  # +1 for the "Input data" column

figure = plotter.figure(figsize=(14, 9))
i = 1  # 1-based index of the next subplot, filled row by row
for ds_cnt, X in enumerate(datasets):
    # First column: the untouched input points.
    ax = plotter.subplot(n_rows, n_cols, i)
    ax.scatter(X[:, 0], X[:, 1], edgecolors="k")
    if ds_cnt == 0:
        ax.set_title("Input data", size=14)

    # 300 x 300 grid spanning the data's bounding box; encoding it shows which
    # bin every location of the plane falls into.
    xx, yy = numpy.meshgrid(
        numpy.linspace(X[:, 0].min(), X[:, 0].max(), 300),
        numpy.linspace(X[:, 1].min(), X[:, 1].max(), 300),
    )
    grid = numpy.c_[xx.ravel(), yy.ravel()]

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())

    i += 1
    # transform the dataset with KBinsDiscretizer
    for strategy in strategies:
        # Pair each strategy with the subsample value the notebook defines for
        # it: the integer is reserved for "quantile", the others get
        # `sub_sample_others` (some scikit-learn releases reject an integer
        # subsample for non-quantile strategies).
        if strategy == strategy_quantile:
            subsample = sub_sample_quantile
        else:
            subsample = sub_sample_others

        enc = KBinsDiscretizer(
            n_bins=n_bins, encode="ordinal", strategy=strategy, subsample=subsample
        )
        enc.fit(X)
        grid_encoded = enc.transform(grid)

        ax = plotter.subplot(n_rows, n_cols, i)

        # Bin index of column 0 (the x coordinate) at every grid point; it only
        # changes along the x axis, so it is drawn as vertical bands.
        horizontal = grid_encoded[:, 0].reshape(xx.shape)
        ax.contourf(xx, yy, horizontal, alpha=0.5)
        # Bin index of column 1 (the y coordinate); drawn as horizontal bands.
        vertical = grid_encoded[:, 1].reshape(xx.shape)
        ax.contourf(xx, yy, vertical, alpha=0.5)

        # Overlay the original points on top of the bin regions.
        ax.scatter(X[:, 0], X[:, 1], edgecolors="k")
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title("strategy='%s'" % (strategy,), size=14)

        i += 1

plotter.tight_layout()
plotter.show()

# Source: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization_strategies.html#sphx-glr-auto-examples-preprocessing-plot-discretization-strategies-py