In [1]:
from sklearn import preprocessing
import math, numpy as np

# 單一筆 data 的 z-order 值

In [2]:
def ZorderEachdataTH(data, bin_len = 3):
    """Compute the z-order value of one record by interleaving its bits.

    Each coordinate is truncated with ``int()`` and read as an unsigned
    ``bin_len``-bit integer.  Bits are interleaved most-significant level
    first; within a level the coordinates keep their original order, so
    ``data[0]`` supplies the highest bit of every group.

    Parameters
    ----------
    data : sequence of numbers
        Coordinates of a single record.  Every coordinate must satisfy
        ``0 <= int(value) < 2 ** bin_len``.
    bin_len : int, optional
        Number of bits used per coordinate (default 3).

    Returns
    -------
    int
        The interleaved value, in ``[0, 2 ** (len(data) * bin_len))``.
        An empty record yields 0.

    Raises
    ------
    ValueError
        If a coordinate does not fit in ``bin_len`` unsigned bits.
    """
    limit = 1 << bin_len

    ## validate up front: a coordinate wider than bin_len bits cannot be
    ## represented and would otherwise contribute the wrong bits silently
    cells = []
    for value in data:
        cell = int(value)
        if not 0 <= cell < limit:
            raise ValueError(
                "coordinate %r does not fit in %d unsigned bits" % (value, bin_len)
            )
        cells.append(cell)

    ## interleave: walk the bit levels from most to least significant and,
    ## inside each level, append one bit per coordinate in input order
    zvalue = 0
    for level in range(bin_len - 1, -1, -1):
        for cell in cells:
            zvalue = (zvalue << 1) | ((cell >> level) & 1)

    return zvalue

In [3]:
def ZorderEachdataRC(data, bin_len = 3):
    """Compute the z-order value of one record using shifts and masks.

    Bit ``i`` (0 = least significant) of coordinate ``idx`` is placed at
    bit position ``i * len(data) + (len(data) - 1 - idx)`` of the result,
    so ``data[0]`` supplies the highest bit of every group.  The layout is
    derived from the number of coordinates, so records of any dimension
    are supported.

    Parameters
    ----------
    data : sequence of numbers
        Coordinates of a single record.  Each one is truncated with
        ``int()``; only its lowest ``bin_len`` bits are used.
    bin_len : int, optional
        Number of bits taken from each coordinate (default 3).

    Returns
    -------
    int
        The interleaved value.  An empty record yields 0.
    """
    dim = len(data)
    zvalue = 0

    for idx, value in enumerate(data):
        cell = int(value)
        for i in range(bin_len):
            ## every bit level is dim bits wide; inside a level the first
            ## coordinate occupies the highest position
            zvalue |= ((cell >> i) & 1) << (i * dim + (dim - 1 - idx))

    return zvalue

# 一個 dataset 中，每一筆資料的 z-order 值

In [4]:
def ZorderDataset(data):
    """Return the z-order value of every row of a dataset.

    The columns are rescaled with scikit-learn's ``StandardScaler``, each
    rescaled value is mapped to one of eight integer cells (0..7), and the
    cells of each row are combined with ``ZorderEachdataTH``.

    Parameters
    ----------
    data : 2-D array-like
        One record per row; passed unchanged to ``StandardScaler``.

    Returns
    -------
    numpy.ndarray
        One z-order value per input row, in row order.
    """
    ## normalization
    standardized = preprocessing.StandardScaler().fit(data).transform(data)

    ## floor to whole units, clamp to -4 .. 3, then shift to 0 .. 7
    ## (anything below -3 lands in cell 0, anything >= 3 in cell 7)
    cells = np.clip(np.floor(standardized), -4, 3) + 4

    return np.array([ZorderEachdataTH(row) for row in cells])

# 依 z-order 值進行抽樣

In [5]:
def ZorderSampling(data, zorder, cluster_num):
    """Split a dataset into ``cluster_num`` interleaved samples along z-order.

    Rows are sorted by ascending z-order value and then dealt out
    round-robin: sample ``i`` receives the sorted rows at positions
    ``i, i + cluster_num, i + 2 * cluster_num, ...``.

    Parameters
    ----------
    data : numpy.ndarray
        Records to sample, indexed along the first axis.
    zorder : numpy.ndarray
        One z-order value per record of ``data``.  Ties are ordered
        however ``zorder.argsort()`` places them.
    cluster_num : int
        Number of samples to produce.

    Returns
    -------
    dict
        Maps each sample index ``0 .. cluster_num - 1`` to a list of the
        rows assigned to it.  Empty when ``cluster_num`` is not positive.
    """
    ## data sorted by z-order value
    data_sort = data[zorder.argsort()]

    ## A stride slice selects exactly the positions whose index modulo
    ## cluster_num equals i.  It replaces np.where over map(...): in
    ## Python 3 map returns a lazy iterator, not a boolean mask.
    return {i: list(data_sort[i::cluster_num]) for i in range(cluster_num)}

# 以 iris data 實作

In [6]:
from sklearn import datasets

In [7]:
## load the iris dataset bundled with scikit-learn:
## x holds the feature matrix, y the class label of each row
iris = datasets.load_iris()
x, y = iris.data, iris.target

In [8]:
## calculate the z-order value of every iris row (one value per row of x);
## the bare expression on the last line displays the resulting array
zorder = ZorderDataset(x)
zorder

array([1212,  244, 1200, 1200, 1204, 1212, 1200, 1200,  244, 1200, 1212,
       1200,  244,  244, 1272, 1276, 1212, 1212, 1212, 1212, 1208, 1212,
       1204, 1209, 1200,  244, 1200, 1212, 1208, 1200, 1200, 1208, 1272,
       1272, 1200, 1200, 1212, 1200,  244, 1208, 1204,  240, 1200, 1205,
       1212,  244, 1212, 1200, 1212, 1200, 3848, 3840, 3848,  968, 2884,
        972, 3840,  243, 2884,  972,  183, 2884, 2641, 2884,  494, 3848,
        972,  733, 2880,  729, 3840, 2884, 2880, 2884, 2884, 2884, 2892,
       2892, 2884,  251,  729,  251,  972, 2884,  972, 3840, 3848, 2880,
        972,  968,  968, 2884,  968,  243,  972,  972,  972, 2884,  251,
        972, 3843,  972, 2895, 2886, 2887, 3015,  960, 2894, 2890, 3855,
       3841, 2884, 2893,  969,  973, 3841, 2884, 3975, 3011, 2880, 3851,
        973, 3015, 2884, 3851, 3850, 2884, 2884, 2887, 2894, 2894, 3975,
       2887, 2884, 2882, 3015, 3843, 3840, 2884, 3849, 3851, 3849,  972,
       3851, 3851, 2893, 2880, 2885, 3841, 2884])

In [9]:
## sample by z-order value: the class label is placed in column 0 ahead of
## the features so each row keeps its label, then the rows are split into
## two interleaved samples
zsampling = ZorderSampling(np.column_stack((y, x)), zorder, 2)
iris_a, iris_b = zsampling[0], zsampling[1]

In [10]:
## first five rows of sample 0 (column 0 is the class label, the rest are features)
iris_a[:5]

[array([ 1. ,  5. ,  2. ,  3.5,  1. ]),
 array([ 1. ,  5. ,  2.3,  3.3,  1. ]),
 array([ 0. ,  4.3,  3. ,  1.1,  0.1]),
 array([ 0. ,  5. ,  3. ,  1.6,  0.2]),
 array([ 0. ,  4.8,  3. ,  1.4,  0.3])]

In [11]:
## first five rows of sample 1 (column 0 is the class label, the rest are features)
iris_b[:5]

[array([ 0. ,  4.5,  2.3,  1.3,  0.3]),
 array([ 1. ,  4.9,  2.4,  3.3,  1. ]),
 array([ 0. ,  4.8,  3. ,  1.4,  0.1]),
 array([ 0. ,  4.4,  2.9,  1.4,  0.2]),
 array([ 0. ,  4.9,  3. ,  1.4,  0.2])]

In [12]:
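# Binning applied in ZorderDataset, where x is a standardized feature value.
# Columns: range of x : value after clamping and floor  ->  cell index (value + 4)
# x < -3       : -4  ->  0
# -3 <= x < -2 : -3  ->  1
# -2 <= x < -1 : -2  ->  2
# -1 <= x < 0  : -1  ->  3
# 0 <= x < 1   :  0  ->  4
# 1 <= x < 2   :  1  ->  5
# 2 <= x < 3   :  2  ->  6
# x >= 3       :  3  ->  7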