# Unsupervised discretization

Dataset: clean_pageblock (M)

Updated at: 22 June 22

By: Sam

### About Dataset

Number of Instances: 5473.

Number of Attributes: 10 numeric attributes
   - height: Height of the block.
   - length: Length of the block.
   - area: Area of the block (height * length);
   - eccen: Eccentricity of the block (length / height);
   - p_black: Percentage of black pixels within the block (blackpix / area);
   - p_and: Percentage of black pixels after the application of the Run Length Smoothing Algorithm (RLSA) (blackand / area);
   - mean_tr: Mean number of white-black transitions (blackpix / wb_trans);
   - blackpix: Total number of black pixels in the original bitmap of the block (stored in the CSV as column `blacpix`).
   - blackand: Total number of black pixels in the bitmap of the block after the RLSA.
   - wb_trans: Number of white-black transitions in the original bitmap of the block.

Missing Attribute Values:  No missing value.


In [2]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit

In [3]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised

In [4]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [5]:
# Load dataset: read the cleaned page-blocks CSV (the path is relative to the working directory)
data = pd.read_csv('clean_pageblock.csv')

In [6]:
# Summarise the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   height    5473 non-null   int64  
 1   length    5473 non-null   int64  
 2   area      5473 non-null   int64  
 3   eccen     5473 non-null   float64
 4   p_black   5473 non-null   float64
 5   p_and     5473 non-null   float64
 6   mean_tr   5473 non-null   float64
 7   blacpix   5473 non-null   int64  
 8   blackand  5473 non-null   int64  
 9   wb_trans  5473 non-null   int64  
 10  class     5473 non-null   int64  
dtypes: float64(4), int64(7)
memory usage: 470.5 KB


In [7]:
# Convert outcome to categorical so that the numeric-column selection that follows
# (select_dtypes(include=np.number)) leaves the class label out of the discretization
data['class'] = pd.Categorical(data['class'])

In [8]:
# Names of every numeric column, as a plain list: these are the attributes to discretize
num_col = data.select_dtypes(include=np.number).columns.tolist()

In [9]:
# Display the list of columns that will be discretized
num_col

['height',
 'length',
 'area',
 'eccen',
 'p_black',
 'p_and',
 'mean_tr',
 'blacpix',
 'blackand',
 'wb_trans']

## Equal Width Discretization

In [10]:
# Define function: Inputs: dataset, number of bins (k)

def ewd_disc(data, k):
    """Discretize the numeric attributes into ``k`` equal-width bins.

    Parameters
    ----------
    data : DataFrame
        Dataset to discretize; it must contain the columns named in the
        notebook-level list ``num_col``.
    k : int
        Desired number of equal-width intervals (bins) per variable.

    Returns
    -------
    DataFrame
        The output of the fitted discretiser's ``transform``. Because
        ``return_boundaries=False`` is used, the discretized columns hold
        integer bin codes rather than interval boundaries.
    """
    # EqualWidthDiscretiser settings used here:
    #   bins              -> desired number of equal-width intervals
    #   variables         -> numerical columns to transform (None would let the
    #                        discretiser select all numerical variables itself)
    #   return_boundaries -> False returns integers, True the interval boundaries
    # return_object is left at its default (False), so the codes stay numeric.
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    # Fitting learns the interval limits of each variable; they are stored in
    # discretiser.binner_dict_ if they need to be inspected.
    discretiser.fit(data)
    # Return the dataset after discretization.
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [11]:
# Perform discretization (EWD, k = 4) and time it
k = 4
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds for a single run
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  4 : 0.05166006088256836


In [12]:
# OUTPUT: preview the first rows of the k = 4 equal-width result
data_ewd1.head()

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blacpix,blackand,wb_trans,class
0,0,0,0,0,1,2,0,0,0,0,1
1,0,0,0,0,1,3,0,0,0,0,1
2,0,0,0,0,0,2,0,0,0,0,1
3,0,0,0,0,1,2,0,0,0,0,1
4,0,0,0,0,1,3,0,0,0,0,1


In [13]:
## OUTPUT: number of instances that fall in each interval of data_ewd1.
# Equal-width bins split the value range evenly, not the observations,
# so the counts per bin are generally unequal.
for feature in num_col:
    print(feature)
    bin_counts = data_ewd1.groupby(feature)[feature].count()
    print(bin_counts)

height
height
0    5466
1       5
2       1
3       1
Name: height, dtype: int64
length
length
0    4391
1     644
2     261
3     177
Name: length, dtype: int64
area
area
0    5461
1       4
2       5
3       3
Name: area, dtype: int64
eccen
eccen
0    5393
1      68
2      10
3       2
Name: eccen, dtype: int64
p_black
p_black
0    1852
1    2946
2     395
3     280
Name: p_black, dtype: int64
p_and
p_and
0      68
1     344
2    1863
3    3198
Name: p_and, dtype: int64
mean_tr
mean_tr
0    5472
3       1
Name: mean_tr, dtype: int64
blacpix
blacpix
0    5453
1      12
2       4
3       4
Name: blacpix, dtype: int64
blackand
blackand
0    5447
1      16
2       6
3       4
Name: blackand, dtype: int64
wb_trans
wb_trans
0    5446
1      17
2       8
3       2
Name: wb_trans, dtype: int64


### EWD - Scenario 2: k = 7

In [14]:
# Perform discretization (EWD, k = 7) and time it
k = 7
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds for a single run
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  7 : 0.04986214637756348


In [15]:
# OUTPUT: preview the first rows of the k = 7 equal-width result
data_ewd2.head()

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blacpix,blackand,wb_trans,class
0,0,0,0,0,2,4,0,0,0,0,1
1,0,0,0,0,2,6,0,0,0,0,1
2,0,0,0,0,1,5,0,0,0,0,1
3,0,0,0,0,2,5,0,0,0,0,1
4,0,0,0,0,3,6,0,0,0,0,1


In [16]:
## OUTPUT: number of instances that fall in each interval of data_ewd2.
# Equal-width bins split the value range evenly, not the observations,
# so the counts per bin are generally unequal.
for feature in num_col:
    print(feature)
    bin_counts = data_ewd2.groupby(feature)[feature].count()
    print(bin_counts)

height
height
0    5447
1      20
2       4
3       1
6       1
Name: height, dtype: int64
length
length
0    3615
1     886
2     338
3     291
4     129
5      84
6     130
Name: length, dtype: int64
area
area
0    5444
1      18
2       2
3       4
4       2
6       3
Name: area, dtype: int64
eccen
eccen
0    5348
1      55
2      31
3      36
4       1
5       1
6       1
Name: eccen, dtype: int64
p_black
p_black
0     522
1    1931
2    1967
3     585
4     176
5      68
6     224
Name: p_black, dtype: int64
p_and
p_and
0      27
1      59
2     148
3     518
4    1144
5    1519
6    2058
Name: p_and, dtype: int64
mean_tr
mean_tr
0    5472
6       1
Name: mean_tr, dtype: int64
blacpix
blacpix
0    5421
1      36
2       6
3       4
4       2
5       3
6       1
Name: blacpix, dtype: int64
blackand
blackand
0    5413
1      39
2      10
3       7
5       2
6       2
Name: blackand, dtype: int64
wb_trans
wb_trans
0    5276
1     176
2       9
3       8
4       1
5       1
6       2


### EWD - Scenario 3: k = 10

In [17]:
# Perform discretization (EWD, k = 10) and time it
k = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds for a single run
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  10 : 0.04728507995605469


In [18]:
# OUTPUT: preview the first rows of the k = 10 equal-width result
data_ewd3.head()

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blacpix,blackand,wb_trans,class
0,0,0,0,0,3,6,0,0,0,0,1
1,0,0,0,0,3,8,0,0,0,0,1
2,0,0,0,0,2,7,0,0,0,0,1
3,0,0,0,0,3,7,0,0,0,0,1
4,0,0,0,0,4,9,0,0,0,0,1


In [19]:
## OUTPUT: number of instances that fall in each interval of data_ewd3.
# Equal-width bins split the value range evenly, not the observations,
# so the counts per bin are generally unequal.
for feature in num_col:
    print(feature)
    bin_counts = data_ewd3.groupby(feature)[feature].count()
    print(bin_counts)

height
height
0    5427
1      29
2      11
3       4
5       1
9       1
Name: height, dtype: int64
length
length
0    3210
1     949
2     380
3     261
4     235
5     125
6      91
7      71
8      33
9     118
Name: length, dtype: int64
area
area
0    5436
1      25
2       1
3       2
4       1
5       3
6       2
9       3
Name: area, dtype: int64
eccen
eccen
0    5250
1     130
2      28
3      25
4      28
5       9
7       2
9       1
Name: eccen, dtype: int64
p_black
p_black
0     229
1     863
2    1624
3    1460
4     622
5     262
6     109
7      54
8      60
9     190
Name: p_black, dtype: int64
p_and
p_and
0      16
1      27
2      53
3     102
4     214
5     533
6     811
7    1069
8    1060
9    1588
Name: p_and, dtype: int64
mean_tr
mean_tr
0    5471
1       1
9       1
Name: mean_tr, dtype: int64
blacpix
blacpix
0    5388
1      49
2      20
3       5
4       3
5       3
6       1
8       3
9       1
Name: blacpix, dtype: int64
blackand
blackand
0    5383
1      

## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameter:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [20]:
def efd_disc(data, k):
    """Discretize the numeric attributes into ``k`` equal-frequency bins.

    Parameters
    ----------
    data : DataFrame
        Dataset to discretize; it must contain the columns named in the
        notebook-level list ``num_col``.
    k : int
        Desired number of equal-frequency intervals, i.e. the number of
        quantiles into which each variable is divided.

    Returns
    -------
    DataFrame
        The output of the fitted discretiser's ``transform``.
    """
    # Set up the discretisation transformer for the numeric columns only.
    discretiser = efd(q=k, variables=num_col)
    # Fitting learns the interval limits of each variable; they are stored in
    # discretiser.binner_dict_ if they need to be inspected.
    discretiser.fit(data)
    return discretiser.transform(data)

### Function `efd_disc` (defined above): inputs are the dataset and the number of intervals (k)

### EFD - Scenario 1: k = 4

In [21]:
# Perform discretization (EFD, k = 4) and time it
k = 4
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds for a single run
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  4 : 0.06009697914123535


In [22]:
## OUTPUT: number of instances that fall in each interval of data_efd1
for feature in num_col:
    print(feature)
    bin_counts = data_efd1.groupby(feature)[feature].count()
    print(bin_counts)

height
height
0    1916
1     940
2    1533
3    1084
Name: height, dtype: int64
length
length
0    1384
1    1377
2    1360
3    1352
Name: length, dtype: int64
area
area
0    1376
1    1362
2    1369
3    1366
Name: area, dtype: int64
eccen
eccen
0    1370
1    1373
2    1364
3    1366
Name: eccen, dtype: int64
p_black
p_black
0    1376
1    1364
2    1376
3    1357
Name: p_black, dtype: int64
p_and
p_and
0    1375
1    1362
2    1382
3    1354
Name: p_and, dtype: int64
mean_tr
mean_tr
0    1391
1    1351
2    1368
3    1363
Name: mean_tr, dtype: int64
blacpix
blacpix
0    1386
1    1359
2    1362
3    1366
Name: blacpix, dtype: int64
blackand
blackand
0    1375
1    1370
2    1362
3    1366
Name: blackand, dtype: int64
wb_trans
wb_trans
0    1403
1    1349
2    1361
3    1360
Name: wb_trans, dtype: int64


### EFD - Scenario 2: k = 7

In [23]:
# Perform discretization (EFD, k = 7) and time it
k = 7
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds for a single run
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  7 : 0.05820107460021973


In [24]:
## OUTPUT: dtypes of the discretized frame
data_efd2.info()
## OUTPUT: number of instances that fall in each interval of data_efd2
for feature in num_col:
    print(feature)
    bin_counts = data_efd2.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0    1085
1     831
2     940
3     903
4     630
5     326
6     758
Name: height, dtype: int64
length
length
0    792
1    790
2    795
3    756
4    793
5    765
6    782
Name: length, dtype: int64
area
area
0    819
1    752
2    777
3    781
4    781
5    781
6    782
Name: area, dtype: int64
eccen


### EFD - Scenario 3: k = 10

In [25]:
# Perform discretization (EFD, k = 10) and time it
k = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds for a single run
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  10 : 0.05880284309387207


In [26]:
## OUTPUT: dtypes of the discretized frame
data_efd3.info()
## OUTPUT: number of instances that fall in each interval of data_efd3
for feature in num_col:
    print(feature)
    bin_counts = data_efd3.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0     734
1    1182
2     940
3     903
4     630
5     593
6     491
Name: height, dtype: int64
length
length
0    623
1    526
2    523
3    535
4    554
5    540
6    542
7    535
8    549
9    546
Name: length, dtype: int64
area
area
0    554
1    602
2    509
3    538
4    535
5    553
6    541
7   

## Fixed Frequency Discretization - FFD

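### Define function ffd_disc: reuse the equal-frequency discretiser (efd) with a derived number of bins
Inputs are the dataset and the interval frequency (m); the number of bins is computed as n / m, where n is the number of instances.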

In [27]:
def ffd_disc(data, m):
    """Fixed-frequency discretisation built on the equal-frequency discretiser.

    The interval frequency ``m`` (target number of instances per interval) is
    converted into a number of quantile bins, ``round(n / m)`` with ``n`` the
    number of rows, and that count is passed to the equal-frequency
    discretiser as ``q``.

    Parameters
    ----------
    data : DataFrame
        Dataset to discretize; it must contain the columns named in the
        notebook-level list ``num_col``.
    m : int or float
        Desired number of instances per interval. Must be positive.

    Returns
    -------
    DataFrame
        The output of the fitted discretiser's ``transform``.

    Raises
    ------
    ValueError
        If ``m`` is zero or negative.

    Notes
    -----
    The number of distinct bins actually produced can be smaller than
    ``n / m`` for columns with many repeated values (see the ``height``
    counts in the scenarios that follow) - presumably because identical
    quantile edges are merged; confirm against the discretiser's docs.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (m == 0)
    # or a negative bin count (m < 0).
    if m <= 0:
        raise ValueError(f"interval frequency m must be positive, got {m!r}")
    n = len(data)
    # number of bins = n/m; round() gives 0 when m is more than twice n, so
    # keep at least one bin.
    n_bins = max(1, round(n / m))
    discretiser = efd(q=n_bins, variables=num_col)
    # Fitting learns the interval limits of each variable; they are stored in
    # discretiser.binner_dict_ if they need to be inspected.
    discretiser.fit(data)
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [28]:
# Perform discretization (FFD, m = 10) and time it
m = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds for a single run
print("Discretization time, FFD,  m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD,  m =  10 : 0.19887685775756836


In [29]:
## OUTPUT: dtypes of the discretized frame
data_ffd1.info()
## OUTPUT: number of instances that fall in each interval of data_ffd1
for feature in num_col:
    print(feature)
    bin_counts = data_ffd1.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0     341
2      58
3      64
4     271
5     351
6     831
7     940
8     903
9     630
10    326
11    164
12    103
14     89
15     20
16     46
17     35
19     10
20     11
21     13
22     15
23     16
24     39
25     33
26      6
27      7
28     15
29      5
30     12
31     13
32      5
33   

### FFD - Scenario 2: m = 30

In [30]:
# Perform discretization (FFD, m = 30) and time it
m = 30
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds for a single run
# The label now reads FFD: this cell times ffd_disc, not the EFD scenarios.
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, EFD, m =  30 : 0.09447002410888672


In [31]:
## OUTPUT: dtypes of the discretized frame
data_ffd2.info()
## OUTPUT: number of instances that fall in each interval of data_ffd2
for feature in num_col:
    print(feature)
    bin_counts = data_ffd2.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0     341
1      58
2      64
3     271
4     351
5     831
6     940
7     903
9     630
10    326
11    164
12    103
13     89
14     20
15     46
16     35
18     34
19     31
20     39
21     33
22     13
23     32
24     28
25     30
26     30
27     31
Name: height, dtype: int64
length
length
0   

### FFD - Scenario 3: m = 60

In [32]:
# Perform discretization (FFD, m = 60) and time it
m = 60
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds for a single run
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  60 : 0.07576704025268555


In [33]:
## OUTPUT: dtypes of the discretized frame
data_ffd3.info()
## OUTPUT: number of instances that fall in each interval of data_ffd3
for feature in num_col:
    print(feature)
    bin_counts = data_ffd3.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0     341
1      58
2      64
3     271
4     351
5     831
6     940
7     903
8     630
9     326
10    164
11    103
12     89
13     66
14     35
15     65
16     72
17     45
18     58
19     61
Name: height, dtype: int64
length
length
0     126
1      73
2      45
3      59
4     112
     ... 
77  

### FFD - Scenario 4: m = 100

In [34]:
# Perform discretization (FFD, m = 100) and time it
m = 100
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter() # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds for a single run
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  100 : 0.07030320167541504


In [35]:
## OUTPUT: dtypes of the discretized frame
data_ffd4.info()

## OUTPUT: number of instances that fall in each interval of data_ffd4
for feature in num_col:
    print(feature)
    bin_counts = data_ffd4.groupby(feature)[feature].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   height    5473 non-null   int64   
 1   length    5473 non-null   int64   
 2   area      5473 non-null   int64   
 3   eccen     5473 non-null   int64   
 4   p_black   5473 non-null   int64   
 5   p_and     5473 non-null   int64   
 6   mean_tr   5473 non-null   int64   
 7   blacpix   5473 non-null   int64   
 8   blackand  5473 non-null   int64   
 9   wb_trans  5473 non-null   int64   
 10  class     5473 non-null   category
dtypes: category(1), int64(10)
memory usage: 433.3 KB
height
height
0     341
1      58
2     335
3     351
4     831
5     940
6     903
7     630
8     326
9     164
10    103
11    109
12     91
13     94
14    106
15     91
Name: height, dtype: int64
length
length
0     126
1      73
2     104
3     112
4     105
5     103
6     169
7     126
8     118
9     113
10  

### Export discretized datasets

In [36]:
# EWD datasets: one CSV per equal-width scenario, written to the working directory.
# NOTE(review): to_csv() runs with its default index=True, so every file gets an
# extra leading index column - confirm that downstream readers expect it.
ewd_exports = {
    'pageblock_ewd1.csv': data_ewd1, # k=4
    'pageblock_ewd2.csv': data_ewd2, # k=7
    'pageblock_ewd3.csv': data_ewd3, # k=10
}
for csv_path, frame in ewd_exports.items():
    frame.to_csv(csv_path)

In [37]:
# EFD datasets: one CSV per equal-frequency scenario, written to the working directory.
# NOTE(review): to_csv() runs with its default index=True, so every file gets an
# extra leading index column - confirm that downstream readers expect it.
efd_exports = {
    'pageblock_efd1.csv': data_efd1, # k=4
    'pageblock_efd2.csv': data_efd2, # k=7
    'pageblock_efd3.csv': data_efd3, # k=10
}
for csv_path, frame in efd_exports.items():
    frame.to_csv(csv_path)


In [38]:
# FFD datasets: one CSV per fixed-frequency scenario, written to the working directory.
# NOTE(review): to_csv() runs with its default index=True, so every file gets an
# extra leading index column - confirm that downstream readers expect it.
ffd_exports = {
    'pageblock_ffd1.csv': data_ffd1, # m=10
    'pageblock_ffd2.csv': data_ffd2, # m=30
    'pageblock_ffd3.csv': data_ffd3, # m=60
    'pageblock_ffd4.csv': data_ffd4, # m=100
}
for csv_path, frame in ffd_exports.items():
    frame.to_csv(csv_path)