# Unsupervised discretization

Dataset: clean_phoneme (M)

Updated at: 22 June 22

By: Sam

### About Dataset
The raw dataset is in ARFF format and must first be converted to CSV (tool used: https://pulipulichen.github.io/jieba-js/weka/arff2csv/).

Five different attributes were chosen to characterize each vowel: they are the amplitudes of the five first harmonics AHi, normalised by the total energy Ene (integrated on all the frequencies): AHi/Ene. The phonemes are transcribed as follows: sh as in she, dcl as in dark, iy as the vowel in she, aa as the vowel in dark, and ao as the first vowel in water.
=> All attributes are numeric.

The aim of the present database is to distinguish between nasal and oral vowels. There are thus two different classes:
- Class 0 : Nasals
- Class 1 : Orals

In [26]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit

In [27]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised

In [28]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [29]:
# Load dataset: the CSV converted from the raw ARFF file, read from the
# current working directory.
data = pd.read_csv('clean_phoneme.csv')

In [30]:
# Column names, non-null counts and dtypes of the freshly loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      5404 non-null   int64  
 1   V1      5404 non-null   float64
 2   V2      5404 non-null   float64
 3   V3      5404 non-null   float64
 4   V4      5404 non-null   float64
 5   V5      5404 non-null   float64
 6   Class   5404 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 295.7 KB


In [31]:
# Treat the outcome as a categorical label so it is not picked up
# as a numeric attribute later on.
data['Class'] = data['Class'].astype('category')

In [38]:
# Remove the row identifier; it is not an attribute to discretize.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'id' is gone.
data = data.drop(columns='id', errors='ignore')

In [39]:
# Names of the numeric attributes to discretize, as a plain Python list
num_col = data.select_dtypes(include=np.number).columns.tolist()

In [40]:
# Show which columns were selected for discretization
num_col 

['V1', 'V2', 'V3', 'V4', 'V5']

## Equal Width Discretization

In [41]:
# Define function. Inputs: dataset, number of bins (k)

def ewd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-width bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. The transformer is configured with the
        notebook-level list ``num_col``, so those columns must be present.
    k : int
        Desired number of equal width intervals / bins per variable.

    Returns
    -------
    pandas.DataFrame
        Result of the fitted transformer's ``transform`` call. Because
        ``return_boundaries=False`` is used, the discretized columns contain
        integers rather than interval boundaries.

    Notes
    -----
    Depends on the global ``num_col``; run the cell that defines it first.
    """
    # Named `discretiser` so the local does not shadow this function's own name.
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    # Fit the transformer; the interval limits found for each variable are
    # stored in `discretiser.binner_dict_`.
    discretiser.fit(data)
    # Transform the data and return the dataset after discretization.
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [42]:
# Perform discretization (EWD, k = 4) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 4
start = time.perf_counter() # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  4 : 0.028027057647705078


In [43]:
# OUTPUT: preview the first rows of the EWD result (k = 4)
data_ewd1.head()

Unnamed: 0,V1,V2,V3,V4,V5,Class
0,2,1,1,1,1,1
1,1,1,2,1,1,1
2,2,1,2,2,1,1
3,1,1,3,0,1,1
4,1,1,3,0,0,1


In [44]:
## OUTPUT: number of instances per interval in data_ewd1.
# Equal-width bins are not expected to hold the same number of observations.
for col in num_col:
    print(col, data_ewd1.groupby(col)[col].count(), sep='\n')

V1
V1
0      39
1    4167
2     847
3     351
Name: V1, dtype: int64
V2
V2
0     189
1    3388
2    1627
3     200
Name: V2, dtype: int64
V3
V3
0     220
1    2400
2    2096
3     688
Name: V3, dtype: int64
V4
V4
0     763
1    2689
2    1541
3     411
Name: V4, dtype: int64
V5
V5
0    1115
1    3534
2     652
3     103
Name: V5, dtype: int64


### EWD - Scenario 2: k = 7

In [45]:
# Perform discretization (EWD, k = 7) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 7
start = time.perf_counter() # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  7 : 0.030812978744506836


In [46]:
# OUTPUT: preview the first rows of the EWD result (k = 7)
data_ewd2.head()

Unnamed: 0,V1,V2,V3,V4,V5,Class
0,3,2,2,2,2,1
1,2,3,3,1,2,1
2,3,2,4,4,3,1
3,2,2,6,1,2,1
4,2,3,6,1,1,1


In [47]:
## OUTPUT: number of instances per interval in data_ewd2.
# Equal-width bins are not expected to hold the same number of observations.
for col in num_col:
    print(col, data_ewd2.groupby(col)[col].count(), sep='\n')

V1
V1
0       8
1     169
2    3385
3     989
4     408
5     351
6      94
Name: V1, dtype: int64
V2
V2
0      42
1     260
2    2497
3    1337
4     932
5     329
6       7
Name: V2, dtype: int64
V3
V3
0      14
1     723
2    1065
3    1605
4    1065
5     756
6     176
Name: V3, dtype: int64
V4
V4
0      45
1     994
2    1724
3    1290
4     809
5     447
6      95
Name: V4, dtype: int64
V5
V5
0     228
1    1689
2    2423
3     554
4     370
5     127
6      13
Name: V5, dtype: int64


### EWD - Scenario 3: k = 10

In [48]:
# Perform discretization (EWD, k = 10) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 10
start = time.perf_counter() # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  10 : 0.027499675750732422


In [49]:
# OUTPUT: preview the first rows of the EWD result (k = 10)
data_ewd3.head()

Unnamed: 0,V1,V2,V3,V4,V5,Class
0,5,3,3,3,3,1
1,3,4,5,2,3,1
2,5,3,6,5,4,1
3,3,4,8,1,3,1
4,3,4,8,1,2,1


In [50]:
## OUTPUT: number of instances per interval in data_ewd3.
# Equal-width bins are not expected to hold the same number of observations.
for col in num_col:
    print(col, data_ewd3.groupby(col)[col].count(), sep='\n')

V1
V1
0       2
1      17
2     159
3    2986
4    1042
5     420
6     292
7     259
8     186
9      41
Name: V1, dtype: int64
V2
V2
0      23
1     163
2     348
3    1862
4    1181
5     762
6     658
7     345
8      59
9       3
Name: V2, dtype: int64
V3
V3
0       2
1      36
2     919
3     618
4    1045
5    1014
6     737
7     629
8     343
9      61
Name: V3, dtype: int64
V4
V4
0      13
1     270
2     908
3    1178
4    1083
5     793
6     564
7     361
8     204
9      30
Name: V4, dtype: int64
V5
V5
0      81
1     456
2    1854
3    1814
4     444
5     337
6     254
7     115
8      41
9       8
Name: V5, dtype: int64


## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameters:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [51]:
def efd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-frequency bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. The transformer is configured with the
        notebook-level list ``num_col``, so those columns must be present.
    k : int
        Desired number of equal frequency intervals, i.e. the number of
        quantiles each variable is divided into.

    Returns
    -------
    pandas.DataFrame
        Result of the fitted transformer's ``transform`` call.

    Notes
    -----
    Depends on the global ``num_col``; run the cell that defines it first.
    """
    # Named `discretiser` so the local does not shadow this function's own name.
    discretiser = efd(q=k, variables=num_col)
    # Fit the transformer; the interval limits identified for each variable
    # are stored in `discretiser.binner_dict_`.
    discretiser.fit(data)
    # Transform the data and return the discretized dataset.
    return discretiser.transform(data)

### Function efd_disc (defined above): inputs are the dataset and the number of intervals (k)

### EFD - Scenario 1: k = 4

In [52]:
# Perform discretization (EFD, k = 4) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 4
start = time.perf_counter() # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  4 : 0.04151320457458496


In [53]:
## OUTPUT: number of instances per interval in data_efd1
for col in num_col:
    print(col, data_efd1.groupby(col)[col].count(), sep='\n')

V1
V1
0    1351
1    1351
2    1351
3    1351
Name: V1, dtype: int64
V2
V2
0    1351
1    1351
2    1351
3    1351
Name: V2, dtype: int64
V3
V3
0    1351
1    1351
2    1351
3    1351
Name: V3, dtype: int64
V4
V4
0    1351
1    1351
2    1351
3    1351
Name: V4, dtype: int64
V5
V5
0    1351
1    1351
2    1351
3    1351
Name: V5, dtype: int64


### EFD - Scenario 2: k = 7

In [54]:
# Perform discretization (EFD, k = 7) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 7
start = time.perf_counter() # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  7 : 0.03835010528564453


In [55]:
## OUTPUT: dtypes and non-null counts of the EFD result (k = 7)
data_efd2.info()
## OUTPUT: number of instances per interval in data_efd2
# NOTE(review): the recorded output shows uneven bin sizes for V5, presumably
# caused by tied values at a quantile edge - confirm.
for col in num_col:
    print(col, data_efd2.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0    772
1    772
2    772
3    772
4    772
5    772
6    772
Name: V1, dtype: int64
V2
V2
0    772
1    772
2    772
3    772
4    772
5    772
6    772
Name: V2, dtype: int64
V3
V3
0    772
1    772
2    772
3    772
4    772
5    772
6    772
Name: V3, dtype: int64
V4
V4
0    772
1    772
2    772
3    772
4    772
5    772
6    772
Name: V4, dtype: int64
V5
V5
0     772
1     772
2     772
3    1265
4     279
5     772
6     772
Name: V5, dtype: int64


### EFD - Scenario 3: k = 10

In [56]:
# Perform discretization (EFD, k = 10) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
k = 10
start = time.perf_counter() # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  10 : 0.04153299331665039


In [57]:
## OUTPUT: dtypes and non-null counts of the EFD result (k = 10)
data_efd3.info()
## OUTPUT: number of instances per interval in data_efd3
for col in num_col:
    print(col, data_efd3.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0    541
1    540
2    540
3    541
4    540
5    540
6    541
7    540
8    540
9    541
Name: V1, dtype: int64
V2
V2
0    541
1    540
2    540
3    541
4    540
5    540
6    541
7    540
8    540
9    541
Name: V2, dtype: int64
V3
V3
0    541
1    540
2    541
3    540
4    540
5    540
6    541
7    540
8    540
9    541
Name: V3, dtype: int64
V4
V4
0    541
1    540
2    540
3    541
4    540
5    540
6    541
7    540
8    540
9    541
Name: V4, dtype: int64
V5
V5
0    541
1    540
2    540
3    541
4    540
5    879
6

## Fixed Frequency Discretization - FFD

### Define function ffd_disc: reuses the EFD discretiser with a modified input
Inputs: dataset, interval frequency (m)

In [58]:
def ffd_disc(data, m):
    """Fixed-frequency discretization: intervals of roughly ``m`` instances each.

    Built on the equal-frequency discretiser by requesting
    ``round(len(data) / m)`` quantile bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. The transformer is configured with the
        notebook-level list ``num_col``, so those columns must be present.
    m : int or float
        Desired interval frequency (number of instances per interval).
        Must be positive.

    Returns
    -------
    pandas.DataFrame
        Result of the fitted transformer's ``transform`` call.

    Raises
    ------
    ValueError
        If ``m`` is zero or negative.

    Notes
    -----
    Depends on the global ``num_col``; run the cell that defines it first.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (m == 0)
    # or a negative quantile count (m < 0).
    if m <= 0:
        raise ValueError(f"interval frequency m must be positive, got {m!r}")
    n = len(data)
    # number of bins = n/m; never request fewer than one bin, which the
    # rounding would otherwise do when m is much larger than n.
    n_bins = max(1, round(n / m))
    # Named `discretiser` so the local does not shadow this function's own name.
    discretiser = efd(q=n_bins, variables=num_col)
    # Fit the transformer; the interval limits identified for each variable
    # are stored in `discretiser.binner_dict_`.
    discretiser.fit(data)
    # Transform the data and return the discretized dataset.
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [59]:
# Perform discretization (FFD, m = 10) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
m = 10
start = time.perf_counter() # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD,  m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD,  m =  10 : 0.1317739486694336


In [60]:
## OUTPUT: dtypes and non-null counts of the FFD result (m = 10)
data_ffd1.info()
## OUTPUT: number of instances per interval in data_ffd1
for col in num_col:
    print(col, data_ffd1.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0      11
1      10
2      10
3      10
4      10
       ..
535    10
536    10
537    10
538    10
539    11
Name: V1, Length: 540, dtype: int64
V2
V2
0      11
1      10
2      10
3      10
4      10
       ..
535    10
536    10
537    10
538    10
539    11
Name: V2, Length: 540, dtype: int64
V3
V3
0      11
1      10
2      10
3      10
4      10
       ..
535    10
536    10
537    10
538    10
539    11
Name: V3, Length: 540, dtype: int64
V4
V4
0      11
1      10
2      10
3      10
4      10
       ..
535    10
536  

### FFD - Scenario 2: m = 30

In [61]:
# Perform discretization (FFD, m = 30) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
m = 30
start = time.perf_counter() # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
# Label corrected from "EFD" to "FFD": this cell times ffd_disc.
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, EFD, m =  30 : 0.06967020034790039


In [62]:
## OUTPUT: dtypes and non-null counts of the FFD result (m = 30)
data_ffd2.info()
## OUTPUT: number of instances per interval in data_ffd2
for col in num_col:
    print(col, data_ffd2.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0      31
1      30
2      30
3      30
4      30
       ..
175    30
176    30
177    30
178    30
179    31
Name: V1, Length: 180, dtype: int64
V2
V2
0      31
1      30
2      30
3      30
4      30
       ..
175    30
176    30
177    30
178    30
179    31
Name: V2, Length: 180, dtype: int64
V3
V3
0      31
1      30
2      30
3      30
4      30
       ..
175    30
176    30
177    30
178    30
179    31
Name: V3, Length: 180, dtype: int64
V4
V4
0      31
1      30
2      30
3      30
4      30
       ..
175    30
176  

### FFD - Scenario 3: m = 60

In [63]:
# Perform discretization (FFD, m = 60) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
m = 60
start = time.perf_counter() # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  60 : 0.05823373794555664


In [64]:
## OUTPUT: dtypes and non-null counts of the FFD result (m = 60)
data_ffd3.info()
## OUTPUT: number of instances per interval in data_ffd3
for col in num_col:
    print(col, data_ffd3.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0     61
1     60
2     60
3     60
4     60
      ..
85    60
86    60
87    60
88    60
89    61
Name: V1, Length: 90, dtype: int64
V2
V2
0     61
1     60
2     60
3     60
4     60
      ..
85    60
86    60
87    60
88    60
89    61
Name: V2, Length: 90, dtype: int64
V3
V3
0     61
1     60
2     60
3     60
4     60
      ..
85    60
86    60
87    60
88    60
89    61
Name: V3, Length: 90, dtype: int64
V4
V4
0     61
1     60
2     60
3     60
4     60
      ..
85    60
86    60
87    60
88    60
89    61
Name: V4, Le

### FFD - Scenario 4: m = 100

In [65]:
# Perform discretization (FFD, m = 100) and time it.
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() is wall-clock time and can be coarse or be adjusted.
m = 100
start = time.perf_counter() # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  100 : 0.054502248764038086


In [66]:
## OUTPUT: dtypes and non-null counts of the FFD result (m = 100)
data_ffd4.info()

## OUTPUT: number of instances per interval in data_ffd4
for col in num_col:
    print(col, data_ffd4.groupby(col)[col].count(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      5404 non-null   int64   
 1   V2      5404 non-null   int64   
 2   V3      5404 non-null   int64   
 3   V4      5404 non-null   int64   
 4   V5      5404 non-null   int64   
 5   Class   5404 non-null   category
dtypes: category(1), int64(5)
memory usage: 216.6 KB
V1
V1
0     101
1     100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
11    100
12    100
13    100
14    100
15    100
16    100
17    101
18    100
19    100
20    100
21    100
22    100
23    100
24    100
25    100
26    100
27    100
28    100
29    101
30     99
31    100
32    100
33    100
34    100
35    101
36    100
37    100
38    100
39    101
40     99
41    100
42    100
43    100
44    100
45    100
46    100
47    100
48    100
49    100
50    100
51    100
52    100
5

### Export discretized datasets

In [67]:
# EWD datasets: one CSV file per scenario
for csv_path, frame in (
    ('phoneme_ewd1.csv', data_ewd1),  # k=4
    ('phoneme_ewd2.csv', data_ewd2),  # k=7
    ('phoneme_ewd3.csv', data_ewd3),  # k=10
):
    frame.to_csv(csv_path)

In [68]:
# EFD datasets: one CSV file per scenario
for csv_path, frame in (
    ('phoneme_efd1.csv', data_efd1),  # k=4
    ('phoneme_efd2.csv', data_efd2),  # k=7
    ('phoneme_efd3.csv', data_efd3),  # k=10
):
    frame.to_csv(csv_path)


In [69]:
# FFD datasets: one CSV file per scenario
for csv_path, frame in (
    ('phoneme_ffd1.csv', data_ffd1),  # m=10
    ('phoneme_ffd2.csv', data_ffd2),  # m=30
    ('phoneme_ffd3.csv', data_ffd3),  # m=60
    ('phoneme_ffd4.csv', data_ffd4),  # m=100
):
    frame.to_csv(csv_path)