# Unsupervised discretization
Dataset: pima

Updated at: 26 June 22

By: Sam

### About Dataset
The dataset has one target (dependent) variable and 8 attributes (TYNECKI, 2018): 
- pregnancies, 
- OGTT(Oral Glucose Tolerance Test), 
- blood pressure, 
- skin thickness, 
- insulin, 
- BMI(Body Mass Index), 
- age, 
- diabetes pedigree function

In [1]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit

In [2]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised

In [3]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [4]:
# Load dataset (the CSV path is relative to the current working directory)
data = pd.read_csv('clean_pima.csv')

In [5]:
# Summarise the dataset: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Store the target as a categorical column so that it is not picked up
# when the numeric attributes are selected for discretization
data['Outcome'] = data['Outcome'].astype('category')

In [7]:
# Names of the numeric attributes to discretize, as a plain Python list
num_col = data.select_dtypes(include=np.number).columns.tolist()

In [8]:
# Display the numeric column names that will be discretized
num_col

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

## Equal Width Discretization

In [9]:
# Define function. Inputs: dataset, number of bins (k)

def ewd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-width bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. It must contain every column named in the
        module-level list ``num_col``.
    k : int
        Desired number of equal-width intervals / bins per variable.

    Returns
    -------
    The dataset after discretization, as returned by the transformer's
    ``transform`` method.

    Notes
    -----
    The transformer is feature_engine's ``EqualWidthDiscretiser`` (imported
    as ``ewd``), configured with:

    bins : int, default=10
        Desired number of equal width intervals / bins.
    variables : list
        The list of numerical variables to transform. If None, the
        discretiser will automatically select all numerical type variables.
    return_boundaries : bool, default=False
        Whether the output should be the interval boundaries. If True, it
        returns the interval boundaries. If False, it returns integers.

    ``return_object`` (bool, default=False) is left at its default; it
    controls whether the discrete values are returned as numeric or as
    object.
    """
    # Set up the discretisation transformer; integer bin labels are requested
    # explicitly through return_boundaries=False.
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    # Fit the transformer. After fitting, ``discretiser.binner_dict_`` stores
    # the interval limits identified for each variable.
    discretiser.fit(data)
    # Transform the data and return the dataset after discretization
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [10]:
# Perform discretization
k = 4
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start  # elapsed seconds
print("Discretization time, EWD, k = ", k, ":", ewd_t)  # Total time execution

Discretization time, EWD, k =  4 : 0.02778482437133789


In [11]:
# OUTPUT: preview the first rows of the equal-width result for k = 4
data_ewd1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,2,2,1,0,2,0,1,1
1,0,1,2,1,0,1,0,0,0
2,1,3,2,0,0,1,1,0,1
3,0,1,2,0,0,1,0,0,0
4,0,2,1,1,0,2,3,0,1


In [12]:
## OUTPUT: number of instances falling in each interval of data_ewd1
# Equal-width bins share the same width, not the same population, so the
# counts printed per bin are expected to differ.
for column in num_col:
    print(column)
    bin_counts = data_ewd1.groupby(column)[column].count()
    print(bin_counts)

Pregnancies
Pregnancies
0    492
1    190
2     72
3     14
Name: Pregnancies, dtype: int64
Glucose
Glucose
0      6
1    191
2    428
3    143
Name: Glucose, dtype: int64
BloodPressure
BloodPressure
0     38
1    121
2    571
3     38
Name: BloodPressure, dtype: int64
SkinThickness
SkinThickness
0    411
1    345
2     11
3      1
Name: SkinThickness, dtype: int64
Insulin
Insulin
0    693
1     57
2     15
3      3
Name: Insulin, dtype: int64
BMI
BMI
0     11
1    439
2    310
3      8
Name: BMI, dtype: int64
DiabetesPedigreeFunction
DiabetesPedigreeFunction
0    598
1    145
2     20
3      5
Name: DiabetesPedigreeFunction, dtype: int64
Age
Age
0    514
1    181
2     64
3      9
Name: Age, dtype: int64


### EWD - Scenario 2: k = 7

In [13]:
# Perform discretization
k = 7
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start  # elapsed seconds
print("Discretization time, EWD, k = ", k, ":", ewd_t)  # Total time execution

Discretization time, EWD, k =  7 : 0.029242753982543945


In [14]:
# OUTPUT: preview the first rows of the equal-width result for k = 7
data_ewd2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,5,4,2,0,3,1,3,1
1,0,2,3,2,0,2,0,1,0
2,3,6,3,0,0,2,1,1,1
3,0,3,3,1,0,2,0,0,0
4,0,4,2,2,1,4,6,1,1


In [15]:
## OUTPUT: number of instances falling in each interval of data_ewd2
# Equal-width bins share the same width, not the same population, so the
# counts printed per bin are expected to differ.
for column in num_col:
    print(column)
    bin_counts = data_ewd2.groupby(column)[column].count()
    print(bin_counts)

Pregnancies
Pregnancies
0    349
1    143
2    152
3     66
4     44
5     12
6      2
Name: Pregnancies, dtype: int64
Glucose
Glucose
0      5
1      2
2     72
3    273
4    234
5    113
6     69
Name: Glucose, dtype: int64
BloodPressure
BloodPressure
0     35
1      3
2     37
3    243
4    365
5     75
6     10
Name: BloodPressure, dtype: int64
SkinThickness
SkinThickness
0    266
1    220
2    234
3     45
4      2
6      1
Name: SkinThickness, dtype: int64
Insulin
Insulin
0    566
1    143
2     35
3     12
4      9
5      1
6      2
Name: Insulin, dtype: int64
BMI
BMI
0     11
1      5
2    236
3    376
4    126
5     12
6      2
Name: BMI, dtype: int64
DiabetesPedigreeFunction
DiabetesPedigreeFunction
0    423
1    228
2     73
3     31
4      7
5      2
6      4
Name: DiabetesPedigreeFunction, dtype: int64
Age
Age
0    396
1    153
2    114
3     55
4     33
5     16
6      1
Name: Age, dtype: int64


### EWD - Scenario 3: k = 10

In [16]:
# Perform discretization
k = 10
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start  # elapsed seconds
print("Discretization time, EWD, k = ", k, ":", ewd_t)  # Total time execution

Discretization time, EWD, k =  10 : 0.030328750610351562


In [17]:
# OUTPUT: preview the first rows of the equal-width result for k = 10
data_ewd3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,3,7,5,3,0,5,2,4,1
1,0,4,5,2,0,3,1,1,0
2,4,9,5,0,0,3,2,1,1
3,0,4,5,2,1,4,0,0,0
4,0,6,3,3,1,6,9,1,1


In [18]:
## OUTPUT: number of instances falling in each interval of data_ewd3
# Equal-width bins share the same width, not the same population, so the
# counts printed per bin are expected to differ.
for column in num_col:
    print(column)
    bin_counts = data_ewd3.groupby(column)[column].count()
    print(bin_counts)

Pregnancies
Pregnancies
0    246
1    178
2    125
3     50
4     83
5     52
6     11
7     19
8      3
9      1
Name: Pregnancies, dtype: int64
Glucose
Glucose
0      5
2      4
3     32
4    156
5    211
6    163
7     95
8     56
9     46
Name: Glucose, dtype: int64
BloodPressure
BloodPressure
0     35
1      1
2      2
3     13
4    108
5    260
6    243
7     87
8     14
9      5
Name: BloodPressure, dtype: int64
SkinThickness
SkinThickness
0    231
1    107
2    165
3    175
4     78
5      9
6      2
9      1
Name: SkinThickness, dtype: int64
Insulin
Insulin
0    487
1    155
2     70
3     30
4      8
5      9
6      5
7      1
8      2
9      1
Name: Insulin, dtype: int64
BMI
BMI
0     11
2     15
3    156
4    268
5    224
6     78
7     12
8      3
9      1
Name: BMI, dtype: int64
DiabetesPedigreeFunction
DiabetesPedigreeFunction
0    318
1    206
2    136
3     58
4     25
5     15
6      3
7      3
8      1
9      3
Name: DiabetesPedigreeFunction, dtype: int64
Age
Age
0  

## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameter:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [19]:
def efd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-frequency bins.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. It must contain every column named in the
        module-level list ``num_col``.
    k : int
        Desired number of equal-frequency intervals, i.e. the number of
        quantiles into which each variable is divided.

    Returns
    -------
    The dataset after discretization, as returned by the transformer's
    ``transform`` method.
    """
    # Set up the discretisation transformer (feature_engine's
    # EqualFrequencyDiscretiser, imported as ``efd``)
    discretiser = efd(q=k, variables=num_col)
    # Fit the transformer. After fitting, ``discretiser.binner_dict_`` stores
    # the interval limits identified for each variable.
    discretiser.fit(data)
    # Transform the data and return the discretized dataset
    return discretiser.transform(data)

### The function efd_disc defined above takes the dataset and the number of intervals (k) as inputs

### EFD - Scenario 1: k = 4

In [20]:
# Perform discretization
k = 4
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start  # elapsed seconds
print("Discretization time, EFD, k = ", k, ":", efd_t)  # Total time execution

Discretization time, EFD, k =  4 : 0.0343937873840332


In [21]:
## OUTPUT: number of instances falling in each interval of data_efd1
for column in num_col:
    print(column)
    bin_counts = data_efd1.groupby(column)[column].count()
    print(bin_counts)

Pregnancies
Pregnancies
0    246
1    178
2    175
3    169
Name: Pregnancies, dtype: int64
Glucose
Glucose
0    197
1    194
2    185
3    192
Name: Glucose, dtype: int64
BloodPressure
BloodPressure
0    193
1    226
2    184
3    165
Name: BloodPressure, dtype: int64
SkinThickness
SkinThickness
0    399
1    181
2    188
Name: SkinThickness, dtype: int64
Insulin
Insulin
0    384
1    192
2    192
Name: Insulin, dtype: int64
BMI
BMI
0    194
1    192
2    194
3    188
Name: BMI, dtype: int64
DiabetesPedigreeFunction
DiabetesPedigreeFunction
0    192
1    192
2    192
3    192
Name: DiabetesPedigreeFunction, dtype: int64
Age
Age
0    219
1    177
2    200
3    172
Name: Age, dtype: int64


### EFD - Scenario 2: k = 7

In [22]:
# Perform discretization
k = 7
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start  # elapsed seconds
print("Discretization time, EFD, k = ", k, ":", efd_t)  # Total time execution

Discretization time, EFD, k =  7 : 0.03231310844421387


In [23]:
## OUTPUT: column summary of the equal-frequency result for k = 7
data_efd2.info()
## OUTPUT: number of instances falling in each interval of data_efd2
for column in num_col:
    print(column)
    bin_counts = data_efd2.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0    246
1    103
2    143
3     57
4    133
5     86
Name: Pregnancies, dtype: int64
Glucose
Glucose
0    115
1    108
2    111
3    112
4    104
5    114
6    104
Name: Glucose, dtype: int64
BloodPressu

### EFD - Scenario 3: k = 10

In [24]:
# Perform discretization
k = 10
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start  # elapsed seconds
print("Discretization time, EFD, k = ", k, ":", efd_t)  # Total time execution

Discretization time, EFD, k =  10 : 0.030271053314208984


In [25]:
## OUTPUT: column summary of the equal-frequency result for k = 10
data_efd3.info()
## OUTPUT: number of instances falling in each interval of data_efd3
for column in num_col:
    print(column)
    bin_counts = data_efd3.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0    246
1    103
2     75
3     68
4     57
5     95
6     66
7     58
Name: Pregnancies, dtype: int64
Glucose
Glucose
0    79
1    81
2    76
3    78
4    77
5    80
6    67
7    82
8    72
9    76
Name

## Fixed Frequency Discretization - FFD

### Define function ffd_disc: a modification of the input of function efd_disc
Inputs include the dataset and the interval frequency (m)

In [26]:
def ffd_disc(data, m):
    """Discretize the numeric attributes of ``data`` with fixed-frequency bins.

    The number of bins is derived from the requested interval frequency as
    ``round(n / m)``, where ``n`` is the number of rows, and the data are
    then discretized with the equal-frequency transformer (``efd``).

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize. It must contain every column named in the
        module-level list ``num_col``.
    m : int or float
        Interval frequency used to derive the number of bins. Must be
        positive.

    Returns
    -------
    The dataset after discretization, as returned by the transformer's
    ``transform`` method.

    Raises
    ------
    ValueError
        If ``m`` is not positive.
    """
    if m <= 0:
        raise ValueError(f"m must be a positive interval frequency, got {m!r}")
    n = len(data)
    # number of bins = n/m; keep at least one bin so that a large m
    # (m > 2n rounds to zero) does not request zero quantiles.
    n_bins = max(1, round(n / m))
    # Set up the discretisation transformer
    discretiser = efd(q=n_bins, variables=num_col)
    # Fit the transformer. After fitting, ``discretiser.binner_dict_`` stores
    # the interval limits identified for each variable.
    discretiser.fit(data)
    # Transform the data and return the discretized dataset
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [27]:
# Perform discretization
m = 10
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start  # elapsed seconds
print("Discretization time, FFD,  m = ", m, ":", ffd_t)  # Total time execution

Discretization time, FFD,  m =  10 : 0.04053497314453125


In [28]:
## OUTPUT: column summary of the fixed-frequency result for m = 10
data_ffd1.info()
## OUTPUT: number of instances falling in each interval of data_ffd1
for column in num_col:
    print(column)
    bin_counts = data_ffd1.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0     246
1     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
12      9
13     10
14      4
Name: Pregnancies, dtype: int64
Glucose
Glucose
0     10
1     1

### FFD - Scenario 2: m = 30

In [29]:
# Perform discretization
m = 30
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start  # elapsed seconds
# The label reads FFD: this cell times ffd_disc, not the EFD scenarios.
print("Discretization time, FFD, m = ", m, ":", ffd_t)  # Total time execution

Discretization time, EFD, m =  30 : 0.0338740348815918


In [30]:
## OUTPUT: column summary of the fixed-frequency result for m = 30
data_ffd2.info()
## OUTPUT: number of instances falling in each interval of data_ffd2
for column in num_col:
    print(column)
    bin_counts = data_ffd2.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0     246
1     103
2      75
3      68
4      57
5      50
6      45
7      38
8      28
9      35
10     23
Name: Pregnancies, dtype: int64
Glucose
Glucose
0     30
1     32
2     27
3     35
4     36
5

### FFD - Scenario 3: m = 60

In [31]:
# Perform discretization
m = 60
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start  # elapsed seconds
print("Discretization time, FFD, m = ", m, ":", ffd_t)  # Total time execution

Discretization time, FFD, m =  60 : 0.028325796127319336


In [32]:
## OUTPUT: column summary of the fixed-frequency result for m = 60
data_ffd3.info()
## OUTPUT: number of instances falling in each interval of data_ffd3
for column in num_col:
    print(column)
    bin_counts = data_ffd3.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0    246
1    103
2     75
3     68
4     57
5     50
6     83
7     28
8     58
Name: Pregnancies, dtype: int64
Glucose
Glucose
0     62
1     62
2     56
3     65
4     57
5     61
6     56
7     61
8  

### FFD - Scenario 4: m = 100

In [33]:
# Perform discretization
m = 100
# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is wall-clock time and can jump if the system clock changes.
start = time.perf_counter()  # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start  # elapsed seconds
print("Discretization time, FFD, m = ", m, ":", ffd_t)  # Total time execution

Discretization time, FFD, m =  100 : 0.031304121017456055


In [34]:
## OUTPUT: column summary of the fixed-frequency result for m = 100
data_ffd4.info()

## OUTPUT: number of instances falling in each interval of data_ffd4
for column in num_col:
    print(column)
    bin_counts = data_ffd4.groupby(column)[column].count()
    print(bin_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Pregnancies               768 non-null    int64   
 1   Glucose                   768 non-null    int64   
 2   BloodPressure             768 non-null    int64   
 3   SkinThickness             768 non-null    int64   
 4   Insulin                   768 non-null    int64   
 5   BMI                       768 non-null    int64   
 6   DiabetesPedigreeFunction  768 non-null    int64   
 7   Age                       768 non-null    int64   
 8   Outcome                   768 non-null    category
dtypes: category(1), int64(8)
memory usage: 49.0 KB
Pregnancies
Pregnancies
0    246
1    103
2     75
3     68
4    107
5     83
6     86
Name: Pregnancies, dtype: int64
Glucose
Glucose
0     98
1     99
2     92
3    102
4     89
5     96
6    100
7     92
Name: Glucose, dtype:

### Export all discretized datasets

In [35]:
# EWD datasets: one CSV per scenario, written without the index column
ewd_exports = (
    (data_ewd1, 'pima_ewd1.csv'),  # k=4
    (data_ewd2, 'pima_ewd2.csv'),  # k=7
    (data_ewd3, 'pima_ewd3.csv'),  # k=10
)
for frame, path in ewd_exports:
    frame.to_csv(path, index=False)

In [36]:
# EFD datasets: one CSV per scenario, written without the index column
efd_exports = (
    (data_efd1, 'pima_efd1.csv'),  # k=4
    (data_efd2, 'pima_efd2.csv'),  # k=7
    (data_efd3, 'pima_efd3.csv'),  # k=10
)
for frame, path in efd_exports:
    frame.to_csv(path, index=False)


In [37]:
# FFD datasets: one CSV per scenario, written without the index column
ffd_exports = (
    (data_ffd1, 'pima_ffd1.csv'),  # m=10
    (data_ffd2, 'pima_ffd2.csv'),  # m=30
    (data_ffd3, 'pima_ffd3.csv'),  # m=60
    (data_ffd4, 'pima_ffd4.csv'),  # m=100
)
for frame, path in ffd_exports:
    frame.to_csv(path, index=False)