# Unsupervised discretization
Dataset: australia (Small, Binary)

Updated at: 26 June 22

By: Sam

### About Dataset
*Continuous attributes:* 6

- A2:	continuous.
- A3:	continuous.
- A7:	continuous.
- A10: continuous.
- A13: continuous.
- A14: continuous.

*Categorical attributes:*
- A1:	0,1    CATEGORICAL a,b
- A4:	1,2,3         CATEGORICAL p,g,gg
- A5:  1, 2,3,4,5, 6,7,8,9,10,11,12,13,14    CATEGORICAL ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x 
- A6:	 1, 2,3, 4,5,6,7,8,9    CATEGORICAL ff,dd,j,bb,v,n,o,h,z 
- A8:	1, 0       CATEGORICAL t, f.
- A9: 1, 0	    CATEGORICAL t, f.
- A11:  1, 0	    CATEGORICAL t, f.
- A12:    1, 2, 3    CATEGORICAL s, g, p 

*Label*
A15:   1,2 +,- (class attribute)

In [1]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit
from collections import Counter

In [2]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised
from sklearn.preprocessing import OrdinalEncoder

In [3]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [4]:
# Load dataset
data = pd.read_csv('clean_australia.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      690 non-null    int64  
 1   A2      690 non-null    float64
 2   A3      690 non-null    float64
 3   A4      690 non-null    int64  
 4   A5      690 non-null    int64  
 5   A6      690 non-null    int64  
 6   A7      690 non-null    float64
 7   A8      690 non-null    int64  
 8   A9      690 non-null    int64  
 9   A10     690 non-null    int64  
 10  A11     690 non-null    int64  
 11  A12     690 non-null    int64  
 12  A13     690 non-null    int64  
 13  A14     690 non-null    int64  
 14  label   690 non-null    int64  
dtypes: float64(3), int64(12)
memory usage: 81.0 KB


In [6]:
# List of continuous feature to discretize
num_col = ["A2", "A3", "A7", "A10", "A13", "A14"]

In [7]:
# Categorical features: every column that is neither the label nor continuous.
# Index.difference returns the names sorted, which fixes the column order below.
cat_col = data.columns.drop('label').difference(num_col).tolist()

In [8]:
cat_col

['A1', 'A11', 'A12', 'A4', 'A5', 'A6', 'A8', 'A9']

In [9]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,label
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
3,0,21.67,11.5,1,5,3,0.0,1,1,11,1,2,0,1,1
4,1,20.17,8.17,2,6,4,1.96,1,1,14,0,2,60,159,1


### Ordinal encoding for categorical attributes (do this once)

In [10]:
# Before encoding
for col in cat_col:
    print(col, Counter(data[col]))

A1 Counter({1: 468, 0: 222})
A11 Counter({0: 374, 1: 316})
A12 Counter({2: 625, 1: 57, 3: 8})
A4 Counter({2: 525, 1: 163, 3: 2})
A5 Counter({8: 146, 11: 78, 9: 64, 3: 59, 6: 54, 1: 53, 4: 51, 13: 41, 14: 38, 7: 38, 2: 30, 10: 25, 5: 10, 12: 3})
A6 Counter({4: 408, 8: 138, 5: 59, 1: 57, 3: 8, 9: 8, 2: 6, 7: 6})
A8 Counter({1: 361, 0: 329})
A9 Counter({0: 395, 1: 295})


In [11]:
# Ordinal encoding for categorical attributes
## define ordinal encoding
enc = OrdinalEncoder()
## transform data
result_encode = enc.fit_transform(data[cat_col])
## convert result array to dataframe, reusing the source index so that the
## later column-wise concat with `data` lines the rows up by label
df_aus_encode = pd.DataFrame(result_encode, columns = cat_col, index = data.index)

In [12]:
# Check output after ordinal encoding
df_aus_encode.head()

Unnamed: 0,A1,A11,A12,A4,A5,A6,A8,A9
0,1.0,1.0,1.0,1.0,3.0,3.0,0.0,0.0
1,0.0,0.0,1.0,1.0,7.0,3.0,0.0,0.0
2,0.0,1.0,1.0,0.0,3.0,3.0,0.0,0.0
3,0.0,1.0,1.0,0.0,4.0,2.0,1.0,1.0
4,1.0,0.0,1.0,1.0,5.0,3.0,1.0,1.0


In [13]:
df_aus_encode = df_aus_encode.astype(int)

In [14]:
# Check output after ordinal encoding
for col in cat_col: 
    print(col, Counter(df_aus_encode[col]))

A1 Counter({1: 468, 0: 222})
A11 Counter({0: 374, 1: 316})
A12 Counter({1: 625, 0: 57, 2: 8})
A4 Counter({1: 525, 0: 163, 2: 2})
A5 Counter({7: 146, 10: 78, 8: 64, 2: 59, 5: 54, 0: 53, 3: 51, 12: 41, 13: 38, 6: 38, 1: 30, 9: 25, 4: 10, 11: 3})
A6 Counter({3: 408, 6: 138, 4: 59, 0: 57, 2: 8, 7: 8, 1: 6, 5: 6})
A8 Counter({1: 361, 0: 329})
A9 Counter({0: 395, 1: 295})


In [15]:
# Merge numeric, encoded categorical, label
data = pd.concat([data[num_col], df_aus_encode, data['label']], axis="columns")

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A2      690 non-null    float64
 1   A3      690 non-null    float64
 2   A7      690 non-null    float64
 3   A10     690 non-null    int64  
 4   A13     690 non-null    int64  
 5   A14     690 non-null    int64  
 6   A1      690 non-null    int64  
 7   A11     690 non-null    int64  
 8   A12     690 non-null    int64  
 9   A4      690 non-null    int64  
 10  A5      690 non-null    int64  
 11  A6      690 non-null    int64  
 12  A8      690 non-null    int64  
 13  A9      690 non-null    int64  
 14  label   690 non-null    int64  
dtypes: float64(3), int64(12)
memory usage: 81.0 KB


In [17]:
# For categorical data: convert from numeric to categorical
for col in cat_col:
    data[col] = pd.Categorical(data[col]) # convert interger to categorical data

In [18]:
data['label'] = pd.Categorical(data['label']) 

## Equal Width Discretization

In [19]:
# Define function: inputs are the dataset and the number of bins (k)

def ewd_disc(data, k):
    """Discretise the continuous columns into ``k`` equal-width bins.

    Builds an ``EqualWidthDiscretiser`` (imported as ``ewd``) over the
    notebook-level ``num_col`` list, fits it on ``data`` and returns the
    transformed frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretise; must contain every column listed in ``num_col``.
    k : int
        Desired number of equal width intervals / bins.

    Returns
    -------
    The frame returned by the discretiser's ``transform``: the ``num_col``
    columns hold integer bin indices (``return_boundaries=False``), the other
    columns are passed through.

    Notes
    -----
    Discretiser parameters, for reference:

    bins : int, default=10
        Desired number of equal width intervals / bins.
    variables : list
        The list of numerical variables to transform. If None, the
        discretiser will automatically select all numerical type variables.
    return_object : bool, default=False
        Whether the numbers in the discrete variable should be returned as
        numeric or as object.
    return_boundaries : bool, default=False
        If True, the output is the interval boundaries; if False, integers.
    """
    ## set up the discretisation transformer
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    ## fit the transformer; afterwards discretiser.binner_dict_ holds the
    ## interval limits identified for each variable
    discretiser.fit(data)
    ## transform the data and return the dataset after discretization
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [20]:
# Perform discretization
k = 4
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  4 : 0.022511005401611328


In [21]:
# OUTPUT:
data_ewd1.head()

Unnamed: 0,A2,A3,A7,A10,A13,A14,A1,A11,A12,A4,A5,A6,A8,A9,label
0,0,1,0,0,0,0,1,1,1,1,3,3,0,0,0
1,0,0,0,0,0,0,0,0,1,1,7,3,0,0,0
2,0,0,0,0,0,0,0,1,1,0,3,3,0,0,0
3,0,1,0,0,0,0,0,1,1,0,4,2,1,1,1
4,0,1,0,0,0,0,1,0,1,1,5,3,1,1,1


In [22]:
## OUTPUT: Check number of instance in each interval in the data_ewd
# With equal width discretisation, each bin does not necessarily 
# contain the same number of observations.
for col in num_col:
    print(col)
    print(data_ewd1.groupby(col)[col].count())

A2
A2
0    379
1    226
2     72
3     13
Name: A2, dtype: int64
A3
A3
0    515
1    141
2     26
3      8
Name: A3, dtype: int64
A7
A7
0    639
1     40
2     10
3      1
Name: A7, dtype: int64
A10
A10
0    682
1      6
2      1
3      1
Name: A10, dtype: int64
A13
A13
0    666
1     22
2      1
3      1
Name: A13, dtype: int64
A14
A14
0    685
1      3
2      1
3      1
Name: A14, dtype: int64


### EWD - Scenario 2: k = 7

In [23]:
# Perform discretization
k = 7
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  7 : 0.02955913543701172


In [24]:
# OUTPUT:
data_ewd2.head()

Unnamed: 0,A2,A3,A7,A10,A13,A14,A1,A11,A12,A4,A5,A6,A8,A9,label
0,0,2,0,0,0,0,1,1,1,1,3,3,0,0,0
1,0,1,0,0,0,0,0,0,1,1,7,3,0,0,0
2,1,0,0,0,0,0,0,1,1,0,3,3,0,0,0
3,0,2,0,1,0,0,0,1,1,0,4,2,1,1,1
4,0,2,0,1,0,0,1,0,1,1,5,3,1,1,1


In [25]:
## OUTPUT: Check number of instance in each interval in the data_ewd
# With equal width discretisation, each bin does not necessarily 
# contain the same number of observations.
for col in num_col:
    print(col)
    print(data_ewd2.groupby(col)[col].count())

A2
A2
0    206
1    227
2    145
3     59
4     37
5     11
6      5
Name: A2, dtype: int64
A3
A3
0    414
1    115
2     97
3     44
4     11
5      4
6      5
Name: A3, dtype: int64
A7
A7
0    571
1     78
2     19
3     17
4      4
6      1
Name: A7, dtype: int64
A10
A10
0    631
1     54
2      3
4      1
6      1
Name: A10, dtype: int64
A13
A13
0    543
1    134
2      9
3      2
4      1
6      1
Name: A13, dtype: int64
A14
A14
0    682
1      4
2      1
3      2
6      1
Name: A14, dtype: int64


### EWD - Scenario 3: k = 10

In [26]:
# Perform discretization
k = 10
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  10 : 0.0248870849609375


In [27]:
# OUTPUT:
data_ewd3.head()

Unnamed: 0,A2,A3,A7,A10,A13,A14,A1,A11,A12,A4,A5,A6,A8,A9,label
0,1,4,0,0,0,0,1,1,1,1,3,3,0,0,0
1,1,2,0,0,0,0,0,0,1,1,7,3,0,0,0
2,2,0,0,0,1,0,0,1,1,0,3,3,0,0,0
3,1,4,0,1,0,0,0,1,1,0,4,2,1,1,1
4,0,2,0,2,0,0,1,0,1,1,5,3,1,1,1


In [28]:
## OUTPUT: Check number of instance in each interval in the data_ewd
# With equal width discretisation, each bin does not necessarily 
# contain the same number of observations.
for col in num_col:
    print(col)
    print(data_ewd3.groupby(col)[col].count())

A2
A2
0     96
1    209
2    147
3     99
4     54
5     42
6     25
7      9
8      6
9      3
Name: A2, dtype: int64
A3
A3
0    347
1    129
2     55
3     72
4     53
5     17
6      7
7      5
8      2
9      3
Name: A3, dtype: int64
A7
A7
0    523
1     95
2     37
3     12
4     12
5      6
6      2
7      2
9      1
Name: A7, dtype: int64
A10
A10
0    595
1     72
2     20
3      1
5      1
9      1
Name: A10, dtype: int64
A13
A13
0    451
1    183
2     45
3      6
4      3
5      1
9      1
Name: A13, dtype: int64
A14
A14
0    678
1      7
2      1
3      1
4      1
5      1
9      1
Name: A14, dtype: int64


## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameter:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [29]:
def efd_disc(data, k):
    """Discretise the continuous columns into ``k`` equal-frequency bins.

    Builds an ``EqualFrequencyDiscretiser`` (imported as ``efd``) over the
    notebook-level ``num_col`` list, fits it on ``data`` and returns the
    transformed frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretise; must contain every column listed in ``num_col``.
    k : int
        Desired number of quantile intervals (passed to the discretiser as ``q``).

    Returns
    -------
    The frame returned by the discretiser's ``transform``, with the
    ``num_col`` columns replaced by bin indices. A column dominated by a few
    repeated values can end up with fewer than ``k`` distinct bins.
    """
    ## set up the discretisation transformer
    discretiser = efd(q=k, variables=num_col)
    ## fit the transformer; afterwards discretiser.binner_dict_ stores the
    ## interval limits identified for each variable
    discretiser.fit(data)
    ## transform the data
    return discretiser.transform(data)

### Define function efd_disc, inputs include dataset, number of intervals (k)

### EFD - Scenario 1: k = 4

In [30]:
# Perform discretization
k = 4
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  4 : 0.035897016525268555


In [31]:
## OUTPUT: Check number of instance in each interval 
for col in num_col:
    print(col)
    print(data_efd1.groupby(col)[col].count())

A2
A2
0    179
1    166
2    172
3    173
Name: A2, dtype: int64
A3
A3
0    177
1    170
2    170
3    173
Name: A3, dtype: int64
A7
A7
0    181
1    181
2    156
3    172
Name: A7, dtype: int64
A10
A10
0    539
1    151
Name: A10, dtype: int64
A13
A13
0    201
1    164
2    154
3    171
Name: A13, dtype: int64
A14
A14
0    352
1    165
2    173
Name: A14, dtype: int64


### EFD - Scenario 2: k = 7

In [32]:
# Perform discretization
k = 7
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  7 : 0.023318052291870117


In [33]:
## OUTPUT
data_efd2.info()
## OUTPUT: Check number of instance in each interval in the data_efd
for col in num_col:
    print(col)
    print(data_efd2.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0    103
1     95
2     99
3     97
4     99
5     98
6     99
Name: A2, dtype: int64
A3
A3
0    106
1    104
2     87
3     98
4    108
5     88
6     99
Name: A3, dtype: int64
A7
A7
0    10

### EFD - Scenario 3: k = 10

In [34]:
# Perform discretization
k = 10
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  10 : 0.02377176284790039


In [35]:
## OUTPUT
data_efd3.info()
## OUTPUT: Check number of instance in each interval in the data_efd
for col in num_col:
    print(col)
    print(data_efd3.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0    69
1    69
2    70
3    70
4    67
5    70
6    70
7    69
8    67
9    69
Name: A2, dtype: int64
A3
A3
0    73
1    71
2    66
3    66
4    71
5    67
6    74
7    64
8    69
9    69
Na

## Fixed Frequency Discretization - FFD

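### Define function ffd_disc: reuse the equal-frequency discretiser (efd) with a derived number of bins
Inputs are the dataset and the interval frequency (m); the number of bins is n/m, where n is the number of rows.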

In [36]:
def ffd_disc(data, m):
    """Fixed-frequency discretisation: bins of roughly ``m`` rows each.

    Derives the number of bins as ``round(len(data) / m)`` and delegates to
    the equal-frequency discretiser (imported as ``efd``) over the
    notebook-level ``num_col`` list.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretise; must contain every column listed in ``num_col``.
    m : int or float
        Target number of rows per interval. Must be positive.

    Returns
    -------
    The frame returned by the discretiser's ``transform``, with the
    ``num_col`` columns replaced by bin indices.

    Raises
    ------
    ValueError
        If ``m`` is not positive.
    """
    if m <= 0:
        raise ValueError("m must be a positive interval frequency")
    n = len(data)
    # number of bins = n/m; clamp to 1 so that m > 2n does not ask for zero bins
    n_bins = max(1, round(n / m))
    ## set up the discretisation transformer
    discretiser = efd(q=n_bins, variables=num_col)
    ## fit the transformer; afterwards discretiser.binner_dict_ stores the
    ## interval limits identified for each variable
    discretiser.fit(data)
    ## transform the data
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [37]:
# Perform discretization
m = 10
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD,  m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD,  m =  10 : 0.03240799903869629


In [38]:
## OUTPUT
data_ffd1.info()
## OUTPUT: Check number of instance in each interval
for col in num_col:
    print(col)
    print(data_ffd1.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0     10
1     10
2     11
3     11
4      9
      ..
64    10
65    10
66    10
67    10
68    10
Name: A2, Length: 69, dtype: int64
A3
A3
0     24
1      7
2      9
3     12
4     12
      

### FFD - Scenario 2: m = 30

In [39]:
# Perform discretization
m = 30
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
# Label fixed: this run times FFD, not EFD
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, EFD, m =  30 : 0.02730393409729004


In [40]:
## OUTPUT
data_ffd2.info()
## OUTPUT: Check number of instance in each interval
for col in num_col:
    print(col)
    print(data_ffd2.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0     31
1     30
2     30
3     30
4     31
5     30
6     29
7     30
8     30
9     29
10    30
11    32
12    28
13    30
14    32
15    29
16    31
17    29
18    30
19    29
20    30
21

### FFD - Scenario 3: m = 60

In [41]:
# Perform discretization
m = 60
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  60 : 0.02887701988220215


In [42]:
## OUTPUT
data_ffd3.info()
## OUTPUT: Check number of instance in each interval
for col in num_col:
    print(col)
    print(data_ffd3.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0     59
1     57
2     63
3     51
4     59
5     56
6     62
7     54
8     56
9     58
10    57
11    58
Name: A2, dtype: int64
A3
A3
0     58
1     60
2     59
3     62
4     53
5     55


### FFD - Scenario 4: m = 100

In [43]:
# Perform discretization
m = 100
# perf_counter is a monotonic, high-resolution clock meant for timing short intervals
start = time.perf_counter() # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  100 : 0.023678064346313477


In [44]:
## OUTPUT
data_ffd4.info()

## OUTPUT: Check number of instance in each interval
for col in num_col:
    print(col)
    print(data_ffd4.groupby(col)[col].count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A2      690 non-null    int64   
 1   A3      690 non-null    int64   
 2   A7      690 non-null    int64   
 3   A10     690 non-null    int64   
 4   A13     690 non-null    int64   
 5   A14     690 non-null    int64   
 6   A1      690 non-null    category
 7   A11     690 non-null    category
 8   A12     690 non-null    category
 9   A4      690 non-null    category
 10  A5      690 non-null    category
 11  A6      690 non-null    category
 12  A8      690 non-null    category
 13  A9      690 non-null    category
 14  label   690 non-null    category
dtypes: category(9), int64(6)
memory usage: 40.4 KB
A2
A2
0    103
1     95
2     99
3     97
4     99
5     98
6     99
Name: A2, dtype: int64
A3
A3
0    106
1    104
2     87
3     98
4    108
5     88
6     99
Name: A3, dtype: int64
A7
A7
0    10

### Export discretized datasets

In [45]:
# EWD datasets:
data_ewd1.to_csv('aus_ewd1.csv', index = False) # k=4
data_ewd2.to_csv('aus_ewd2.csv', index = False) # k=7
data_ewd3.to_csv('aus_ewd3.csv', index = False) # k=10

In [46]:
# EFD datasets: each file gets the frame produced for its own k
data_efd1.to_csv('aus_efd1.csv', index = False) # k=4
data_efd2.to_csv('aus_efd2.csv', index = False) # k=7
data_efd3.to_csv('aus_efd3.csv', index = False) # k=10


In [47]:
# FFD datasets:
data_ffd1.to_csv('aus_ffd1.csv', index = False) # m=10
data_ffd2.to_csv('aus_ffd2.csv', index = False) # m=30
data_ffd3.to_csv('aus_ffd3.csv', index = False) # m=60
data_ffd4.to_csv('aus_ffd4.csv', index = False) # m=100