# Unsupervised discretization

Dataset: clean_musk (M)

Updated at: 26 June 22

By: Sam

### About Dataset

Number of Instances  6,598

Number of Attributes 168 plus the class.

For Each Attribute:
   Attribute:           Description:
   - molecule_name: Symbolic name of each molecule.  Musks have names such as MUSK-188.  Non-musks have names such as NON-MUSK-jp13.
   - conformation_name: Symbolic name of each conformation.  
   
   - f1 through f162: These are "distance features" along rays (see paper cited above).  The distances are measured in hundredths of Angstroms. Any experiments with the data should treat these feature values as lying on an arbitrary continuous scale.  In particular, the algorithm should not make any use of the zero point or the sign of each feature value. 
   - f163: This is the distance of the oxygen atom in the molecule to a designated point in 3-space. This is also called OXY-DIS.
   - f164: OXY-X: X-displacement from the designated point.
   - f165: OXY-Y: Y-displacement from the designated point.
   - f166: OXY-Z: Z-displacement from the designated point.
   
   class:               0 => non-musk, 1 => musk

   Please note that the molecule_name and conformation_name attributes
   should not be used to predict the class.

Missing Attribute Values: none.

Class Distribution (counted per molecule, not per instance; 39 + 63 = 102 molecules): 
   Musks:     39
   Non-musks: 63

In [1]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit

In [2]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised

In [3]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [4]:
# Load dataset: read the cleaned Musk data from a CSV file in the working directory
data = pd.read_csv('clean_musk.csv')

In [5]:
# Preview the first rows of the loaded data as a quick sanity check
data.head()

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,-161,...,-308,52,-7,39,126,156,-50,-112,96,1.0
1,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,-171,...,-59,-2,52,103,136,169,-61,-136,79,1.0
2,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,-168,...,-134,-154,57,143,142,165,-67,-145,39,1.0
3,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,136,168,-60,-135,80,1.0
4,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,137,168,-60,-135,80,1.0


In [6]:
# Summarize the frame: number of rows and columns, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: float64(1), int64(166), object(2)
memory usage: 8.5+ MB


In [7]:
# Cast the outcome column to a categorical dtype; this keeps it out of the
# numeric-dtype column selection that picks the attributes to discretize
data['class'] = data['class'].astype('category')

In [8]:
# Names of all numeric-dtype columns, as a plain list: these are the
# attributes that the discretizers below operate on
num_col = data.select_dtypes(include=np.number).columns.tolist()

In [9]:
# Display the names of the numeric attributes selected for discretization
num_col

['f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34',
 'f35',
 'f36',
 'f37',
 'f38',
 'f39',
 'f40',
 'f41',
 'f42',
 'f43',
 'f44',
 'f45',
 'f46',
 'f47',
 'f48',
 'f49',
 'f50',
 'f51',
 'f52',
 'f53',
 'f54',
 'f55',
 'f56',
 'f57',
 'f58',
 'f59',
 'f60',
 'f61',
 'f62',
 'f63',
 'f64',
 'f65',
 'f66',
 'f67',
 'f68',
 'f69',
 'f70',
 'f71',
 'f72',
 'f73',
 'f74',
 'f75',
 'f76',
 'f77',
 'f78',
 'f79',
 'f80',
 'f81',
 'f82',
 'f83',
 'f84',
 'f85',
 'f86',
 'f87',
 'f88',
 'f89',
 'f90',
 'f91',
 'f92',
 'f93',
 'f94',
 'f95',
 'f96',
 'f97',
 'f98',
 'f99',
 'f100',
 'f101',
 'f102',
 'f103',
 'f104',
 'f105',
 'f106',
 'f107',
 'f108',
 'f109',
 'f110',
 'f111',
 'f112',
 'f113',
 'f114',
 'f115',
 'f116',
 'f117',
 'f118',
 'f119',
 'f120',
 'f121',
 'f122',
 'f123',
 

## Equal Width Discretization

In [10]:
# Define function ewd_disc. Inputs: dataset, number of bins (k)

def ewd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-width bins.

    An ``EqualWidthDiscretiser`` (imported as ``ewd``) is fitted on ``data``
    and then applied to that same data. The variables to discretize come from
    the module-level ``num_col`` list.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing every column named in ``num_col``.
    k : int
        Desired number of equal-width intervals (bins) per variable.

    Returns
    -------
    pandas.DataFrame
        The dataset after discretization. Because ``return_boundaries`` is
        False, each discretized value is an integer bin index.

    Notes
    -----
    Options of the underlying discretiser:

    bins : int, default=10
        Desired number of equal width intervals / bins.

    variables : list
        The list of numerical variables to transform. If None, the
        discretiser will automatically select all numerical type variables.

    return_object : bool, default=False
        Whether the numbers in the discrete variable should be returned as
        numeric or as object. The decision should be made by the user based on
        whether they would like to proceed the engineering of the variable as
        if it was numerical or categorical.

    return_boundaries : bool, default=False
        Whether the output should be the interval boundaries. If True, it
        returns the interval boundaries. If False, it returns integers.
    """
    # Set up the discretisation transformer. The local name is deliberately
    # different from the function name so the function is not shadowed.
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    # Fit the transformer: learns the interval limits of every variable
    # (available afterwards as discretiser.binner_dict_).
    discretiser.fit(data)
    # Transform the data and return the dataset after discretization.
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [11]:
# Perform discretization (EWD, k = 4) and time it
k = 4
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  4 : 0.4553401470184326


In [12]:
# OUTPUT: preview the first rows of the EWD-discretized data (k = 4)
data_ewd1.head()

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,0,1,1,0,0,2,2,0,...,0,2,1,1,1,0,1,2,2,1.0
1,MUSK-211,211_1+10,0,0,0,1,0,1,2,0,...,2,2,2,1,1,0,1,1,2,1.0
2,MUSK-211,211_1+11,0,0,0,2,0,2,2,0,...,1,0,2,1,1,0,1,1,2,1.0
3,MUSK-211,211_1+12,0,0,0,1,0,1,2,0,...,2,2,2,1,1,0,1,1,2,1.0
4,MUSK-211,211_1+13,0,0,0,1,0,1,2,0,...,2,2,2,1,1,0,1,1,2,1.0


In [13]:
## OUTPUT: number of instances that fall in each interval of data_ewd1
# Equal-width bins span equal value ranges, so the number of
# observations per bin is generally not the same.
for col in num_col:
    print(col, data_ewd1.groupby(col)[col].count(), sep="\n")

f1
f1
0    4590
1    1444
2     210
3     354
Name: f1, dtype: int64
f2
f2
0    3907
1    1446
2     164
3    1081
Name: f2, dtype: int64
f3
f3
0    3178
1    1047
2    1339
3    1034
Name: f3, dtype: int64
f4
f4
0    2911
1    1048
2    1300
3    1339
Name: f4, dtype: int64
f5
f5
0    6391
1       8
2       6
3     193
Name: f5, dtype: int64
f6
f6
0     905
1    1373
2    3665
3     655
Name: f6, dtype: int64
f7
f7
0    2331
1     458
2    3308
3     501
Name: f7, dtype: int64
f8
f8
0    1625
1    2599
2    2325
3      49
Name: f8, dtype: int64
f9
f9
0    2268
1     426
2    3370
3     534
Name: f9, dtype: int64
f10
f10
0     495
1    2882
2    3036
3     185
Name: f10, dtype: int64
f11
f11
0    1685
1    1805
2    3016
3      92
Name: f11, dtype: int64
f12
f12
0     974
1    2839
2    2554
3     231
Name: f12, dtype: int64
f13
f13
0     250
1    4062
2    2096
3     190
Name: f13, dtype: int64
f14
f14
0    1591
1    1744
2    2923
3     340
Name: f14, dtype: int64
f15
f15
0    1738
1

f154
0    2551
1    2791
2     576
3     680
Name: f154, dtype: int64
f155
f155
0    2688
1    1993
2    1850
3      67
Name: f155, dtype: int64
f156
f156
0    2137
1    1359
2    2764
3     338
Name: f156, dtype: int64
f157
f157
0    6104
1      24
2      75
3     395
Name: f157, dtype: int64
f158
f158
0    3498
1     956
2    1710
3     434
Name: f158, dtype: int64
f159
f159
0    3522
1     278
2    2162
3     636
Name: f159, dtype: int64
f160
f160
0    2037
1    3545
2     743
3     273
Name: f160, dtype: int64
f161
f161
0    2423
1    2215
2    1891
3      69
Name: f161, dtype: int64
f162
f162
0     956
1    4644
2     475
3     523
Name: f162, dtype: int64
f163
f163
0    4686
1    1705
2     173
3      34
Name: f163, dtype: int64
f164
f164
0      15
1    6152
2     253
3     178
Name: f164, dtype: int64
f165
f165
0     189
1    4241
2    1900
3     268
Name: f165, dtype: int64
f166
f166
0     104
1     218
2    6074
3     202
Name: f166, dtype: int64


### EWD - Scenario 2: k = 7

In [14]:
# Perform discretization (EWD, k = 7) and time it
k = 7
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  7 : 0.4179649353027344


In [15]:
# OUTPUT: preview the first rows of the EWD-discretized data (k = 7)
data_ewd2.head()

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,1,2,3,1,0,4,3,0,...,0,4,2,2,3,1,2,3,4,1.0
1,MUSK-211,211_1+10,1,0,0,3,0,3,4,0,...,4,3,4,2,3,1,2,3,4,1.0
2,MUSK-211,211_1+11,1,0,0,3,0,4,4,0,...,3,1,4,3,3,1,2,3,4,1.0
3,MUSK-211,211_1+12,1,0,0,3,0,3,4,0,...,4,3,4,2,3,1,2,3,4,1.0
4,MUSK-211,211_1+13,1,0,0,3,0,3,4,0,...,4,3,4,2,3,1,2,3,4,1.0


In [16]:
## OUTPUT: number of instances that fall in each interval of data_ewd2
# Equal-width bins span equal value ranges, so the number of
# observations per bin is generally not the same.
for col in num_col:
    print(col, data_ewd2.groupby(col)[col].count(), sep="\n")

f1
f1
0     268
1    5372
2     232
3     304
4      59
5     215
6     148
Name: f1, dtype: int64
f2
f2
0    3178
1    1180
2     825
3     250
4      74
5     341
6     750
Name: f2, dtype: int64
f3
f3
0    1824
1    1505
2     724
3     414
4     833
5    1077
6     221
Name: f3, dtype: int64
f4
f4
0    1338
1    1800
2     453
3    1397
4     251
5     357
6    1002
Name: f4, dtype: int64
f5
f5
0    6384
1      10
2       5
4       3
5     105
6      91
Name: f5, dtype: int64
f6
f6
0     494
1     646
2     730
3    1520
4    2260
5     721
6     227
Name: f6, dtype: int64
f7
f7
0    2256
1     117
2     234
3    1435
4    2039
5      57
6     460
Name: f7, dtype: int64
f8
f8
0     776
1     970
2    1271
3    2822
4     677
5      75
6       7
Name: f8, dtype: int64
f9
f9
0    2164
1     115
2     293
3    2743
4     732
5     197
6     354
Name: f9, dtype: int64
f10
f10
0      72
1     473
2    1724
3    3377
4     635
5     249
6      68
Name: f10, dtype: int64
f11
f11
0    1269

f161
0     736
1    2212
2     875
3    2249
4     419
5      38
6      69
Name: f161, dtype: int64
f162
f162
0     128
1    1109
2    2453
3    2084
4     261
5     286
6     277
Name: f162, dtype: int64
f163
f163
0     738
1    4634
2     963
3      83
4     138
5      20
6      22
Name: f163, dtype: int64
f164
f164
0      15
1      29
2    5927
3     380
4      12
5     103
6     132
Name: f164, dtype: int64
f165
f165
0      41
1     399
2    1500
3    4190
4     138
5     161
6     169
Name: f165, dtype: int64
f166
f166
0      75
1      36
2     196
3    2510
4    2585
5    1152
6      44
Name: f166, dtype: int64


### EWD - Scenario 3: k = 10

In [17]:
# Perform discretization (EWD, k = 10) and time it
k = 10
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  10 : 0.40227198600769043


In [18]:
# OUTPUT: preview the first rows of the EWD-discretized data (k = 10)
data_ewd3.head()

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,2,3,4,1,0,6,5,1,...,0,6,3,2,4,1,4,5,6,1.0
1,MUSK-211,211_1+10,2,0,0,4,0,4,5,0,...,6,5,5,4,4,1,3,4,6,1.0
2,MUSK-211,211_1+11,2,0,0,5,0,6,5,1,...,4,1,5,4,4,1,3,4,6,1.0
3,MUSK-211,211_1+12,2,0,0,4,0,4,5,1,...,6,5,5,4,4,1,3,4,6,1.0
4,MUSK-211,211_1+13,2,0,0,4,0,4,5,1,...,6,5,5,4,4,1,3,4,6,1.0


In [19]:
## OUTPUT: number of instances that fall in each interval of data_ewd3
# Equal-width bins span equal value ranges, so the number of
# observations per bin is generally not the same.
for col in num_col:
    print(col, data_ewd3.groupby(col)[col].count(), sep="\n")

f1
f1
0      52
1     690
2    5012
3      81
4     199
5     160
6      41
7     187
8      29
9     147
Name: f1, dtype: int64
f2
f2
0    2785
1     769
2     878
3     678
4     243
5     108
6      43
7     325
8     226
9     543
Name: f2, dtype: int64
f3
f3
0    1344
1    1108
2     904
3     303
4     566
5     771
6     272
7     769
8     553
9       8
Name: f3, dtype: int64
f4
f4
0     475
1    1902
2     850
3     339
4     393
5    1182
6      51
7     308
8     517
9     581
Name: f4, dtype: int64
f5
f5
0    6359
1      30
2       8
3       2
6       2
7      20
8     119
9      58
Name: f5, dtype: int64
f6
f6
0     294
1     349
2     511
3     446
4     678
5    1319
6    1853
7     633
8     379
9     136
Name: f6, dtype: int64
f7
f7
0    2168
1     143
2      79
3     188
4     211
5    2539
6     749
7      38
8      53
9     430
Name: f7, dtype: int64
f8
f8
0     306
1     614
2     857
3     549
4    1898
5    1829
6     431
7      81
8      29
9       4
Name: f8, d

f133
0      46
1     148
2     208
3     566
4    3115
5    1821
6     335
7     179
8      85
9      95
Name: f133, dtype: int64
f134
f134
0      57
1     147
2     101
3     915
4    2199
5    2464
6     281
7     183
8     184
9      67
Name: f134, dtype: int64
f135
f135
0      69
1     421
2    1195
3     284
4    1892
5    2271
6     161
7     229
8      72
9       4
Name: f135, dtype: int64
f136
f136
0      35
1     244
2     349
3    1233
4     804
5    2427
6      88
7     555
8     805
9      58
Name: f136, dtype: int64
f137
f137
0    1036
1     359
2     410
3     314
4     191
5     645
6    2589
7     697
8     170
9     187
Name: f137, dtype: int64
f138
f138
0     300
1      60
2    1023
3    1109
4    1557
5    1066
6     120
7     438
8     350
9     575
Name: f138, dtype: int64
f139
f139
0    1159
1     652
2     441
3     218
4    1219
5    2303
6     333
7     166
8      63
9      44
Name: f139, dtype: int64
f140
f140
0      54
1     225
2     229
3     749
4    2599


## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameters:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [20]:
def efd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-frequency bins.

    An ``EqualFrequencyDiscretiser`` (imported as ``efd``) is fitted on
    ``data`` and then applied to that same data. The variables to discretize
    come from the module-level ``num_col`` list.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing every column named in ``num_col``.
    k : int
        Desired number of equal-frequency intervals, i.e. the number of
        quantiles into which each variable is divided.

    Returns
    -------
    pandas.DataFrame
        The dataset after discretization.
    """
    # Set up the discretisation transformer. The local name is deliberately
    # different from the function name so the function is not shadowed.
    discretiser = efd(q=k, variables=num_col)
    # Fit the transformer: learns the interval limits of every variable
    # (available afterwards as discretiser.binner_dict_).
    discretiser.fit(data)
    # Transform the data and return the dataset after discretization.
    return discretiser.transform(data)

### Define function efd_disc, inputs include dataset, number of intervals (k)

### EFD - Scenario 1: k = 4

In [21]:
# Perform discretization (EFD, k = 4) and time it
k = 4
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  4 : 0.47538113594055176


In [22]:
## OUTPUT: number of instances that fall in each interval of data_efd1
for col in num_col:
    print(col, data_efd1.groupby(col)[col].count(), sep="\n")

f1
f1
0    1791
1    1897
2    1369
3    1541
Name: f1, dtype: int64
f2
f2
0    1862
1    1444
2    1663
3    1629
Name: f2, dtype: int64
f3
f3
0    1658
1    1644
2    1694
3    1602
Name: f3, dtype: int64
f4
f4
0    1794
1    1525
2    1637
3    1642
Name: f4, dtype: int64
f5
f5
0    4890
1     515
2    1193
Name: f5, dtype: int64
f6
f6
0    1686
1    1639
2    1666
3    1607
Name: f6, dtype: int64
f7
f7
0    1668
1    1742
2    1696
3    1492
Name: f7, dtype: int64
f8
f8
0    1651
1    1670
2    1660
3    1617
Name: f8, dtype: int64
f9
f9
0    1693
1    1725
2    1897
3    1283
Name: f9, dtype: int64
f10
f10
0    1650
1    1694
2    1670
3    1584
Name: f10, dtype: int64
f11
f11
0    1656
1    1661
2    1681
3    1600
Name: f11, dtype: int64
f12
f12
0    1724
1    1580
2    1651
3    1643
Name: f12, dtype: int64
f13
f13
0    1657
1    1665
2    1672
3    1604
Name: f13, dtype: int64
f14
f14
0    1650
1    1685
2    1714
3    1549
Name: f14, dtype: int64
f15
f15
0    1651
1    1732
2

### EFD - Scenario 2: k = 7

In [23]:
# Perform discretization (EFD, k = 7) and time it
k = 7
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  7 : 0.4613490104675293


In [24]:
## OUTPUT: structure of the discretized frame
data_efd2.info()
## OUTPUT: number of instances that fall in each interval of data_efd2
for col in num_col:
    print(col, data_efd2.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0    1256
1     715
2    1087
3     794
4     985
5     827
6     934
Name: f1, dtype: int64
f2
f2
0     978
1    1075
2     781
3     960
4     951
5     937
6     916
Name: f2, dtype: int64
f3
f3
0    1029
1     911
2    1006
3     882
4     909
5     932
6     929
Name: f3, dtype: int64
f4
f4
0    1049
1     950
2     872
3     910
4    1037
5     839
6     941
Name: f4, dtype: int64
f5
f5
0    4890
1     819
2     889
Name: f5, dtype: int64
f6
f6
0    995
1    892
2    941
3    957
4    930
5    940
6    943
Name: f6, dtype: int64
f7
f7
0    1102
1     796
2     931
3     961
4    1006
5     866
6     936
Name: f7, dtype: int64
f8
f8
0    945
1    953
2    962
3    936
4    939
5    924
6    939
Name: f8, dtype: int64
f9
f9
0    1021
1     885
2     927
3    1050
4     989
5     815
6     911
Na

### EFD - Scenario 3: k = 10

In [25]:
# Perform discretization (EFD, k = 10) and time it
k = 10
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  10 : 0.4925680160522461


In [26]:
## OUTPUT: structure of the discretized frame
data_efd3.info()
## OUTPUT: number of instances that fall in each interval of data_efd3
for col in num_col:
    print(col, data_efd3.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0     742
1     821
2     472
3    1023
4     630
5     446
6     703
7     470
8     643
9     648
Name: f1, dtype: int64
f2
f2
0    802
1    548
2    703
3    594
4    659
5    672
6    649
7    669
8    645
9    657
Name: f2, dtype: int64
f3
f3
0    685
1    659
2    662
3    817
4    479
5    675
6    760
7    546
8    754
9    561
Name: f3, dtype: int64
f4
f4
0    707
1    631
2    661
3    652
4    668
5    640
6    710
7    644
8    692
9    593
Name: f4, dtype: int64
f5
f5
0    4890
1     515
2     537
3     656
Name: f5, dtype: int64
f6
f6
0    661
1    673
2    663
3    766
4    562
5    705
6    593
7    709
8    611
9    655
Name: f6, dtype: int64
f7
f7
0    888
1    629
2    484
3    643
4    766
5    632
6    754
7    482
8    703
9    617
Name: f7, dtype: int64
f8
f8
0    673
1    655

## Fixed Frequency Discretization - FFD

### Define function ffd_disc: reuses the EFD discretiser with a different input
Inputs include the dataset and the interval frequency (m)

In [27]:
def ffd_disc(data, m):
    """Discretize the numeric attributes of ``data`` with a fixed interval frequency.

    Fixed frequency discretization is carried out with the
    ``EqualFrequencyDiscretiser`` (imported as ``efd``): the requested number
    of bins is ``round(n / m)``, where ``n`` is the number of rows in
    ``data``. The variables to discretize come from the module-level
    ``num_col`` list.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing every column named in ``num_col``.
    m : int
        Target interval frequency, i.e. the desired number of instances per
        bin. Must be positive.

    Returns
    -------
    pandas.DataFrame
        The dataset after discretization.

    Raises
    ------
    ValueError
        If ``m`` is not positive.
    """
    # Reject a non-positive frequency up front instead of failing later with
    # a division by zero or a meaningless negative bin count.
    if m <= 0:
        raise ValueError("m (interval frequency) must be a positive number")
    n = len(data)
    # number of bins = n/m; clamp to at least one bin so that a frequency
    # larger than the dataset does not request zero quantiles.
    n_bins = max(1, round(n / m))
    # Set up the discretisation transformer. The local name is deliberately
    # different from the function name so the function is not shadowed.
    discretiser = efd(q=n_bins, variables=num_col)
    # Fit the transformer: learns the interval limits of every variable
    # (available afterwards as discretiser.binner_dict_).
    discretiser.fit(data)
    # Transform the data and return the dataset after discretization.
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [28]:
# Perform discretization (FFD, m = 10) and time it
m = 10
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD,  m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD,  m =  10 : 1.8921542167663574


In [29]:
## OUTPUT: structure of the discretized frame
data_ffd1.info()
## OUTPUT: number of instances that fall in each interval of data_ffd1
for col in num_col:
    print(col, data_ffd1.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0      12
1       8
2      13
3      13
4       6
       ..
111     9
112    29
114    47
115    49
116    24
Name: f1, Length: 114, dtype: int64
f2
f2
0      802
1      176
2      181
3      191
5      278
      ... 
196     88
197     39
198     17
199     10
200      8
Name: f2, Length: 190, dtype: int64
f3
f3
0      21
1      36
2      39
3      11
4      24
       ..
185    69
186    46
187    10
188     5
189    10
Name: f3, Length: 174, dtype: int64
f4
f4
0       23
1       38
2       56
3       22
4       42
      ... 
194    114
195     35
196      5
197      8
198     10
Name: f4, Length: 189, dtype: int64
f5
f5
0     4890
1      515
2      160
3       57
4       87
5       76
6       67
7       47
8       43
9       49
10      27
12      21
13      16
14      21
15      10
16      16
17  

f143
0       59
1      112
2      236
3      230
4      304
      ... 
219     17
220      7
221     15
222      3
223      8
Name: f143, Length: 215, dtype: int64
f144
f144
0      236
1      339
2      377
3      186
4      305
      ... 
213      9
214      8
215     10
216     10
217     10
Name: f144, Length: 208, dtype: int64
f145
f145
0     4999
1      652
2       40
3       80
4       25
      ... 
64       9
65      12
66       9
67      10
68       9
Name: f145, Length: 67, dtype: int64
f146
f146
0      443
1      942
2     2096
3     1402
4      442
      ... 
93       9
94      11
95       9
96      10
97      10
Name: f146, Length: 98, dtype: int64
f147
f147
0      12
1      17
2      27
3      41
4     118
     ... 
73     10
74     10
75     11
76      9
77     10
Name: f147, Length: 75, dtype: int64
f148
f148
0      23
1      10
2      11
3       8
4       9
       ..
232     7
233    12
234     9
235     9
236    10
Name: f148, Length: 220, dtype: int64
f149
f149
0     

### FFD - Scenario 2: m = 30

In [30]:
# Perform discretization (FFD, m = 30) and time it
m = 30
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
# Label corrected from "EFD" to "FFD": this cell times ffd_disc
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, EFD, m =  30 : 1.2363886833190918


In [31]:
## OUTPUT: structure of the discretized frame
data_ffd2.info()
## OUTPUT: number of instances that fall in each interval of data_ffd2
for col in num_col:
    print(col, data_ffd2.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0     33
1     56
2     82
3     12
4     48
      ..
62    29
63    29
64    47
65    49
66    24
Name: f1, Length: 67, dtype: int64
f2
f2
0      802
1      176
2      181
3      191
5      278
      ... 
129    112
130     88
131     39
132     17
133     18
Name: f2, Length: 133, dtype: int64
f3
f3
0      57
1      39
2      35
3      71
4      44
       ..
125    31
126    50
127    69
128    46
129    25
Name: f3, Length: 127, dtype: int64
f4
f4
0       61
1       56
2       22
3       42
4       39
      ... 
127     53
128     69
129    114
130     35
131     23
Name: f4, Length: 131, dtype: int64
f5
f5
0     4890
1      515
2      160
3       57
4       87
5       76
6       67
7       47
8       43
9       49
10      27
11      21
12      37
13      26
14      16
15      41
16      25
17   

f107
0      42
1      33
2      37
3      62
4      75
       ..
112    33
113    29
114    35
115    23
116    30
Name: f107, Length: 117, dtype: int64
f108
f108
0      33
1      34
2      26
3      32
4      28
       ..
173    23
174    28
175    31
176    29
177    30
Name: f108, Length: 177, dtype: int64
f109
f109
0       40
1      254
2      437
3      179
4       29
      ... 
120     41
121     19
122     33
123     35
124     22
Name: f109, Length: 123, dtype: int64
f110
f110
0       38
1       23
2       53
3      374
4      324
      ... 
104     23
105     28
106     36
107     23
108     30
Name: f110, Length: 108, dtype: int64
f111
f111
0     142
1     321
2     255
3     146
4      70
     ... 
93     41
94     17
95     28
96     34
97     26
Name: f111, Length: 97, dtype: int64
f112
f112
0      30
1      33
2      36
3      29
4      34
       ..
150    24
151    33
152    27
153    30
154    30
Name: f112, Length: 155, dtype: int64
f113
f113
0      30
1      30
2     

### FFD - Scenario 3: m = 60

In [32]:
# Perform discretization (FFD, m = 60) and time it
m = 60
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  60 : 0.9191968441009521


In [33]:
## OUTPUT: structure of the discretized frame
data_ffd3.info()
## OUTPUT: number of instances that fall in each interval of data_ffd3
for col in num_col:
    print(col, data_ffd3.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0      89
1      82
2      12
3      58
4     120
5      78
6      57
7     142
8     104
9     133
10    381
11    307
12    228
13    180
14     64
15    140
16    171
17    206
18    506
19    630
20    164
21    282
22    219
23    228
24    256
25     64
26    156
27    250
28    133
29    127
30     59
31     14
32     93
33     27
34     65
35     59
36     66
37     56
38     55
39     72
40     50
41     60
42     74
43     92
44     11
45     58
46     96
47     24
Name: f1, dtype: int64
f2
f2
0     802
1     176
2     181
3     191
4     278
     ... 
74     85
75    112
76     88
77     39
78     35
Name: f2, Length: 79, dtype: int64
f3
f3
0     96
1     35
2     71
3     44
4     76
      ..
84    69
85    81
86    69
87    46
88    25
Name: f3, Length: 89, dtype: int64
f4
f4
0      61


f132
0     135
1     467
2      98
3      54
4      38
     ... 
94     64
95     54
96     78
97     43
98     59
Name: f132, Length: 97, dtype: int64
f133
f133
0     61
1     59
2     65
3     61
4     54
      ..
77    54
78    59
79    60
80    62
81    58
Name: f133, Length: 82, dtype: int64
f134
f134
0     60
1     65
2     59
3     61
4     57
      ..
95    63
96    65
97    53
98    60
99    59
Name: f134, Length: 100, dtype: int64
f135
f135
0     60
1     64
2     56
3     66
4     59
      ..
90    62
91    60
92    59
93    67
94    52
Name: f135, Length: 95, dtype: int64
f136
f136
0     65
1     65
2     68
3     58
4     48
      ..
77    49
78    73
79    50
80    55
81    58
Name: f136, Length: 81, dtype: int64
f137
f137
0     112
1      53
2      82
3     133
4     184
     ... 
82     56
83     57
84     64
85     59
86     57
Name: f137, Length: 86, dtype: int64
f138
f138
0      63
1      90
2      43
3      49
4      55
     ... 
91     92
92    100
93    131
94    

### FFD - Scenario 4: m = 100

In [34]:
# Perform discretization (FFD, m = 100) and time it
m = 100
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter() # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  100 : 0.7416830062866211


In [35]:
## OUTPUT: structure of the discretized frame
data_ffd4.info()

## OUTPUT: number of instances that fall in each interval of data_ffd4
for col in num_col:
    print(col, data_ffd4.groupby(col)[col].count(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: category(1), int64(166), object(2)
memory usage: 8.5+ MB
f1
f1
0     171
1      60
2     130
3      42
4     115
5     120
6     104
7     133
8     381
9     307
10    228
11    180
12     64
13    140
14    171
15    206
16    506
17    630
18    164
19    282
20    219
21    228
22     19
23    237
24     64
25    156
26    250
27    133
28    127
29     59
30    107
31     66
32    103
33    104
34    108
35     86
36    117
37     92
38    116
39     73
Name: f1, dtype: int64
f2
f2
0     802
1     176
2     181
3     191
4     278
5     234
6     191
7     106
8      81
9      81
10     82
11    106
12    109
13     88
14    110
15    109
16     86
17    100
18     92
19    103
20    101
21     93
22    108
23    128
24     83
25     88
26    124
27     70
28    136
29     62
30    113
31     95
32    120
33     71
34    112
35     97
36    114
37     89
3

f152
0     474
1     693
2     451
3     337
4     449
5     237
6     171
7     143
8     106
9     109
10     69
11     61
12    110
13     89
14    101
15    100
16    134
17    156
18    100
19     18
20    142
21     74
22     83
23    134
24     58
25    109
26    100
27    113
28     95
29    117
30    130
31     90
32     99
33     74
34    125
35     56
36     98
37    107
38    166
39    111
40     61
41     78
42    113
43     76
44     81
Name: f152, dtype: int64
f153
f153
0     1547
1     1625
2      177
3       85
4       74
5      106
6       94
7       92
8      112
9      148
10      58
11     110
12     105
13      97
14      96
15      73
16     122
17     142
18     182
19     213
20     102
21      46
22     116
23      83
24     164
25      60
26      77
27     125
28      80
29      91
30     105
31     127
32      66
33      98
Name: f153, dtype: int64
f154
f154
0     409
1     595
2     397
3     147
4     121
5      91
6      98
7      42
8     146
9      54
1

### Export discretized datasets

In [36]:
# EWD datasets: write each discretized frame to its CSV file, without the index
ewd_exports = (
    (data_ewd1, 'musk_ewd1.csv'),  # k=4
    (data_ewd2, 'musk_ewd2.csv'),  # k=7
    (data_ewd3, 'musk_ewd3.csv'),  # k=10
)
for frame, path in ewd_exports:
    frame.to_csv(path, index=False)

In [37]:
# EFD datasets: write each discretized frame to its CSV file, without the index
efd_exports = (
    (data_efd1, 'musk_efd1.csv'),  # k=4
    (data_efd2, 'musk_efd2.csv'),  # k=7
    (data_efd3, 'musk_efd3.csv'),  # k=10
)
for frame, path in efd_exports:
    frame.to_csv(path, index=False)


In [38]:
# FFD datasets: write each discretized frame to its CSV file, without the index
ffd_exports = (
    (data_ffd1, 'musk_ffd1.csv'),  # m=10
    (data_ffd2, 'musk_ffd2.csv'),  # m=30
    (data_ffd3, 'musk_ffd3.csv'),  # m=60
    (data_ffd4, 'musk_ffd4.csv'),  # m=100
)
for frame, path in ffd_exports:
    frame.to_csv(path, index=False)