# Discretization of pre-processed data using Decision Tree discretization
## Dataset: pageblock

By: Sam
Update: 23/02/2023
Replicate using Malina script

### About Dataset
Number of Instances: 5473.

Number of Attributes: 10 numeric attributes
   - height: Height of the block.
   - length: Length of the block.
   - area: Area of the block (height * length);
   - eccen: Eccentricity of the block (length / height);
   - p_black: Percentage of black pixels within the block (blackpix / area);
   - p_and: Percentage of black pixels after the application of the Run Length Smoothing Algorithm (RLSA) (blackand / area);
   - mean_tr: Mean number of white-black transitions (blackpix / wb_trans);
   - blackpix: Total number of black pixels in the original bitmap of the block (this column is named `blacpix` in the CSV used below).
   - blackand: Total number of black pixels in the bitmap of the block after the RLSA.
   - wb_trans: Number of white-black transitions in the original bitmap of the block.

Missing Attribute Values:  No missing value.

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [2]:
# Load the cleaned pageblock data that will be discretized.
# `data0` and `pageblock` are two names for the same DataFrame object
# (no copy is made), so a change made through one is visible through the other.
pageblock = data0 = pd.read_csv('clean_pageblock.csv')

In [3]:
# Preview the loaded frame (pandas shortens the rendering of long frames).
pageblock

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blacpix,blackand,wb_trans,class
0,5,7,35,1.400,0.400,0.657,2.33,14,23,6,1
1,6,7,42,1.167,0.429,0.881,3.60,18,37,5,1
2,6,18,108,3.000,0.287,0.741,4.43,31,80,7,1
3,5,7,35,1.400,0.371,0.743,4.33,13,26,3,1
4,6,3,18,0.500,0.500,0.944,2.25,9,17,4,1
...,...,...,...,...,...,...,...,...,...,...,...
5468,4,524,2096,131.000,0.542,0.603,40.57,1136,1264,28,2
5469,7,4,28,0.571,0.714,0.929,10.00,20,26,2,1
5470,6,95,570,15.833,0.300,0.911,1.64,171,519,104,1
5471,7,41,287,5.857,0.213,0.801,1.36,61,230,45,1


In [4]:
# Import the preprocessing module that provides LabelEncoder
from sklearn import preprocessing

# LabelEncoder maps each distinct label to an integer code 0..n_classes-1.
label_encoder = preprocessing.LabelEncoder()

# Re-code the target column 'class' in place. `pageblock` is the same object as
# `data0`, so the loaded frame itself is modified by this assignment.
pageblock['class']= label_encoder.fit_transform(pageblock['class'])

# Show the distinct codes that now appear in the target column.
pageblock['class'].unique()

array([0, 1, 3, 4, 2])

In [5]:
# Collect every column name of the frame as a plain Python list.
# At this point the target column 'class' is still part of the list.
num_list = list(pageblock.columns)


In [6]:
# Drop the target column so that only the features to discretize remain.
# The list is rebuilt instead of calling .remove('class') so the cell is safe to
# re-run: a second .remove('class') raises ValueError once the entry is gone.
num_list = [col for col in num_list if col != 'class']

In [7]:
# Keep the encoded target as its own one-column DataFrame named 'class'.
y_list = pd.DataFrame({'class': pageblock['class']})

In [8]:
# Show the feature names that will be discretized ('class' is no longer listed).
num_list

['height',
 'length',
 'area',
 'eccen',
 'p_black',
 'p_and',
 'mean_tr',
 'blacpix',
 'blackand',
 'wb_trans']

In [9]:
# Preview the target frame. A bare `num_list` line used to precede this, but only
# the last expression of a cell is displayed, so that line had no effect.
y_list

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
5468,1
5469,0
5470,0
5471,0


# 2. Decision Tree discretization

In [10]:
# !pip install feature_engine

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser

In [12]:
# Reuse the label-encoded frame under the name `data`.
# This is a plain assignment, not a copy: `data` and `pageblock` are the same object.
data = pageblock
data

Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blacpix,blackand,wb_trans,class
0,5,7,35,1.400,0.400,0.657,2.33,14,23,6,0
1,6,7,42,1.167,0.429,0.881,3.60,18,37,5,0
2,6,18,108,3.000,0.287,0.741,4.43,31,80,7,0
3,5,7,35,1.400,0.371,0.743,4.33,13,26,3,0
4,6,3,18,0.500,0.500,0.944,2.25,9,17,4,0
...,...,...,...,...,...,...,...,...,...,...,...
5468,4,524,2096,131.000,0.542,0.603,40.57,1136,1264,28,1
5469,7,4,28,0.571,0.714,0.929,10.00,20,26,2,0
5470,6,95,570,15.833,0.300,0.911,1.64,171,519,104,0
5471,7,41,287,5.857,0.213,0.801,1.36,61,230,45,0


In [13]:
# Hold out 30% of the rows as a test set, with a fixed seed for repeatability.
# The whole frame, including the 'class' column, is passed as X, so the target
# also travels inside X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['class'], test_size=0.3, random_state=0
)

# DT scripts

In [14]:
# Work on the label-encoded frame (same object as `pageblock`, not a copy).
data = pageblock

# 70/30 train/test split with a fixed seed. The whole frame is passed as X, so
# the 'class' column is present in X_train / X_test as well as in y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['class'], test_size=0.3, random_state=0
)

# Report the size of each partition.
for split_label, split_frame in (("X_train :", X_train), ("X_test :", X_test)):
    print(split_label, split_frame.shape)

X_train : (3831, 11)
X_test : (1642, 11)


## 2.1 DT with small max_depth

In [15]:
# Build the decision-tree discretiser.
# 'max_depth': [2] => 2^2 = 4 intervals max.
import time

# Only the columns in `variables` (num_list) are discretized; X_train / X_test
# also contain 'class', which therefore appears untouched in the output.
# param_grid holds a single candidate, so the grid search has nothing to choose
# between and cv=3 only cross-validates that one setting.
start = time.time()  # start of the timed fit + transform section
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
# Stop the clock here so the reported time covers fitting and transforming only,
# not the console printing below.
end = time.time()

# Stack the transformed train and test rows into one frame: train rows first,
# each row keeping its original index label.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# Fitted per-variable binners stored on the discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

        height    length      area     eccen   p_black     p_and   mean_tr  \
1815  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
5066  0.764957  0.054813  0.048361  0.037736  0.035986  0.308390  0.008525   
5297  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
2923  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
3172  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
...        ...       ...       ...       ...       ...       ...       ...   
1839  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
2106  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
1319  0.008413  0.054813  0.048361  0.037736  0.028436  0.122699  0.008525   
2972  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
3001  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   

       blacpix  blackand  wb_trans  class  
1815  0.048801  0.0

In [16]:
# Summarise the discretization column by column. Every column of `disc` is
# visited, which includes 'class' as well as the discretized features.
for col_name in disc.columns:
    col_values = disc[col_name]
    # Number of distinct values, i.e. number of bins actually produced
    print('No of bins: ' + col_name)
    print(col_values.nunique())
    # How many rows fall into each distinct bin value
    print('Entries per interval for ' + col_name)
    print(Counter(col_values))
    print(' ')

No of bins: height
4
Entries per interval for height
Counter({0.00841311285175515: 4923, 0.7649572649572649: 341, 0.018867924528301886: 151, 0.4772727272727273: 58})
 
No of bins: length
4
Entries per interval for length
Counter({0.05481283422459893: 5344, 0.041666666666666664: 65, 0.92: 36, 0.05555555555555555: 28})
 
No of bins: area
4
Entries per interval for area
Counter({0.04836065573770492: 5239, 0.3515625: 177, 0.4090909090909091: 29, 0.0: 28})
 
No of bins: eccen
4
Entries per interval for eccen
Counter({0.03773584905660377: 5225, 0.05: 89, 0.6607142857142857: 85, 0.9137931034482759: 74})
 
No of bins: p_black
4
Entries per interval for p_black
Counter({0.03598595669982446: 4900, 0.02843601895734597: 291, 0.5901639344262295: 176, 0.375: 106})
 
No of bins: p_and
4
Entries per interval for p_and
Counter({0.01762977473065622: 4377, 0.30839002267573695: 642, 0.12269938650306748: 450, 1.0: 4})
 
No of bins: mean_tr
4
Entries per interval for mean_tr
Counter({0.008524590163934427: 4

In [17]:
# Ordinal encoding of the discretized frame
# (asarray is no longer used in this cell; the import is kept so the name stays
# available to any other cell that relies on it.)
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# Each column of `disc` holds one distinct value per bin. OrdinalEncoder replaces
# those values by integer codes 0..n_bins-1, assigned in ascending value order.
# NOTE(review): the codes follow the order of the bin values, which is not
# necessarily the order of the underlying feature intervals -- confirm this is intended.
encoder = OrdinalEncoder()
# Name the columns after `disc`, the frame that was actually encoded, so the
# labels cannot drift from the data. The new frame gets a fresh 0..n-1 index,
# so rows stay in the train-then-test order of `disc`, not the original file order.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)
# astype(int) raises on NaN, so a separate missing-value check is not needed.
disc_ord = result.astype(int)
print(disc_ord)

# Export the discretized, ordinal-encoded dataset
disc_ord.to_csv('DT_small_discretized_pageblock.csv',index=False)

        height    length      area     eccen   p_black     p_and   mean_tr  \
1815  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
5066  0.764957  0.054813  0.048361  0.037736  0.035986  0.308390  0.008525   
5297  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
2923  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
3172  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
...        ...       ...       ...       ...       ...       ...       ...   
1839  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
2106  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
1319  0.008413  0.054813  0.048361  0.037736  0.028436  0.122699  0.008525   
2972  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   
3001  0.008413  0.054813  0.048361  0.037736  0.035986  0.017630  0.008525   

       blacpix  blackand  wb_trans  class  
1815  0.048801  0.0

In [18]:
# Inspect column dtypes and non-null counts of the ordinal-encoded frame.
disc_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   height    5473 non-null   int64
 1   length    5473 non-null   int64
 2   area      5473 non-null   int64
 3   eccen     5473 non-null   int64
 4   p_black   5473 non-null   int64
 5   p_and     5473 non-null   int64
 6   mean_tr   5473 non-null   int64
 7   blacpix   5473 non-null   int64
 8   blackand  5473 non-null   int64
 9   wb_trans  5473 non-null   int64
 10  class     5473 non-null   int64
dtypes: int64(11)
memory usage: 470.5 KB


## 2.2 DT with medium max_depth

In [19]:
# Build the decision-tree discretiser.
# 'max_depth': [3] => 2^3 = 8 intervals max.
import time

# Only the columns in `variables` (num_list) are discretized; X_train / X_test
# also contain 'class', which therefore appears untouched in the output.
# param_grid holds a single candidate, so the grid search has nothing to choose
# between and cv=3 only cross-validates that one setting.
start = time.time()  # start of the timed fit + transform section
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
# Stop the clock here so the reported time covers fitting and transforming only,
# not the console printing below.
end = time.time()

# Stack the transformed train and test rows into one frame: train rows first,
# each row keeping its original index label.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# Fitted per-variable binners stored on the discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

        height    length      area     eccen   p_black     p_and  mean_tr  \
1815  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
5066  0.740113  0.056443  0.167315  0.250000  0.019614  0.310502  0.00670   
5297  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
2923  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
3172  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
...        ...       ...       ...       ...       ...       ...      ...   
1839  0.006742  0.056443  0.039377  0.028024  0.019614  0.050980  0.00670   
2106  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
1319  0.006742  0.056443  0.039377  0.028024  0.024752  0.111111  0.03271   
2972  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
3001  0.006742  0.056443  0.039377  0.028024  0.019614  0.050980  0.00670   

       blacpix  blackand  wb_trans  class  
1815  0.047619  0.036137  0.014

In [20]:
# Summarise the discretization column by column. Every column of `disc` is
# visited, which includes 'class' as well as the discretized features.
for col_name in disc.columns:
    col_values = disc[col_name]
    # Number of distinct values, i.e. number of bins actually produced
    print('No of bins: ' + col_name)
    print(col_values.nunique())
    # How many rows fall into each distinct bin value
    print('Entries per interval for ' + col_name)
    print(Counter(col_values))
    print(' ')

No of bins: height
7
Entries per interval for height
Counter({0.006742261722341404: 4672, 0.7401129943502824: 254, 0.03804347826086957: 251, 0.022222222222222223: 134, 0.8421052631578947: 87, 0.4772727272727273: 58, 0.0: 17})
 
No of bins: length
6
Entries per interval for length
Counter({0.05644273127753304: 5193, 0.0: 151, 0.041666666666666664: 65, 0.05555555555555555: 28, 1.0: 24, 0.6666666666666666: 12})
 
No of bins: area
6
Entries per interval for area
Counter({0.03937702027622686: 4866, 0.16731517509727625: 373, 0.42857142857142855: 127, 0.23529411764705882: 68, 0.0: 28, 0.375: 11})
 
No of bins: eccen
7
Entries per interval for eccen
Counter({0.02802402058907635: 4984, 0.25: 241, 0.05172413793103448: 85, 0.6851851851851852: 81, 0.8529411764705882: 41, 1.0: 33, 0.0: 8})
 
No of bins: p_black
7
Entries per interval for p_black
Counter({0.019614147909967846: 4420, 0.2012987012987013: 480, 0.024752475247524754: 278, 0.5145631067961165: 146, 0.375: 106, 1.0: 30, 0.1111111111111111: 

In [21]:
# Ordinal encoding of the discretized frame
# (asarray is no longer used in this cell; the import is kept so the name stays
# available to any other cell that relies on it.)
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# Each column of `disc` holds one distinct value per bin. OrdinalEncoder replaces
# those values by integer codes 0..n_bins-1, assigned in ascending value order.
# NOTE(review): the codes follow the order of the bin values, which is not
# necessarily the order of the underlying feature intervals -- confirm this is intended.
encoder = OrdinalEncoder()
# Name the columns after `disc`, the frame that was actually encoded, so the
# labels cannot drift from the data. The new frame gets a fresh 0..n-1 index,
# so rows stay in the train-then-test order of `disc`, not the original file order.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)
# astype(int) raises on NaN, so a separate missing-value check is not needed.
disc_ord = result.astype(int)
print(disc_ord)

# Export the discretized, ordinal-encoded dataset
disc_ord.to_csv('DT_medium_discretized_pageblock.csv',index=False)

        height    length      area     eccen   p_black     p_and  mean_tr  \
1815  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
5066  0.740113  0.056443  0.167315  0.250000  0.019614  0.310502  0.00670   
5297  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
2923  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
3172  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
...        ...       ...       ...       ...       ...       ...      ...   
1839  0.006742  0.056443  0.039377  0.028024  0.019614  0.050980  0.00670   
2106  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
1319  0.006742  0.056443  0.039377  0.028024  0.024752  0.111111  0.03271   
2972  0.006742  0.056443  0.039377  0.028024  0.019614  0.010967  0.00670   
3001  0.006742  0.056443  0.039377  0.028024  0.019614  0.050980  0.00670   

       blacpix  blackand  wb_trans  class  
1815  0.047619  0.036137  0.014

## 2.3 DT with large max_depth

In [22]:
# Build the decision-tree discretiser.
# 'max_depth': [4] => 2^4 = 16 intervals max.
import time

# Only the columns in `variables` (num_list) are discretized; X_train / X_test
# also contain 'class', which therefore appears untouched in the output.
# param_grid holds a single candidate, so the grid search has nothing to choose
# between and cv=3 only cross-validates that one setting.
start = time.time()  # start of the timed fit + transform section
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
# Stop the clock here so the reported time covers fitting and transforming only,
# not the console printing below.
end = time.time()

# Stack the transformed train and test rows into one frame: train rows first,
# each row keeping its original index label.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# Fitted per-variable binners stored on the discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

        height   length      area     eccen   p_black     p_and   mean_tr  \
1815  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
5066  0.740113  0.05493  0.160494  0.296875  0.015759  0.310502  0.006018   
5297  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
2923  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
3172  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
...        ...      ...       ...       ...       ...       ...       ...   
1839  0.005283  0.05493  0.039088  0.031300  0.015759  0.047525  0.006018   
2106  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
1319  0.005283  0.05493  0.039088  0.002513  0.026042  0.077348  0.028302   
2972  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
3001  0.005283  0.05493  0.039088  0.031300  0.015759  0.047525  0.006018   

       blacpix  blackand  wb_trans  class  
1815  0.043033  0.034740  0.005

In [23]:
# Summarise the discretization column by column. Every column of `disc` is
# visited, which includes 'class' as well as the discretized features.
for col_name in disc.columns:
    col_values = disc[col_name]
    # Number of distinct values, i.e. number of bins actually produced
    print('No of bins: ' + col_name)
    print(col_values.nunique())
    # How many rows fall into each distinct bin value
    print('Entries per interval for ' + col_name)
    print(Counter(col_values))
    print(' ')

No of bins: height
10
Entries per interval for height
Counter({0.0052827843380981974: 4608, 0.7401129943502824: 254, 0.046296296296296294: 150, 0.012658227848101266: 119, 0.02631578947368421: 101, 0.8421052631578947: 87, 0.1111111111111111: 64, 0.4772727272727273: 58, 0.0: 17, 0.09090909090909091: 15})
 
No of bins: length
7
Entries per interval for length
Counter({0.054929577464788736: 5077, 0.0: 151, 0.12195121951219512: 116, 0.041666666666666664: 65, 0.05555555555555555: 28, 1.0: 24, 0.6666666666666666: 12})
 
No of bins: area
11
Entries per interval for area
Counter({0.03908794788273615: 4830, 0.16049382716049382: 354, 0.5272727272727272: 74, 0.2777777777777778: 47, 0.07692307692307693: 36, 0.18181818181818182: 35, 0.0: 28, 0.13333333333333333: 21, 0.2857142857142857: 19, 0.42857142857142855: 18, 0.375: 11})
 
No of bins: eccen
12
Entries per interval for eccen
Counter({0.03130041949015812: 4419, 0.002512562814070352: 565, 0.296875: 188, 0.046511627906976744: 64, 0.627906976744186:

In [24]:
# Ordinal encoding of the discretized frame
# (asarray is no longer used in this cell; the import is kept so the name stays
# available to any other cell that relies on it.)
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# Each column of `disc` holds one distinct value per bin. OrdinalEncoder replaces
# those values by integer codes 0..n_bins-1, assigned in ascending value order.
# NOTE(review): the codes follow the order of the bin values, which is not
# necessarily the order of the underlying feature intervals -- confirm this is intended.
encoder = OrdinalEncoder()
# Name the columns after `disc`, the frame that was actually encoded, so the
# labels cannot drift from the data. The new frame gets a fresh 0..n-1 index,
# so rows stay in the train-then-test order of `disc`, not the original file order.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)
# astype(int) raises on NaN, so a separate missing-value check is not needed.
disc_ord = result.astype(int)
print(disc_ord)

# Export the discretized, ordinal-encoded dataset
disc_ord.to_csv('DT_large_discretized_pageblock.csv',index=False)

        height   length      area     eccen   p_black     p_and   mean_tr  \
1815  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
5066  0.740113  0.05493  0.160494  0.296875  0.015759  0.310502  0.006018   
5297  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
2923  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
3172  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
...        ...      ...       ...       ...       ...       ...       ...   
1839  0.005283  0.05493  0.039088  0.031300  0.015759  0.047525  0.006018   
2106  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
1319  0.005283  0.05493  0.039088  0.002513  0.026042  0.077348  0.028302   
2972  0.005283  0.05493  0.039088  0.031300  0.015759  0.006985  0.006018   
3001  0.005283  0.05493  0.039088  0.031300  0.015759  0.047525  0.006018   

       blacpix  blackand  wb_trans  class  
1815  0.043033  0.034740  0.005

## 2.4 DT with extra large max_depth

In [25]:
# Build the decision-tree discretiser.
# 'max_depth': [5] => 2^5 = 32 intervals max.
import time

# Only the columns in `variables` (num_list) are discretized; X_train / X_test
# also contain 'class', which therefore appears untouched in the output.
# param_grid holds a single candidate, so the grid search has nothing to choose
# between and cv=3 only cross-validates that one setting.
start = time.time()  # start of the timed fit + transform section
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
# Stop the clock here so the reported time covers fitting and transforming only,
# not the console printing below.
end = time.time()

# Stack the transformed train and test rows into one frame: train rows first,
# each row keeping its original index label.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# Fitted per-variable binners stored on the discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

        height    length      area     eccen   p_black     p_and   mean_tr  \
1815  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
5066  0.740113  0.049708  0.186275  0.396552  0.015745  0.310502  0.003521   
5297  0.003233  0.049708  0.036258  0.068937  0.015873  0.006490  0.003521   
2923  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.007096   
3172  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.007096   
...        ...       ...       ...       ...       ...       ...       ...   
1839  0.003233  0.049708  0.036258  0.007388  0.015745  0.011111  0.003521   
2106  0.018433  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
1319  0.003233  0.049708  0.036258  0.002801  0.031847  0.059211  0.011299   
2972  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
3001  0.003233  0.049708  0.036258  0.007388  0.015745  0.011111  0.007096   

       blacpix  blackand  wb_trans  class  
1815  0.037821  0.0

In [26]:
# Summarise the discretization column by column. Every column of `disc` is
# visited, which includes 'class' as well as the discretized features.
for col_name in disc.columns:
    col_values = disc[col_name]
    # Number of distinct values, i.e. number of bins actually produced
    print('No of bins: ' + col_name)
    print(col_values.nunique())
    # How many rows fall into each distinct bin value
    print('Entries per interval for ' + col_name)
    print(Counter(col_values))
    print(' ')

No of bins: height
12
Entries per interval for height
Counter({0.003232758620689655: 3986, 0.018433179723502304: 622, 0.7401129943502824: 254, 0.0: 114, 0.027777777777777776: 101, 0.029411764705882353: 88, 0.8421052631578947: 87, 0.1111111111111111: 64, 0.4772727272727273: 58, 0.08333333333333333: 49, 0.045454545454545456: 35, 0.09090909090909091: 15})
 
No of bins: length
9
Entries per interval for length
Counter({0.049707602339181284: 3921, 0.07248157248157248: 1156, 0.0: 151, 0.078125: 93, 0.041666666666666664: 65, 0.05555555555555555: 28, 1.0: 24, 0.2777777777777778: 23, 0.6666666666666666: 12})
 
No of bins: area
15
Entries per interval for area
Counter({0.036257688572353515: 4416, 0.06944444444444445: 414, 0.18627450980392157: 296, 0.5: 80, 0.0: 60, 0.02564102564102564: 58, 0.22580645161290322: 41, 0.0625: 24, 0.13333333333333333: 21, 0.42857142857142855: 18, 0.09090909090909091: 13, 0.375: 11, 1.0: 11, 0.6: 6, 0.6666666666666666: 4})
 
No of bins: eccen
15
Entries per interval f

In [27]:
# Ordinal encoding of the discretized frame
# (asarray is no longer used in this cell; the import is kept so the name stays
# available to any other cell that relies on it.)
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# Each column of `disc` holds one distinct value per bin. OrdinalEncoder replaces
# those values by integer codes 0..n_bins-1, assigned in ascending value order.
# NOTE(review): the codes follow the order of the bin values, which is not
# necessarily the order of the underlying feature intervals -- confirm this is intended.
encoder = OrdinalEncoder()
# Name the columns after `disc`, the frame that was actually encoded, so the
# labels cannot drift from the data. The new frame gets a fresh 0..n-1 index,
# so rows stay in the train-then-test order of `disc`, not the original file order.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)
# astype(int) raises on NaN, so a separate missing-value check is not needed.
disc_ord = result.astype(int)
print(disc_ord)

# Export the discretized, ordinal-encoded dataset
disc_ord.to_csv('DT_verylarge_discretized_pageblock.csv',index=False)

        height    length      area     eccen   p_black     p_and   mean_tr  \
1815  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
5066  0.740113  0.049708  0.186275  0.396552  0.015745  0.310502  0.003521   
5297  0.003233  0.049708  0.036258  0.068937  0.015873  0.006490  0.003521   
2923  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.007096   
3172  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.007096   
...        ...       ...       ...       ...       ...       ...       ...   
1839  0.003233  0.049708  0.036258  0.007388  0.015745  0.011111  0.003521   
2106  0.018433  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
1319  0.003233  0.049708  0.036258  0.002801  0.031847  0.059211  0.011299   
2972  0.003233  0.049708  0.036258  0.007388  0.015745  0.006490  0.003521   
3001  0.003233  0.049708  0.036258  0.007388  0.015745  0.011111  0.007096   

       blacpix  blackand  wb_trans  class  
1815  0.037821  0.0