# Discretization of pre-processed data using Decision Tree discretization
## Dataset: australia
By: Sam
Update: 13/10/2022
Reproduce datasets for bnaic, replicate using Malina script

### About Dataset
*Continuous attributes:*

- A2:	continuous.
- A3:	continuous.
- A7:	continuous.
- A10: continuous.
- A13: continuous.
- A14: continuous.

*Categorical attributes:*
- A1:	0,1    CATEGORICAL a,b
- A4:	1,2,3         CATEGORICAL p,g,gg
- A5:  1, 2,3,4,5, 6,7,8,9,10,11,12,13,14    CATEGORICAL ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x 
- A6:	 1, 2,3, 4,5,6,7,8,9    CATEGORICAL ff,dd,j,bb,v,n,o,h,z 
- A8:	1, 0       CATEGORICAL t, f.
- A9: 1, 0	    CATEGORICAL t, f.
- A11:  1, 0	    CATEGORICAL t, f.
- A12:    1, 2, 3    CATEGORICAL s, g, p 

*Label:*
- A15:   1,2    CATEGORICAL +,- (class attribute; it appears as column `label` with values 0 and 1 in the cleaned data shown below)

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [2]:
# Read the cleaned dataset that is going to be discretized.
data0 = pd.read_csv('clean_australia.csv')
# australia dataset: work on an independent copy so that later in-place
# edits of `aus` (e.g. re-encoding 'label') do not also modify `data0`.
aus = data0.copy()

In [3]:
# Preview the loaded frame (rendered as the cell output).
aus

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,label
0,1,22.08,11.460,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.000,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.750,1,4,4,1.250,0,0,0,1,2,280,1,0
3,0,21.67,11.500,1,5,3,0.000,1,1,11,1,2,0,1,1
4,1,20.17,8.170,2,6,4,1.960,1,1,14,0,2,60,159,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,31.57,10.500,2,14,4,6.500,1,0,0,0,2,0,1,1
686,1,20.67,0.415,2,8,4,0.125,0,0,0,0,2,0,45,0
687,0,18.83,9.540,2,6,4,0.085,1,0,0,0,2,100,1,1
688,0,27.42,14.500,2,14,8,3.085,1,1,1,0,2,120,12,1


In [4]:
# scikit-learn's preprocessing module provides LabelEncoder.
from sklearn import preprocessing

# Encoder that maps the distinct class values onto integer codes.
label_encoder = preprocessing.LabelEncoder()

# Learn the codes from column 'label', then overwrite the column with them.
label_encoder.fit(aus['label'])
aus['label'] = label_encoder.transform(aus['label'])

# Distinct encoded class values (rendered as the cell output).
aus['label'].unique()

array([0, 1], dtype=int64)

In [5]:
# Names of the continuous attributes that will be discretized.
num_list = [f"A{idx}" for idx in (2, 3, 7, 10, 13, 14)]

In [6]:
# Single-column frame holding the class label.
y_list = aus['label'].to_frame()

In [7]:
# Only the last expression of a cell is rendered, so print the column list
# explicitly and leave the label frame as the displayed result.
print(num_list)
y_list

Unnamed: 0,label
0,0
1,0
2,0
3,1
4,1
...,...
685,1
686,0
687,1
688,1


# 2. Decision Tree discretization

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser

In [9]:
# Use the australia frame as the working dataset.
# `data` is another name for the same object as `aus`, not a copy.
data = aus
# Preview the working frame (rendered as the cell output).
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,label
0,1,22.08,11.460,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.000,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.750,1,4,4,1.250,0,0,0,1,2,280,1,0
3,0,21.67,11.500,1,5,3,0.000,1,1,11,1,2,0,1,1
4,1,20.17,8.170,2,6,4,1.960,1,1,14,0,2,60,159,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,31.57,10.500,2,14,4,6.500,1,0,0,0,2,0,1,1
686,1,20.67,0.415,2,8,4,0.125,0,0,0,0,2,0,45,0
687,0,18.83,9.540,2,6,4,0.085,1,0,0,0,2,100,1,1
688,0,27.42,14.500,2,14,8,3.085,1,1,1,0,2,120,12,1


In [10]:
# Hold out 30% of the rows as a test set, with a fixed seed for repeatability.
# The whole frame is passed as X, so the 'label' column stays in X_train and
# X_test; y is that same 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['label'],
    random_state=0,
    test_size=0.3,
)

# DT scripts

In [14]:
# Use the australia frame as the working dataset.
data = aus

# 70/30 train/test partition with a fixed seed. X is the whole frame (the
# 'label' column stays in it) and y is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['label'],
    random_state=0,
    test_size=0.3,
)

# Report the (rows, columns) shape of each partition.
for caption, part in (("X_train :", X_train), ("X_test :", X_test)):
    print(caption, part.shape)

X_train : (483, 15)
X_test : (207, 15)


## 2.1 DT with small max_depth

In [15]:
# Build the decision-tree discretizer for the continuous attributes.
# 'max_depth': [2] => 2^2 = 4 intervals max.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,  # only these columns are discretized
                                   regression=False,    # classification trees
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

# Learn the bins on the training split only, then apply them to both splits.
treeDisc.fit(X_train, y_train)
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stop the clock before any printing so that console output is not counted
# as computation time.
end = time.perf_counter()

# Stack the transformed splits into one frame: training rows first, then
# test rows, each keeping its original row index.
# NOTE(review): the rows are therefore no longer in the original file order;
# add .sort_index() here if downstream code expects that order - confirm.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent on fit + transform

     A1        A2        A3  A4  A5  A6       A7  A8  A9       A10  A11  A12  \
596   1  0.429022  0.586207   1   8   4  0.64186   1   0  0.237226    0    1   
303   1  0.216216  0.586207   2   2   4  0.22500   0   0  0.237226    0    2   
204   0  0.429022  0.340659   2  11   8  0.64186   1   1  0.915094    0    2   
351   1  0.429022  0.586207   2   6   5  0.22500   0   0  0.237226    1    2   
118   1  0.429022  0.586207   2  11   4  0.64186   1   1  0.915094    1    2   
..   ..       ...       ...  ..  ..  ..      ...  ..  ..       ...  ...  ...   
188   0  0.429022  0.340659   2   9   4  0.22500   0   0  0.237226    1    2   
380   1  0.539216  0.586207   2   7   4  0.64186   1   1  0.915094    1    2   
369   1  0.429022  0.340659   2   7   4  0.64186   1   1  0.915094    0    2   
362   0  0.429022  0.340659   2   7   8  0.22500   0   1  0.481481    1    2   
90    1  0.429022  0.340659   2   4   8  0.22500   0   0  0.237226    1    1   

          A13       A14  label  
596  0

In [16]:
# Per-column summary of the frame: how many distinct values (bins) each
# column holds and how many rows carry each of them.
for column in disc.columns:
    values = disc[column]
    print('No of bins: ' + column)
    print(values.nunique())
    # Counter maps every distinct value to the number of rows that have it
    print('Entries per interval for ' + column)
    print(Counter(values))
    print(' ')

No of bins: A1
2
Entries per interval for A1
Counter({1: 468, 0: 222})
 
No of bins: A2
4
Entries per interval for A2
Counter({0.42902208201892744: 459, 0.5392156862745098: 141, 0.21621621621621623: 53, 0.7037037037037037: 37})
 
No of bins: A3
4
Entries per interval for A3
Counter({0.34065934065934067: 396, 0.5862068965517241: 236, 0.5384615384615384: 46, 0.9: 12})
 
No of bins: A4
3
Entries per interval for A4
Counter({2: 525, 1: 163, 3: 2})
 
No of bins: A5
14
Entries per interval for A5
Counter({8: 146, 11: 78, 9: 64, 3: 59, 6: 54, 1: 53, 4: 51, 13: 41, 14: 38, 7: 38, 2: 30, 10: 25, 5: 10, 12: 3})
 
No of bins: A6
8
Entries per interval for A6
Counter({4: 408, 8: 138, 5: 59, 1: 57, 3: 8, 9: 8, 2: 6, 7: 6})
 
No of bins: A7
4
Entries per interval for A7
Counter({0.641860465116279: 301, 0.225: 296, 0.4: 77, 1.0: 16})
 
No of bins: A8
2
Entries per interval for A8
Counter({1: 361, 0: 329})
 
No of bins: A9
2
Entries per interval for A9
Counter({0: 395, 1: 295})
 
No of bins: A10
4
Ent

In [17]:
# Ordinal-encode the discretized frame and export it.
from numpy import asarray  # import kept from the original cell; unused here
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces the sorted distinct values of each column with the
# integer codes 0..k-1.
# NOTE(review): every column of `disc` is re-coded, not only num_list, so the
# categorical attributes and 'label' get new codes too. For the discretized
# columns the codes follow the numeric order of the values written by the
# discretizer, which need not be the left-to-right order of the intervals -
# confirm that this is what downstream code expects.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc))
disc_ord = result.astype(int)
# Take the column names from the frame that was actually encoded, so names
# and encoded columns cannot drift apart.
disc_ord.columns = disc.columns
print(disc_ord)

# Export the discretized, ordinal-encoded dataset (row index is not written).
disc_ord.to_csv('DT_small_discretized_aus.csv', index=False)

# Missing-value count per column; placed last so the notebook renders it.
disc_ord.isna().sum()

     A1        A2        A3  A4  A5  A6       A7  A8  A9       A10  A11  A12  \
596   1  0.429022  0.586207   1   8   4  0.64186   1   0  0.237226    0    1   
303   1  0.216216  0.586207   2   2   4  0.22500   0   0  0.237226    0    2   
204   0  0.429022  0.340659   2  11   8  0.64186   1   1  0.915094    0    2   
351   1  0.429022  0.586207   2   6   5  0.22500   0   0  0.237226    1    2   
118   1  0.429022  0.586207   2  11   4  0.64186   1   1  0.915094    1    2   
..   ..       ...       ...  ..  ..  ..      ...  ..  ..       ...  ...  ...   
188   0  0.429022  0.340659   2   9   4  0.22500   0   0  0.237226    1    2   
380   1  0.539216  0.586207   2   7   4  0.64186   1   1  0.915094    1    2   
369   1  0.429022  0.340659   2   7   4  0.64186   1   1  0.915094    0    2   
362   0  0.429022  0.340659   2   7   8  0.22500   0   1  0.481481    1    2   
90    1  0.429022  0.340659   2   4   8  0.22500   0   0  0.237226    1    1   

          A13       A14  label  
596  0

In [21]:
# Summarise the encoded frame: index range, column dtypes and non-null counts.
disc_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      690 non-null    int32
 1   A2      690 non-null    int32
 2   A3      690 non-null    int32
 3   A4      690 non-null    int32
 4   A5      690 non-null    int32
 5   A6      690 non-null    int32
 6   A7      690 non-null    int32
 7   A8      690 non-null    int32
 8   A9      690 non-null    int32
 9   A10     690 non-null    int32
 10  A11     690 non-null    int32
 11  A12     690 non-null    int32
 12  A13     690 non-null    int32
 13  A14     690 non-null    int32
 14  label   690 non-null    int32
dtypes: int32(15)
memory usage: 40.6 KB


## 2.2 DT with medium max_depth

In [22]:
# Build the decision-tree discretizer for the continuous attributes.
# 'max_depth': [3] => 2^3 = 8 intervals max.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,  # only these columns are discretized
                                   regression=False,    # classification trees
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

# Learn the bins on the training split only, then apply them to both splits.
treeDisc.fit(X_train, y_train)
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stop the clock before any printing so that console output is not counted
# as computation time.
end = time.perf_counter()

# Stack the transformed splits into one frame: training rows first, then
# test rows, each keeping its original row index.
# NOTE(review): the rows are therefore no longer in the original file order;
# add .sort_index() here if downstream code expects that order - confirm.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent on fit + transform

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.475962  0.613333   1   8   4  0.608696   1   0  0.237226    0    1   
303   1  0.000000  0.613333   2   2   4  0.179688   0   0  0.237226    0    2   
204   0  0.475962  0.344444   2  11   8  0.608696   1   1  0.880000    0    2   
351   1  0.475962  0.613333   2   6   5  0.179688   0   0  0.237226    1    2   
118   1  0.475962  0.613333   2  11   4  0.608696   1   1  0.946429    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.339450  0.344444   2   9   4  0.179688   0   0  0.237226    1    2   
380   1  0.583333  0.613333   2   7   4  0.608696   1   1  0.946429    1    2   
369   1  0.339450  0.344444   2   7   4  0.701299   1   1  0.946429    0    2   
362   0  0.475962  0.344444   2   7   8  0.179688   0   1  0.517857    1    2   
90    1  0.475962  0.344444   2   4   8  0.179688   0   0  0.237226    1    1   

          A13       A14  la

In [23]:
# Per-column summary of the frame: how many distinct values (bins) each
# column holds and how many rows carry each of them.
for column in disc.columns:
    values = disc[column]
    print('No of bins: ' + column)
    print(values.nunique())
    # Counter maps every distinct value to the number of rows that have it
    print('Entries per interval for ' + column)
    print(Counter(values))
    print(' ')

No of bins: A1
2
Entries per interval for A1
Counter({1: 468, 0: 222})
 
No of bins: A2
8
Entries per interval for A2
Counter({0.47596153846153844: 303, 0.3394495412844037: 156, 0.5833333333333334: 114, 0.2857142857142857: 39, 0.782608695652174: 30, 0.3333333333333333: 27, 0.0: 14, 0.25: 7})
 
No of bins: A3
7
Entries per interval for A3
Counter({0.34444444444444444: 391, 0.6133333333333333: 197, 0.52: 43, 0.4166666666666667: 39, 1.0: 13, 0.0: 5, 0.5: 2})
 
No of bins: A4
3
Entries per interval for A4
Counter({2: 525, 1: 163, 3: 2})
 
No of bins: A5
14
Entries per interval for A5
Counter({8: 146, 11: 78, 9: 64, 3: 59, 6: 54, 1: 53, 4: 51, 13: 41, 14: 38, 7: 38, 2: 30, 10: 25, 5: 10, 12: 3})
 
No of bins: A6
8
Entries per interval for A6
Counter({4: 408, 8: 138, 5: 59, 1: 57, 3: 8, 9: 8, 2: 6, 7: 6})
 
No of bins: A7
6
Entries per interval for A7
Counter({0.6086956521739131: 198, 0.1796875: 193, 0.3055555555555556: 103, 0.7012987012987013: 103, 0.37735849056603776: 75, 1.0: 18})
 
No of

In [24]:
# Ordinal-encode the discretized frame and export it.
from numpy import asarray  # import kept from the original cell; unused here
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces the sorted distinct values of each column with the
# integer codes 0..k-1.
# NOTE(review): every column of `disc` is re-coded, not only num_list, so the
# categorical attributes and 'label' get new codes too. For the discretized
# columns the codes follow the numeric order of the values written by the
# discretizer, which need not be the left-to-right order of the intervals -
# confirm that this is what downstream code expects.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc))
disc_ord = result.astype(int)
# Take the column names from the frame that was actually encoded, so names
# and encoded columns cannot drift apart.
disc_ord.columns = disc.columns
print(disc_ord)

# Export the discretized, ordinal-encoded dataset (row index is not written).
disc_ord.to_csv('DT_medium_discretized_aus.csv', index=False)

# Missing-value count per column; placed last so the notebook renders it.
disc_ord.isna().sum()

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.475962  0.613333   1   8   4  0.608696   1   0  0.237226    0    1   
303   1  0.000000  0.613333   2   2   4  0.179688   0   0  0.237226    0    2   
204   0  0.475962  0.344444   2  11   8  0.608696   1   1  0.880000    0    2   
351   1  0.475962  0.613333   2   6   5  0.179688   0   0  0.237226    1    2   
118   1  0.475962  0.613333   2  11   4  0.608696   1   1  0.946429    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.339450  0.344444   2   9   4  0.179688   0   0  0.237226    1    2   
380   1  0.583333  0.613333   2   7   4  0.608696   1   1  0.946429    1    2   
369   1  0.339450  0.344444   2   7   4  0.701299   1   1  0.946429    0    2   
362   0  0.475962  0.344444   2   7   8  0.179688   0   1  0.517857    1    2   
90    1  0.475962  0.344444   2   4   8  0.179688   0   0  0.237226    1    1   

          A13       A14  la

## 2.3 DT with large max_depth

In [25]:
# Build the decision-tree discretizer for the continuous attributes.
# 'max_depth': [4] => 2^4 = 16 intervals max.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,  # only these columns are discretized
                                   regression=False,    # classification trees
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

# Learn the bins on the training split only, then apply them to both splits.
treeDisc.fit(X_train, y_train)
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stop the clock before any printing so that console output is not counted
# as computation time.
end = time.perf_counter()

# Stack the transformed splits into one frame: training rows first, then
# test rows, each keeping its original row index.
# NOTE(review): the rows are therefore no longer in the original file order;
# add .sort_index() here if downstream code expects that order - confirm.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent on fit + transform

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.564103  0.608108   1   8   4  0.635514   1   0  0.237226    0    1   
303   1  0.000000  0.608108   2   2   4  0.000000   0   0  0.237226    0    2   
204   0  0.455621  0.333333   2  11   8  0.635514   1   1  0.952381    0    2   
351   1  0.455621  0.608108   2   6   5  0.190083   0   0  0.237226    1    2   
118   1  0.455621  0.608108   2  11   4  0.635514   1   1  0.928571    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.289474  0.333333   2   9   4  0.190083   0   0  0.237226    1    2   
380   1  0.523810  0.608108   2   7   4  0.635514   1   1  0.928571    1    2   
369   1  0.289474  0.333333   2   7   4  0.746032   1   1  0.928571    0    2   
362   0  0.455621  0.333333   2   7   8  0.190083   0   1  0.517857    1    2   
90    1  0.455621  0.333333   2   4   8  0.190083   0   0  0.237226    1    1   

          A13       A14  la

In [26]:
# Per-column summary of the frame: how many distinct values (bins) each
# column holds and how many rows carry each of them.
for column in disc.columns:
    values = disc[column]
    print('No of bins: ' + column)
    print(values.nunique())
    # Counter maps every distinct value to the number of rows that have it
    print('Entries per interval for ' + column)
    print(Counter(values))
    print(' ')

No of bins: A1
2
Entries per interval for A1
Counter({1: 468, 0: 222})
 
No of bins: A2
12
Entries per interval for A2
Counter({0.4556213017751479: 244, 0.2894736842105263: 105, 0.5238095238095238: 88, 0.5641025641025641: 59, 0.45454545454545453: 51, 0.23076923076923078: 36, 0.7619047619047619: 26, 0.7058823529411765: 24, 0.4: 23, 0.0: 21, 1.0: 9, 0.5: 4})
 
No of bins: A3
8
Entries per interval for A3
Counter({0.3333333333333333: 368, 0.6081081081081081: 195, 0.47619047619047616: 35, 0.5: 26, 0.5833333333333334: 23, 0.5555555555555556: 17, 1.0: 16, 0.0: 10})
 
No of bins: A4
3
Entries per interval for A4
Counter({2: 525, 1: 163, 3: 2})
 
No of bins: A5
14
Entries per interval for A5
Counter({8: 146, 11: 78, 9: 64, 3: 59, 6: 54, 1: 53, 4: 51, 13: 41, 14: 38, 7: 38, 2: 30, 10: 25, 5: 10, 12: 3})
 
No of bins: A6
8
Entries per interval for A6
Counter({4: 408, 8: 138, 5: 59, 1: 57, 3: 8, 9: 8, 2: 6, 7: 6})
 
No of bins: A7
10
Entries per interval for A7
Counter({0.19008264462809918: 181, 

In [27]:
# Ordinal-encode the discretized frame and export it.
from numpy import asarray  # import kept from the original cell; unused here
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces the sorted distinct values of each column with the
# integer codes 0..k-1.
# NOTE(review): every column of `disc` is re-coded, not only num_list, so the
# categorical attributes and 'label' get new codes too. For the discretized
# columns the codes follow the numeric order of the values written by the
# discretizer, which need not be the left-to-right order of the intervals -
# confirm that this is what downstream code expects.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc))
disc_ord = result.astype(int)
# Take the column names from the frame that was actually encoded, so names
# and encoded columns cannot drift apart.
disc_ord.columns = disc.columns
print(disc_ord)

# Export the discretized, ordinal-encoded dataset (row index is not written).
disc_ord.to_csv('DT_large_discretized_aus.csv', index=False)

# Missing-value count per column; placed last so the notebook renders it.
disc_ord.isna().sum()

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.564103  0.608108   1   8   4  0.635514   1   0  0.237226    0    1   
303   1  0.000000  0.608108   2   2   4  0.000000   0   0  0.237226    0    2   
204   0  0.455621  0.333333   2  11   8  0.635514   1   1  0.952381    0    2   
351   1  0.455621  0.608108   2   6   5  0.190083   0   0  0.237226    1    2   
118   1  0.455621  0.608108   2  11   4  0.635514   1   1  0.928571    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.289474  0.333333   2   9   4  0.190083   0   0  0.237226    1    2   
380   1  0.523810  0.608108   2   7   4  0.635514   1   1  0.928571    1    2   
369   1  0.289474  0.333333   2   7   4  0.746032   1   1  0.928571    0    2   
362   0  0.455621  0.333333   2   7   8  0.190083   0   1  0.517857    1    2   
90    1  0.455621  0.333333   2   4   8  0.190083   0   0  0.237226    1    1   

          A13       A14  la

## 2.4 DT with extra large max_depth

In [28]:
# Build the decision-tree discretizer for the continuous attributes.
# 'max_depth': [5] => 2^5 = 32 intervals max.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,  # only these columns are discretized
                                   regression=False,    # classification trees
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

# Learn the bins on the training split only, then apply them to both splits.
treeDisc.fit(X_train, y_train)
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stop the clock before any printing so that console output is not counted
# as computation time.
end = time.perf_counter()

# Stack the transformed splits into one frame: training rows first, then
# test rows, each keeping its original row index.
# NOTE(review): the rows are therefore no longer in the original file order;
# add .sort_index() here if downstream code expects that order - confirm.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent on fit + transform

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.857143  0.640449   1   8   4  0.621359   1   0  0.237226    0    1   
303   1  0.000000  0.559322   2   2   4  0.000000   0   0  0.237226    0    2   
204   0  0.527027  0.357664   2  11   8  0.621359   1   1  0.973684    0    2   
351   1  0.527027  0.640449   2   6   5  0.157895   0   0  0.237226    1    2   
118   1  0.400000  0.640449   2  11   4  0.621359   1   1  0.935484    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.305556  0.305785   2   9   4  0.157895   0   0  0.237226    1    2   
380   1  0.508197  0.559322   2   7   4  0.621359   1   1  0.909091    1    2   
369   1  0.305556  0.357664   2   7   4  0.711538   1   1  0.935484    0    2   
362   0  0.527027  0.305785   2   7   8  0.157895   0   1  0.517857    1    2   
90    1  0.527027  0.305785   2   4   8  0.157895   0   0  0.237226    1    1   

          A13       A14  la

In [29]:
# Per-column summary of the frame: how many distinct values (bins) each
# column holds and how many rows carry each of them.
for column in disc.columns:
    values = disc[column]
    print('No of bins: ' + column)
    print(values.nunique())
    # Counter maps every distinct value to the number of rows that have it
    print('Entries per interval for ' + column)
    print(Counter(values))
    print(' ')

No of bins: A1
2
Entries per interval for A1
Counter({1: 468, 0: 222})
 
No of bins: A2
14
Entries per interval for A2
Counter({0.4: 171, 0.527027027027027: 111, 0.3055555555555556: 99, 0.5081967213114754: 83, 0.41379310344827586: 47, 0.0: 31, 0.15789473684210525: 25, 1.0: 23, 0.35714285714285715: 22, 0.8571428571428571: 21, 0.8: 21, 0.6875: 21, 0.42857142857142855: 11, 0.75: 4})
 
No of bins: A3
11
Entries per interval for A3
Counter({0.35766423357664234: 198, 0.30578512396694213: 170, 0.6404494382022472: 117, 0.559322033898305: 78, 0.5: 33, 0.5333333333333333: 25, 0.5625: 24, 1.0: 18, 0.0: 11, 0.2: 11, 0.6666666666666666: 5})
 
No of bins: A4
3
Entries per interval for A4
Counter({2: 525, 1: 163, 3: 2})
 
No of bins: A5
14
Entries per interval for A5
Counter({8: 146, 11: 78, 9: 64, 3: 59, 6: 54, 1: 53, 4: 51, 13: 41, 14: 38, 7: 38, 2: 30, 10: 25, 5: 10, 12: 3})
 
No of bins: A6
8
Entries per interval for A6
Counter({4: 408, 8: 138, 5: 59, 1: 57, 3: 8, 9: 8, 2: 6, 7: 6})
 
No of bins:

In [30]:
# Ordinal-encode the discretized frame and export it.
from numpy import asarray  # import kept from the original cell; unused here
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces the sorted distinct values of each column with the
# integer codes 0..k-1.
# NOTE(review): every column of `disc` is re-coded, not only num_list, so the
# categorical attributes and 'label' get new codes too. For the discretized
# columns the codes follow the numeric order of the values written by the
# discretizer, which need not be the left-to-right order of the intervals -
# confirm that this is what downstream code expects.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc))
disc_ord = result.astype(int)
# Take the column names from the frame that was actually encoded, so names
# and encoded columns cannot drift apart.
disc_ord.columns = disc.columns
print(disc_ord)

# Export the discretized, ordinal-encoded dataset (row index is not written).
disc_ord.to_csv('DT_verylarge_discretized_aus.csv', index=False)

# Missing-value count per column; placed last so the notebook renders it.
disc_ord.isna().sum()

     A1        A2        A3  A4  A5  A6        A7  A8  A9       A10  A11  A12  \
596   1  0.857143  0.640449   1   8   4  0.621359   1   0  0.237226    0    1   
303   1  0.000000  0.559322   2   2   4  0.000000   0   0  0.237226    0    2   
204   0  0.527027  0.357664   2  11   8  0.621359   1   1  0.973684    0    2   
351   1  0.527027  0.640449   2   6   5  0.157895   0   0  0.237226    1    2   
118   1  0.400000  0.640449   2  11   4  0.621359   1   1  0.935484    1    2   
..   ..       ...       ...  ..  ..  ..       ...  ..  ..       ...  ...  ...   
188   0  0.305556  0.305785   2   9   4  0.157895   0   0  0.237226    1    2   
380   1  0.508197  0.559322   2   7   4  0.621359   1   1  0.909091    1    2   
369   1  0.305556  0.357664   2   7   4  0.711538   1   1  0.935484    0    2   
362   0  0.527027  0.305785   2   7   8  0.157895   0   1  0.517857    1    2   
90    1  0.527027  0.305785   2   4   8  0.157895   0   0  0.237226    1    1   

          A13       A14  la