# Discretization of pre-processed data using Decision Tree discretization
## Dataset: phoneme

By: Sam
Update: 23/02/2023
Replicate using Malina script

### About Dataset

The raw dataset is in ARFF format and must be converted to CSV first (using this tool: https://pulipulichen.github.io/jieba-js/weka/arff2csv/).

Five different attributes were chosen to characterize each vowel: they are the amplitudes of the five first harmonics AHi, normalised by the total energy Ene (integrated on all the frequencies): AHi/Ene. The phonemes are transcribed as follows: sh as in she, dcl as in dark, iy as the vowel in she, aa as the vowel in dark, and ao as the first vowel in water.
=> All attributes are numeric.

The aim of the present database is to distinguish between nasal and oral vowels. There are thus two different classes:
- Class 0 : Nasals
- Class 1 : Orals

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [20]:
# Load the cleaned phoneme data that will be discretised.
data0 = pd.read_csv('clean_phoneme.csv')
# Drop the 'id' column in place (presumably a row identifier, not a feature).
data0.drop(columns=['id'], inplace=True)
# `phoneme` is a second name for the same frame; no copy is made.
phoneme = data0

In [21]:
# Preview the frame after 'id' has been dropped (bare last expression -> rich display).
phoneme

Unnamed: 0,V1,V2,V3,V4,V5,Class
0,0.489927,-0.451528,-1.047990,-0.598693,-0.020418,1
1,-0.641265,0.109245,0.292130,-0.916804,0.240223,1
2,0.870593,-0.459862,0.578159,0.806634,0.835248,1
3,-0.628439,-0.316284,1.934295,-1.427099,-0.136583,1
4,-0.596399,0.015938,2.043206,-1.688448,-0.948127,1
...,...,...,...,...,...,...
5399,-0.658318,1.331760,-0.081621,1.794253,-1.082181,1
5400,-0.044375,-0.010512,0.030989,-0.019379,1.281061,2
5401,0.246882,-0.793228,1.190101,1.423194,-1.303036,2
5402,-0.778907,-0.383111,1.727029,-1.432389,-1.208085,1


In [22]:
# Label-encode the target column.
from sklearn import preprocessing

# Use a lower-case name for the target column.
phoneme.rename(columns={'Class': 'class'}, inplace=True)

# LabelEncoder maps the sorted distinct labels to 0..n_classes-1; the rows
# displayed earlier show raw labels 1 and 2, which become 0 and 1.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(phoneme['class'])
phoneme['class'] = label_encoder.transform(phoneme['class'])

# Distinct encoded labels, as a sanity check.
phoneme['class'].unique()

array([0, 1])

In [23]:
# Start from every column name; the target column is filtered out afterwards.
num_list = list(phoneme.columns)


In [24]:
# Keep only the continuous feature columns. Rebuilding the list from the frame
# (instead of list.remove) keeps this cell idempotent: running it a second time
# no longer raises ValueError because 'class' was already removed.
num_list = [col for col in phoneme.columns if col != 'class']

In [25]:
# Target column as a one-column DataFrame.
y_list = phoneme['class'].to_frame()

In [26]:
# Confirm that only the feature columns remain once 'class' has been removed.
num_list

['V1', 'V2', 'V3', 'V4', 'V5']

In [27]:
# Only the last expression of a cell is displayed, so this cell shows y_list;
# the bare num_list line produces no output here.
num_list
y_list

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
5399,0
5400,1
5401,1
5402,0


# 2. Decision Tree discretization

In [None]:
# !pip install feature_engine

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser

In [29]:
# Alias the label-encoded phoneme frame as `data` (same object; nothing is re-read from disk).
data = phoneme
data

Unnamed: 0,V1,V2,V3,V4,V5,class
0,0.489927,-0.451528,-1.047990,-0.598693,-0.020418,0
1,-0.641265,0.109245,0.292130,-0.916804,0.240223,0
2,0.870593,-0.459862,0.578159,0.806634,0.835248,0
3,-0.628439,-0.316284,1.934295,-1.427099,-0.136583,0
4,-0.596399,0.015938,2.043206,-1.688448,-0.948127,0
...,...,...,...,...,...,...
5399,-0.658318,1.331760,-0.081621,1.794253,-1.082181,0
5400,-0.044375,-0.010512,0.030989,-0.019379,1.281061,1
5401,0.246882,-0.793228,1.190101,1.423194,-1.303036,1
5402,-0.778907,-0.383111,1.727029,-1.432389,-1.208085,0


In [30]:
# 70/30 train/test split with a fixed seed.
# The whole frame (including 'class') is passed as X, so the target column
# is carried along in X_train / X_test as well as in y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    data, data['class'], test_size=0.3, random_state=0
)

# DT scripts

In [31]:
# Work on the label-encoded phoneme frame.
data = phoneme

# 70/30 train/test split with a fixed seed. X is the whole frame, so the
# 'class' column travels with the features (hence 6 columns in each part).
X_train, X_test, y_train, y_test = train_test_split(
    data, data['class'], test_size=0.3, random_state=0
)

# Report the size of each part.
print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (3782, 6)
X_test : (1622, 6)


## 2.1 DT with small max_depth

In [32]:
# Decision-tree discretiser, small depth.
# 'max_depth': [2] => at most 2^2 = 4 leaves (intervals) per variable.
import time

# Time only fit + transform. perf_counter is a monotonic, high-resolution
# clock, and stopping it before the prints keeps console I/O out of the figure.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# Only the columns in num_list are transformed; 'class' passes through unchanged.
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
end = time.perf_counter()

# Stack the transformed train and test parts into one frame. Rows keep their
# original index labels but appear in train-then-test order.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# One fitted GridSearchCV (wrapping a DecisionTreeClassifier) per variable.
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent in fit + transform

            V1        V2        V3        V4        V5  class
3974  0.343898  0.117700  0.061708  0.047408  0.181390      0
2580  0.343898  0.117700  0.468303  0.297604  0.181390      0
2027  0.343898  0.462159  0.174142  0.047408  0.181390      0
4752  0.011869  0.462159  0.061708  0.047408  0.181390      0
4160  0.343898  0.117700  0.468303  0.047408  0.181390      0
...        ...       ...       ...       ...       ...    ...
4770  0.011869  0.462159  0.061708  0.047408  0.181390      0
803   0.428233  0.462159  0.468303  0.600168  0.593137      0
4563  0.011869  0.462159  0.061708  0.047408  0.181390      0
536   0.011869  0.233533  0.174142  0.047408  0.181390      0
3349  0.428233  0.221080  0.468303  0.600168  0.665370      1

[5404 rows x 6 columns]
DT discreizer binner dict:
{'V1': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=29),
             param_grid={'max_depth': [2]}, scoring='accuracy'), 'V2': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(rand

In [33]:
# Per-column summary of the discretised frame: the number of distinct values
# (the bins actually produced) and how many rows fall into each of them.
# 'class' is a column of disc too, so it is summarised as well.
for col in disc.columns:
    column = disc[col]
    print('No of bins: ' + col)
    print(column.nunique())
    # Counter maps each bin value to its row count (not to interval boundaries).
    print('Entries per interval for ' + col)
    print(Counter(column))
    print(' ')

No of bins: V1
4
Entries per interval for V1
Counter({0.34389782403027436: 3026, 0.42823250296559906: 1193, 0.011869436201780416: 957, 0.1456953642384106: 228})
 
No of bins: V2
4
Entries per interval for V2
Counter({0.4621588089330025: 2283, 0.11769991015274034: 1608, 0.23353293413173654: 958, 0.2210796915167095: 555})
 
No of bins: V3
4
Entries per interval for V3
Counter({0.46830265848670755: 2802, 0.06170752324598478: 1685, 0.1741424802110818: 538, 0.23863636363636365: 379})
 
No of bins: V4
4
Entries per interval for V4
Counter({0.04740834386852086: 2272, 0.6001676445934618: 1701, 0.29760403530895335: 1099, 0.4252336448598131: 332})
 
No of bins: V5
4
Entries per interval for V5
Counter({0.18139029688631428: 3952, 0.5931372549019608: 597, 0.5746478873239437: 505, 0.6653696498054474: 350})
 
No of bins: class
2
Entries per interval for class
Counter({0: 3818, 1: 1586})
 


In [34]:
# Ordinal-encode the discretised frame and export it.
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces each distinct value of a column with its position
# among that column's sorted distinct values, giving integer codes 0..k-1.
# The codes therefore follow the order of the discretiser's output values,
# which is not necessarily the order of the intervals on the original axis.
encoder = OrdinalEncoder()
# Column names come from the frame being encoded, so they cannot drift from it.
# The new frame gets a fresh 0..n-1 index: rows stay in disc's
# train-then-test order and the original row labels are not kept.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)

# Explicit missing-value check before the integer cast (the former bare
# `isna().sum()` expression was discarded and never displayed).
assert not result.isna().any().any(), 'discretised data contains missing values'

disc_ord = result.astype(int)
print(disc_ord)

# Export the discretised, ordinal-encoded dataset.
disc_ord.to_csv('DT_small_discretized_phoneme.csv', index=False)

            V1        V2        V3        V4        V5  class
3974  0.343898  0.117700  0.061708  0.047408  0.181390      0
2580  0.343898  0.117700  0.468303  0.297604  0.181390      0
2027  0.343898  0.462159  0.174142  0.047408  0.181390      0
4752  0.011869  0.462159  0.061708  0.047408  0.181390      0
4160  0.343898  0.117700  0.468303  0.047408  0.181390      0
...        ...       ...       ...       ...       ...    ...
4770  0.011869  0.462159  0.061708  0.047408  0.181390      0
803   0.428233  0.462159  0.468303  0.600168  0.593137      0
4563  0.011869  0.462159  0.061708  0.047408  0.181390      0
536   0.011869  0.233533  0.174142  0.047408  0.181390      0
3349  0.428233  0.221080  0.468303  0.600168  0.665370      1

[5404 rows x 6 columns]
      V1  V2  V3  V4  V5  class
0      2   0   0   0   0      0
1      2   0   3   1   0      0
2      2   3   1   0   0      0
3      0   3   0   0   0      0
4      2   0   3   0   0      0
...   ..  ..  ..  ..  ..    ...
5399   

In [35]:
disc_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   V1      5404 non-null   int64
 1   V2      5404 non-null   int64
 2   V3      5404 non-null   int64
 3   V4      5404 non-null   int64
 4   V5      5404 non-null   int64
 5   class   5404 non-null   int64
dtypes: int64(6)
memory usage: 253.4 KB


## 2.2 DT with medium max_depth

In [36]:
# Decision-tree discretiser, medium depth.
# 'max_depth': [3] => at most 2^3 = 8 leaves (intervals) per variable.
import time

# Time only fit + transform. perf_counter is a monotonic, high-resolution
# clock, and stopping it before the prints keeps console I/O out of the figure.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# Only the columns in num_list are transformed; 'class' passes through unchanged.
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
end = time.perf_counter()

# Stack the transformed train and test parts into one frame. Rows keep their
# original index labels but appear in train-then-test order.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# One fitted GridSearchCV (wrapping a DecisionTreeClassifier) per variable.
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent in fit + transform

            V1        V2        V3        V4        V5  class
3974  0.354664  0.127789  0.032720  0.019528  0.276423      0
2580  0.354664  0.127789  0.524496  0.287858  0.160793      0
2027  0.270370  0.300000  0.162602  0.144476  0.160793      0
4752  0.000000  0.500768  0.032720  0.019528  0.160793      0
4160  0.354664  0.127789  0.437401  0.019528  0.160793      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.500768  0.032720  0.019528  0.160793      0
803   0.393124  0.500768  0.437401  0.500000  0.725888      0
4563  0.000000  0.500768  0.032720  0.019528  0.160793      0
536   0.036036  0.225806  0.162602  0.019528  0.160793      0
3349  0.393124  0.197015  0.437401  0.616585  0.530435      1

[5404 rows x 6 columns]
DT discreizer binner dict:
{'V1': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=29),
             param_grid={'max_depth': [3]}, scoring='accuracy'), 'V2': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(rand

In [37]:
# Per-column summary of the discretised frame: the number of distinct values
# (the bins actually produced) and how many rows fall into each of them.
# 'class' is a column of disc too, so it is summarised as well.
for col in disc.columns:
    column = disc[col]
    print('No of bins: ' + col)
    print(column.nunique())
    # Counter maps each bin value to its row count (not to interval boundaries).
    print('Entries per interval for ' + col)
    print(Counter(column))
    print(' ')

No of bins: V1
8
Entries per interval for V1
Counter({0.3546637744034707: 2649, 0.3931240657698057: 959, 0.0: 645, 0.27037037037037037: 377, 0.036036036036036036: 312, 0.5632183908045977: 234, 0.1342281879194631: 226, 1.0: 2})
 
No of bins: V2
8
Entries per interval for V2
Counter({0.500768049155146: 1867, 0.12778904665314403: 1424, 0.22580645161290322: 937, 0.19701492537313434: 485, 0.3: 416, 0.03937007874015748: 184, 0.37037037037037035: 70, 0.5294117647058824: 21})
 
No of bins: V3
8
Entries per interval for V3
Counter({0.43740095087163233: 1775, 0.032719836400818: 1395, 0.5244956772334294: 1027, 0.16260162601626016: 527, 0.25203252032520324: 352, 0.2: 290, 0.05555555555555555: 27, 0.6: 11})
 
No of bins: V4
8
Entries per interval for V4
Counter({0.01952807160292921: 1766, 0.6165853658536585: 1463, 0.2878581173260573: 1026, 0.14447592067988668: 506, 0.41626794258373206: 324, 0.5: 238, 0.4166666666666667: 73, 0.8: 8})
 
No of bins: V5
8
Entries per interval for V5
Counter({0.16079295

In [38]:
# Ordinal-encode the discretised frame and export it.
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces each distinct value of a column with its position
# among that column's sorted distinct values, giving integer codes 0..k-1.
# The codes therefore follow the order of the discretiser's output values,
# which is not necessarily the order of the intervals on the original axis.
encoder = OrdinalEncoder()
# Column names come from the frame being encoded, so they cannot drift from it.
# The new frame gets a fresh 0..n-1 index: rows stay in disc's
# train-then-test order and the original row labels are not kept.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)

# Explicit missing-value check before the integer cast (the former bare
# `isna().sum()` expression was discarded and never displayed).
assert not result.isna().any().any(), 'discretised data contains missing values'

disc_ord = result.astype(int)
print(disc_ord)

# Export the discretised, ordinal-encoded dataset.
disc_ord.to_csv('DT_medium_discretized_phoneme.csv', index=False)

            V1        V2        V3        V4        V5  class
3974  0.354664  0.127789  0.032720  0.019528  0.276423      0
2580  0.354664  0.127789  0.524496  0.287858  0.160793      0
2027  0.270370  0.300000  0.162602  0.144476  0.160793      0
4752  0.000000  0.500768  0.032720  0.019528  0.160793      0
4160  0.354664  0.127789  0.437401  0.019528  0.160793      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.500768  0.032720  0.019528  0.160793      0
803   0.393124  0.500768  0.437401  0.500000  0.725888      0
4563  0.000000  0.500768  0.032720  0.019528  0.160793      0
536   0.036036  0.225806  0.162602  0.019528  0.160793      0
3349  0.393124  0.197015  0.437401  0.616585  0.530435      1

[5404 rows x 6 columns]
      V1  V2  V3  V4  V5  class
0      4   1   0   0   1      0
1      4   1   6   2   0      0
2      3   4   2   1   0      0
3      0   6   0   0   0      0
4      4   1   5   0   0      0
...   ..  ..  ..  ..  ..    ...
5399   

## 2.3 DT with large max_depth

In [39]:
# Decision-tree discretiser, large depth.
# 'max_depth': [4] => at most 2^4 = 16 leaves (intervals) per variable.
import time

# Time only fit + transform. perf_counter is a monotonic, high-resolution
# clock, and stopping it before the prints keeps console I/O out of the figure.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# Only the columns in num_list are transformed; 'class' passes through unchanged.
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
end = time.perf_counter()

# Stack the transformed train and test parts into one frame. Rows keep their
# original index labels but appear in train-then-test order.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# One fitted GridSearchCV (wrapping a DecisionTreeClassifier) per variable.
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent in fit + transform

            V1        V2        V3        V4        V5  class
3974  0.306649  0.142317  0.057823  0.012241  0.270492      0
2580  0.386631  0.142317  0.571765  0.292546  0.082317      0
2027  0.304762  0.295455  0.184466  0.142045  0.220840      0
4752  0.000000  0.461679  0.021930  0.012241  0.082317      0
4160  0.306649  0.142317  0.453789  0.012241  0.220840      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.461679  0.021930  0.012241  0.082317      0
803   0.406349  0.529178  0.453789  0.458904  0.770642      0
4563  0.000000  0.529178  0.021930  0.012241  0.220840      0
536   0.031674  0.166667  0.184466  0.012241  0.220840      0
3349  0.406349  0.237903  0.453789  0.612426  0.544643      1

[5404 rows x 6 columns]
DT discreizer binner dict:
{'V1': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=29),
             param_grid={'max_depth': [4]}, scoring='accuracy'), 'V2': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(rand

In [40]:
# Per-column summary of the discretised frame: the number of distinct values
# (the bins actually produced) and how many rows fall into each of them.
# 'class' is a column of disc too, so it is summarised as well.
for col in disc.columns:
    column = disc[col]
    print('No of bins: ' + col)
    print(column.nunique())
    # Counter maps each bin value to its row count (not to interval boundaries).
    print('Entries per interval for ' + col)
    print(Counter(column))
    print(' ')

No of bins: V1
13
Entries per interval for V1
Counter({0.3866305329719964: 1581, 0.3066485753052917: 1068, 0.40634920634920635: 900, 0.0: 645, 0.03167420814479638: 311, 0.3047619047619048: 288, 0.5244755244755245: 192, 0.16666666666666666: 149, 0.15: 89, 0.06382978723404255: 77, 0.1794871794871795: 59, 0.7419354838709677: 42, 1.0: 3})
 
No of bins: V2
16
Entries per interval for V2
Counter({0.14231738035264482: 1158, 0.5291777188328912: 1089, 0.46167883211678834: 778, 0.2631578947368421: 578, 0.29545454545454547: 414, 0.23790322580645162: 367, 0.16666666666666666: 359, 0.06770833333333333: 266, 0.08045977011494253: 118, 0.0: 112, 0.10416666666666667: 72, 0.47058823529411764: 47, 0.2: 23, 0.75: 11, 0.3333333333333333: 10, 1.0: 2})
 
No of bins: V3
16
Entries per interval for V3
Counter({0.4537892791127542: 1518, 0.021929824561403508: 977, 0.571764705882353: 629, 0.18446601941747573: 442, 0.05782312925170068: 418, 0.44981412639405205: 398, 0.24279835390946503: 349, 0.3388888888888889: 25

In [41]:
# Ordinal-encode the discretised frame and export it.
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces each distinct value of a column with its position
# among that column's sorted distinct values, giving integer codes 0..k-1.
# The codes therefore follow the order of the discretiser's output values,
# which is not necessarily the order of the intervals on the original axis.
encoder = OrdinalEncoder()
# Column names come from the frame being encoded, so they cannot drift from it.
# The new frame gets a fresh 0..n-1 index: rows stay in disc's
# train-then-test order and the original row labels are not kept.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)

# Explicit missing-value check before the integer cast (the former bare
# `isna().sum()` expression was discarded and never displayed).
assert not result.isna().any().any(), 'discretised data contains missing values'

disc_ord = result.astype(int)
print(disc_ord)

# Export the discretised, ordinal-encoded dataset.
disc_ord.to_csv('DT_large_discretized_phoneme.csv', index=False)

            V1        V2        V3        V4        V5  class
3974  0.306649  0.142317  0.057823  0.012241  0.270492      0
2580  0.386631  0.142317  0.571765  0.292546  0.082317      0
2027  0.304762  0.295455  0.184466  0.142045  0.220840      0
4752  0.000000  0.461679  0.021930  0.012241  0.082317      0
4160  0.306649  0.142317  0.453789  0.012241  0.220840      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.461679  0.021930  0.012241  0.082317      0
803   0.406349  0.529178  0.453789  0.458904  0.770642      0
4563  0.000000  0.529178  0.021930  0.012241  0.220840      0
536   0.031674  0.166667  0.184466  0.012241  0.220840      0
3349  0.406349  0.237903  0.453789  0.612426  0.544643      1

[5404 rows x 6 columns]
      V1  V2  V3  V4  V5  class
0      7   4   3   0   4      0
1      8   4  13   4   1      0
2      6   9   6   3   3      0
3      0  11   1   0   1      0
4      7   4  12   0   3      0
...   ..  ..  ..  ..  ..    ...
5399   

## 2.4 DT with extra large max_depth

In [42]:
# Decision-tree discretiser, extra-large depth.
# 'max_depth': [5] => at most 2^5 = 32 leaves (intervals) per variable.
import time

# Time only fit + transform. perf_counter is a monotonic, high-resolution
# clock, and stopping it before the prints keeps console I/O out of the figure.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# Only the columns in num_list are transformed; 'class' passes through unchanged.
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)
end = time.perf_counter()

# Stack the transformed train and test parts into one frame. Rows keep their
# original index labels but appear in train-then-test order.
disc = pd.concat([train_t, test_t], axis=0)
print(disc)

# One fitted GridSearchCV (wrapping a DecisionTreeClassifier) per variable.
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # seconds spent in fit + transform

            V1        V2        V3        V4        V5  class
3974  0.309166  0.141236  0.051546  0.001647  0.222222      0
2580  0.383288  0.141236  0.556728  0.291549  0.164319      0
2027  0.252427  0.251613  0.179153  0.127451  0.119403      0
4752  0.000000  0.467652  0.092308  0.026374  0.019713      0
4160  0.309166  0.141236  0.482801  0.001647  0.119403      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.467652  0.092308  0.001647  0.019713      0
803   0.413570  0.442553  0.482801  0.475177  0.742268      0
4563  0.000000  0.442553  0.014540  0.001647  0.119403      0
536   0.019868  0.172840  0.179153  0.026374  0.119403      0
3349  0.413570  0.212500  0.365672  0.606122  0.519608      1

[5404 rows x 6 columns]
DT discreizer binner dict:
{'V1': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=29),
             param_grid={'max_depth': [5]}, scoring='accuracy'), 'V2': GridSearchCV(cv=3, estimator=DecisionTreeClassifier(rand

In [43]:
# Per-column summary of the discretised frame: the number of distinct values
# (the bins actually produced) and how many rows fall into each of them.
# 'class' is a column of disc too, so it is summarised as well.
for col in disc.columns:
    column = disc[col]
    print('No of bins: ' + col)
    print(column.nunique())
    # Counter maps each bin value to its row count (not to interval boundaries).
    print('Entries per interval for ' + col)
    print(Counter(column))
    print(' ')

No of bins: V1
16
Entries per interval for V1
Counter({0.3832879200726612: 1572, 0.3091655266757866: 1060, 0.4135702746365105: 884, 0.0: 732, 0.019867549668874173: 215, 0.35514018691588783: 148, 0.15: 147, 0.5849056603773585: 146, 0.2524271844660194: 140, 0.05714285714285714: 96, 0.1836734693877551: 70, 0.23333333333333334: 50, 0.35135135135135137: 46, 0.125: 42, 0.7142857142857143: 36, 1.0: 20})
 
No of bins: V2
18
Entries per interval for V2
Counter({0.14123581336696092: 1157, 0.4676524953789279: 760, 0.5684007707129094: 759, 0.25757575757575757: 574, 0.2125: 353, 0.1728395061728395: 344, 0.4425531914893617: 330, 0.33986928104575165: 224, 0.0: 223, 0.08024691358024691: 222, 0.25161290322580643: 190, 0.1076923076923077: 84, 0.0851063829787234: 70, 0.4375: 44, 1.0: 33, 0.15789473684210525: 21, 0.14285714285714285: 8, 0.6666666666666666: 8})
 
No of bins: V3
22
Entries per interval for V3
Counter({0.4828009828009828: 1129, 0.014539579967689823: 891, 0.5567282321899736: 561, 0.1791530944

In [44]:
# Ordinal-encode the discretised frame and export it.
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# OrdinalEncoder replaces each distinct value of a column with its position
# among that column's sorted distinct values, giving integer codes 0..k-1.
# The codes therefore follow the order of the discretiser's output values,
# which is not necessarily the order of the intervals on the original axis.
encoder = OrdinalEncoder()
# Column names come from the frame being encoded, so they cannot drift from it.
# The new frame gets a fresh 0..n-1 index: rows stay in disc's
# train-then-test order and the original row labels are not kept.
result = pd.DataFrame(encoder.fit_transform(disc), columns=disc.columns)

# Explicit missing-value check before the integer cast (the former bare
# `isna().sum()` expression was discarded and never displayed).
assert not result.isna().any().any(), 'discretised data contains missing values'

disc_ord = result.astype(int)
print(disc_ord)

# Export the discretised, ordinal-encoded dataset.
disc_ord.to_csv('DT_verylarge_discretized_phoneme.csv', index=False)

            V1        V2        V3        V4        V5  class
3974  0.309166  0.141236  0.051546  0.001647  0.222222      0
2580  0.383288  0.141236  0.556728  0.291549  0.164319      0
2027  0.252427  0.251613  0.179153  0.127451  0.119403      0
4752  0.000000  0.467652  0.092308  0.026374  0.019713      0
4160  0.309166  0.141236  0.482801  0.001647  0.119403      0
...        ...       ...       ...       ...       ...    ...
4770  0.000000  0.467652  0.092308  0.001647  0.019713      0
803   0.413570  0.442553  0.482801  0.475177  0.742268      0
4563  0.000000  0.442553  0.014540  0.001647  0.119403      0
536   0.019868  0.172840  0.179153  0.026374  0.119403      0
3349  0.413570  0.212500  0.365672  0.606122  0.519608      1

[5404 rows x 6 columns]
      V1  V2  V3  V4  V5  class
0      8   4   3   1   4      0
1     11   4  17   9   3      0
2      7   9   8   4   2      0
3      0  14   5   2   1      0
4      8   4  15   1   2      0
...   ..  ..  ..  ..  ..    ...
5399   