# Discretization of pre-processed data using Decision Tree discretization
## Dataset: musk
By: Sam
Update: 23/02/2023
Replicate using Malina script

### About Dataset
Number of Instances  6,598

Number of Attributes 168 plus the class.

For Each Attribute:
   Attribute:           Description:
   - molecule_name: Symbolic name of each molecule.  Musks have names such as MUSK-188.  Non-musks have names such as NON-MUSK-jp13.
   - conformation_name: Symbolic name of each conformation.  
   
   - f1 through f162: These are "distance features" along rays (see paper cited above).  The distances are measured in hundredths of Angstroms. any experiments withthe data should treat these feature values as lying on an arbitrary continuous scale.  In particular, the algorithm should not make any use of the zero point or the sign of each feature value. 
   - f163: This is the distance of the oxygen atom in the molecule to a designated point in 3-space. This is also called OXY-DIS.
   - f164: OXY-X: X-displacement from the designated point.
   - f165: OXY-Y: Y-displacement from the designated point.
   - f166: OXY-Z: Z-displacement from the designated
                        point. 
   
   class:               0 => non-musk, 1 => musk

   Please note that the molecule_name and conformation_name attributes
   should not be used to predict the class.

Missing Attribute Values: none.

Class Distribution: 
   Musks:     39
   Non-musks: 63

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [2]:
# Read clean dataset for discretization
data0 = pd.read_csv('clean_musk.csv')
#musk dataset
musk = data0

In [3]:
musk

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,-161,...,-308,52,-7,39,126,156,-50,-112,96,1.0
1,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,-171,...,-59,-2,52,103,136,169,-61,-136,79,1.0
2,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,-168,...,-134,-154,57,143,142,165,-67,-145,39,1.0
3,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,136,168,-60,-135,80,1.0
4,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,137,168,-60,-135,80,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593,NON-MUSK-jp13,jp13_2+5,51,-123,-23,-108,-117,134,-160,82,...,-66,164,-14,-29,107,171,-44,-115,118,0.0
6594,NON-MUSK-jp13,jp13_2+6,44,-104,-19,-105,-117,142,-165,68,...,-51,166,-9,150,129,158,-66,-144,-5,0.0
6595,NON-MUSK-jp13,jp13_2+7,44,-102,-19,-104,-117,72,-165,65,...,90,117,-8,150,130,159,-66,-144,-6,0.0
6596,NON-MUSK-jp13,jp13_2+8,51,-121,-23,-106,-117,63,-161,79,...,86,99,-14,-31,106,171,-44,-116,117,0.0


In [4]:
# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
musk['class']= label_encoder.fit_transform(musk['class'])
  
musk['class'].unique()

array([1, 0])

In [15]:
# List of continuous feature to discretize
num_list = musk.columns.to_list()


In [16]:
num_list.remove('molecule_name')
num_list.remove('conformation_name')
num_list.remove('class')

In [17]:
y_list = pd.DataFrame(musk['class'])

In [18]:
num_list

['f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34',
 'f35',
 'f36',
 'f37',
 'f38',
 'f39',
 'f40',
 'f41',
 'f42',
 'f43',
 'f44',
 'f45',
 'f46',
 'f47',
 'f48',
 'f49',
 'f50',
 'f51',
 'f52',
 'f53',
 'f54',
 'f55',
 'f56',
 'f57',
 'f58',
 'f59',
 'f60',
 'f61',
 'f62',
 'f63',
 'f64',
 'f65',
 'f66',
 'f67',
 'f68',
 'f69',
 'f70',
 'f71',
 'f72',
 'f73',
 'f74',
 'f75',
 'f76',
 'f77',
 'f78',
 'f79',
 'f80',
 'f81',
 'f82',
 'f83',
 'f84',
 'f85',
 'f86',
 'f87',
 'f88',
 'f89',
 'f90',
 'f91',
 'f92',
 'f93',
 'f94',
 'f95',
 'f96',
 'f97',
 'f98',
 'f99',
 'f100',
 'f101',
 'f102',
 'f103',
 'f104',
 'f105',
 'f106',
 'f107',
 'f108',
 'f109',
 'f110',
 'f111',
 'f112',
 'f113',
 'f114',
 'f115',
 'f116',
 'f117',
 'f118',
 'f119',
 'f120',
 'f121',
 'f122',
 'f123',
 

In [None]:
num_list
y_list

# 3. Decision Tree discretization

In [None]:
# !pip install feature_engine

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser

In [20]:
# Load dataset
data = musk
data

Unnamed: 0,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,f8,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,-161,...,-308,52,-7,39,126,156,-50,-112,96,1
1,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,-171,...,-59,-2,52,103,136,169,-61,-136,79,1
2,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,-168,...,-134,-154,57,143,142,165,-67,-145,39,1
3,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,136,168,-60,-135,80,1
4,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,137,168,-60,-135,80,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593,NON-MUSK-jp13,jp13_2+5,51,-123,-23,-108,-117,134,-160,82,...,-66,164,-14,-29,107,171,-44,-115,118,0
6594,NON-MUSK-jp13,jp13_2+6,44,-104,-19,-105,-117,142,-165,68,...,-51,166,-9,150,129,158,-66,-144,-5,0
6595,NON-MUSK-jp13,jp13_2+7,44,-102,-19,-104,-117,72,-165,65,...,90,117,-8,150,130,159,-66,-144,-6,0
6596,NON-MUSK-jp13,jp13_2+8,51,-121,-23,-106,-117,63,-161,79,...,86,99,-14,-31,106,171,-44,-116,117,0


In [21]:
# Separate into train and test sets
X_train, X_test, y_train, y_test =  train_test_split(
            data,
            data['class'], test_size=0.3, random_state=0)

# DT scripts

In [22]:
#load data
data = musk
# let's separate into training and testing set
# Separate into train and test sets
X_train, X_test, y_train, y_test =  train_test_split(
            data,
            data['class'], test_size=0.3, random_state=0)

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (4618, 169)
X_test : (1980, 169)


## 2.1 DT with small max_depth

In [23]:
#make DT discreizer
# 'max_depth': [2] => 2^2 = 4 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.178614  0.309365  0.385045  0.034687   
3951   NON-MUSK-327           327_1+2  0.178614  0.158569  0.132381  0.034687   
4086  NON-MUSK-f146        f146_1+162  0.007792  0.158569  0.132381  0.145009   
428        MUSK-217          217_2+86  0.178614  0.309365  0.385045  0.145009   
891        MUSK-j51           j51_2+4  0.090796  0.158569  0.385045  0.034687   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.178614  0.044467  0.013233  0.371281   
655        MUSK-287           287_1+4  0.178614  0.158569  0.132381  0.371281   
1013      MUSK-jf78          jf78_1+3  0.178614  0.158569  0.132381  0.371281   
3378   NON-MUSK-286          286_2+13  0.178614  0.158569  0.132381  0.034687   
4001   NON-MUSK-362           362_1+2  0.178614  0.044467  0.385045  0.145009   

            f5        f6   

In [24]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: molecule_name
102
Entries per interval for molecule_name
Counter({'NON-MUSK-j146': 1044, 'NON-MUSK-252': 1010, 'NON-MUSK-j147': 911, 'NON-MUSK-f146': 383, 'NON-MUSK-288': 344, 'NON-MUSK-192': 286, 'NON-MUSK-197': 277, 'MUSK-217': 215, 'NON-MUSK-326': 141, 'NON-MUSK-251': 140, 'NON-MUSK-216': 135, 'NON-MUSK-233': 104, 'NON-MUSK-270': 83, 'MUSK-240': 82, 'MUSK-213': 78, 'MUSK-215': 73, 'MUSK-287': 64, 'NON-MUSK-f209': 63, 'NON-MUSK-jp13': 63, 'NON-MUSK-286': 59, 'NON-MUSK-220': 53, 'NON-MUSK-207': 48, 'MUSK-f158': 43, 'NON-MUSK-244': 40, 'MUSK-f152': 36, 'NON-MUSK-232': 34, 'MUSK-jf67': 32, 'MUSK-212': 31, 'NON-MUSK-289': 29, 'MUSK-256': 29, 'NON-MUSK-249': 29, 'NON-MUSK-f164': 27, 'MUSK-214': 27, 'MUSK-j51': 25, 'NON-MUSK-210': 25, 'MUSK-jf58': 22, 'NON-MUSK-199': 21, 'MUSK-jf59': 20, 'NON-MUSK-208': 20, 'MUSK-211': 19, 'NON-MUSK-j90': 18, 'MUSK-273': 18, 'MUSK-228': 17, 'MUSK-322': 16, 'MUSK-284': 16, 'NON-MUSK-361': 16, 'MUSK-314': 16, 'NON-MUSK-362': 16, 'MUSK-333': 16, '

In [25]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = musk.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_small_discretized_musk.csv',index=False)

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.178614  0.309365  0.385045  0.034687   
3951   NON-MUSK-327           327_1+2  0.178614  0.158569  0.132381  0.034687   
4086  NON-MUSK-f146        f146_1+162  0.007792  0.158569  0.132381  0.145009   
428        MUSK-217          217_2+86  0.178614  0.309365  0.385045  0.145009   
891        MUSK-j51           j51_2+4  0.090796  0.158569  0.385045  0.034687   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.178614  0.044467  0.013233  0.371281   
655        MUSK-287           287_1+4  0.178614  0.158569  0.132381  0.371281   
1013      MUSK-jf78          jf78_1+3  0.178614  0.158569  0.132381  0.371281   
3378   NON-MUSK-286          286_2+13  0.178614  0.158569  0.132381  0.034687   
4001   NON-MUSK-362           362_1+2  0.178614  0.044467  0.385045  0.145009   

            f5        f6   

In [26]:
disc_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 169 entries, molecule_name to class
dtypes: int64(169)
memory usage: 8.5 MB


## 2.2 DT with medium max_depth

In [27]:
#make DT discreizer
# 'max_depth': [3] => 2^3 = 8 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.165839  0.282648  0.262058  0.021964   
3951   NON-MUSK-327           327_1+2  0.287324  0.142433  0.099155  0.021964   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.213974  0.099155  0.176793   
428        MUSK-217          217_2+86  0.165839  0.692308  0.664234  0.176793   
891        MUSK-j51           j51_2+4  0.065876  0.142433  0.262058  0.021964   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.165839  0.011211  0.011673  0.361589   
655        MUSK-287           287_1+4  0.165839  0.142433  0.314634  0.361589   
1013      MUSK-jf78          jf78_1+3  0.165839  0.142433  0.099155  0.361589   
3378   NON-MUSK-286          286_2+13  0.165839  0.142433  0.099155  0.021964   
4001   NON-MUSK-362           362_1+2  0.287324  0.011211  0.262058  0.176793   

            f5        f6   

In [28]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: molecule_name
102
Entries per interval for molecule_name
Counter({'NON-MUSK-j146': 1044, 'NON-MUSK-252': 1010, 'NON-MUSK-j147': 911, 'NON-MUSK-f146': 383, 'NON-MUSK-288': 344, 'NON-MUSK-192': 286, 'NON-MUSK-197': 277, 'MUSK-217': 215, 'NON-MUSK-326': 141, 'NON-MUSK-251': 140, 'NON-MUSK-216': 135, 'NON-MUSK-233': 104, 'NON-MUSK-270': 83, 'MUSK-240': 82, 'MUSK-213': 78, 'MUSK-215': 73, 'MUSK-287': 64, 'NON-MUSK-f209': 63, 'NON-MUSK-jp13': 63, 'NON-MUSK-286': 59, 'NON-MUSK-220': 53, 'NON-MUSK-207': 48, 'MUSK-f158': 43, 'NON-MUSK-244': 40, 'MUSK-f152': 36, 'NON-MUSK-232': 34, 'MUSK-jf67': 32, 'MUSK-212': 31, 'NON-MUSK-289': 29, 'MUSK-256': 29, 'NON-MUSK-249': 29, 'NON-MUSK-f164': 27, 'MUSK-214': 27, 'MUSK-j51': 25, 'NON-MUSK-210': 25, 'MUSK-jf58': 22, 'NON-MUSK-199': 21, 'MUSK-jf59': 20, 'NON-MUSK-208': 20, 'MUSK-211': 19, 'NON-MUSK-j90': 18, 'MUSK-273': 18, 'MUSK-228': 17, 'MUSK-322': 16, 'MUSK-284': 16, 'NON-MUSK-361': 16, 'MUSK-314': 16, 'NON-MUSK-362': 16, 'MUSK-333': 16, '

In [29]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = musk.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_medium_discretized_musk.csv',index=False)

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.165839  0.282648  0.262058  0.021964   
3951   NON-MUSK-327           327_1+2  0.287324  0.142433  0.099155  0.021964   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.213974  0.099155  0.176793   
428        MUSK-217          217_2+86  0.165839  0.692308  0.664234  0.176793   
891        MUSK-j51           j51_2+4  0.065876  0.142433  0.262058  0.021964   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.165839  0.011211  0.011673  0.361589   
655        MUSK-287           287_1+4  0.165839  0.142433  0.314634  0.361589   
1013      MUSK-jf78          jf78_1+3  0.165839  0.142433  0.099155  0.361589   
3378   NON-MUSK-286          286_2+13  0.165839  0.142433  0.099155  0.021964   
4001   NON-MUSK-362           362_1+2  0.287324  0.011211  0.262058  0.176793   

            f5        f6   

## 2.3 DT with large max_depth

In [30]:
#make DT discreizer
# 'max_depth': [4] => 2^4 = 16 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.203593  0.270476  0.336100  0.056000   
3951   NON-MUSK-327           327_1+2  0.299120  0.161179  0.134731  0.056000   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.276074  0.070569  0.190317   
428        MUSK-217          217_2+86  0.203593  0.833333  0.629032  0.190317   
891        MUSK-j51           j51_2+4  0.072464  0.161179  0.336100  0.056000   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.203593  0.009412  0.005587  0.357045   
655        MUSK-287           287_1+4  0.203593  0.090764  0.386018  0.357045   
1013      MUSK-jf78          jf78_1+3  0.203593  0.161179  0.070569  0.357045   
3378   NON-MUSK-286          286_2+13  0.091445  0.161179  0.134731  0.005725   
4001   NON-MUSK-362           362_1+2  0.000000  0.009412  0.007143  0.190317   

            f5        f6   

In [31]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: molecule_name
102
Entries per interval for molecule_name
Counter({'NON-MUSK-j146': 1044, 'NON-MUSK-252': 1010, 'NON-MUSK-j147': 911, 'NON-MUSK-f146': 383, 'NON-MUSK-288': 344, 'NON-MUSK-192': 286, 'NON-MUSK-197': 277, 'MUSK-217': 215, 'NON-MUSK-326': 141, 'NON-MUSK-251': 140, 'NON-MUSK-216': 135, 'NON-MUSK-233': 104, 'NON-MUSK-270': 83, 'MUSK-240': 82, 'MUSK-213': 78, 'MUSK-215': 73, 'MUSK-287': 64, 'NON-MUSK-f209': 63, 'NON-MUSK-jp13': 63, 'NON-MUSK-286': 59, 'NON-MUSK-220': 53, 'NON-MUSK-207': 48, 'MUSK-f158': 43, 'NON-MUSK-244': 40, 'MUSK-f152': 36, 'NON-MUSK-232': 34, 'MUSK-jf67': 32, 'MUSK-212': 31, 'NON-MUSK-289': 29, 'MUSK-256': 29, 'NON-MUSK-249': 29, 'NON-MUSK-f164': 27, 'MUSK-214': 27, 'MUSK-j51': 25, 'NON-MUSK-210': 25, 'MUSK-jf58': 22, 'NON-MUSK-199': 21, 'MUSK-jf59': 20, 'NON-MUSK-208': 20, 'MUSK-211': 19, 'NON-MUSK-j90': 18, 'MUSK-273': 18, 'MUSK-228': 17, 'MUSK-322': 16, 'MUSK-284': 16, 'NON-MUSK-361': 16, 'MUSK-314': 16, 'NON-MUSK-362': 16, 'MUSK-333': 16, '

In [32]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = musk.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_large_discretized_musk.csv',index=False)

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.203593  0.270476  0.336100  0.056000   
3951   NON-MUSK-327           327_1+2  0.299120  0.161179  0.134731  0.056000   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.276074  0.070569  0.190317   
428        MUSK-217          217_2+86  0.203593  0.833333  0.629032  0.190317   
891        MUSK-j51           j51_2+4  0.072464  0.161179  0.336100  0.056000   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.203593  0.009412  0.005587  0.357045   
655        MUSK-287           287_1+4  0.203593  0.090764  0.386018  0.357045   
1013      MUSK-jf78          jf78_1+3  0.203593  0.161179  0.070569  0.357045   
3378   NON-MUSK-286          286_2+13  0.091445  0.161179  0.134731  0.005725   
4001   NON-MUSK-362           362_1+2  0.000000  0.009412  0.007143  0.190317   

            f5        f6   

## 2.4 DT with extra large max_depth

In [33]:
#make DT discreizer
# 'max_depth': [5] => 2^5 = 32 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.180745  0.251185  0.258621  0.108911   
3951   NON-MUSK-327           327_1+2  0.285276  0.150892  0.194231  0.020134   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.276074  0.031378  0.174678   
428        MUSK-217          217_2+86  0.180745  0.875000  0.689320  0.174678   
891        MUSK-j51           j51_2+4  0.065379  0.150892  0.379870  0.108911   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.296954  0.000000  0.003311  0.364807   
655        MUSK-287           287_1+4  0.180745  0.124060  0.332090  0.364807   
1013      MUSK-jf78          jf78_1+3  0.296954  0.150892  0.031378  0.364807   
3378   NON-MUSK-286          286_2+13  0.067308  0.216117  0.194231  0.020000   
4001   NON-MUSK-362           362_1+2  0.000000  0.000000  0.000000  0.174678   

            f5        f6   

In [34]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: molecule_name
102
Entries per interval for molecule_name
Counter({'NON-MUSK-j146': 1044, 'NON-MUSK-252': 1010, 'NON-MUSK-j147': 911, 'NON-MUSK-f146': 383, 'NON-MUSK-288': 344, 'NON-MUSK-192': 286, 'NON-MUSK-197': 277, 'MUSK-217': 215, 'NON-MUSK-326': 141, 'NON-MUSK-251': 140, 'NON-MUSK-216': 135, 'NON-MUSK-233': 104, 'NON-MUSK-270': 83, 'MUSK-240': 82, 'MUSK-213': 78, 'MUSK-215': 73, 'MUSK-287': 64, 'NON-MUSK-f209': 63, 'NON-MUSK-jp13': 63, 'NON-MUSK-286': 59, 'NON-MUSK-220': 53, 'NON-MUSK-207': 48, 'MUSK-f158': 43, 'NON-MUSK-244': 40, 'MUSK-f152': 36, 'NON-MUSK-232': 34, 'MUSK-jf67': 32, 'MUSK-212': 31, 'NON-MUSK-289': 29, 'MUSK-256': 29, 'NON-MUSK-249': 29, 'NON-MUSK-f164': 27, 'MUSK-214': 27, 'MUSK-j51': 25, 'NON-MUSK-210': 25, 'MUSK-jf58': 22, 'NON-MUSK-199': 21, 'MUSK-jf59': 20, 'NON-MUSK-208': 20, 'MUSK-211': 19, 'NON-MUSK-j90': 18, 'MUSK-273': 18, 'MUSK-228': 17, 'MUSK-322': 16, 'MUSK-284': 16, 'NON-MUSK-361': 16, 'MUSK-314': 16, 'NON-MUSK-362': 16, 'MUSK-333': 16, '

In [35]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = musk.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_verylarge_discretized_musk.csv',index=False)

      molecule_name conformation_name        f1        f2        f3        f4  \
3594   NON-MUSK-288          288_3+25  0.180745  0.251185  0.258621  0.108911   
3951   NON-MUSK-327           327_1+2  0.285276  0.150892  0.194231  0.020134   
4086  NON-MUSK-f146        f146_1+162  0.000000  0.276074  0.031378  0.174678   
428        MUSK-217          217_2+86  0.180745  0.875000  0.689320  0.174678   
891        MUSK-j51           j51_2+4  0.065379  0.150892  0.379870  0.108911   
...             ...               ...       ...       ...       ...       ...   
5967  NON-MUSK-j147         j147_2+49  0.296954  0.000000  0.003311  0.364807   
655        MUSK-287           287_1+4  0.180745  0.124060  0.332090  0.364807   
1013      MUSK-jf78          jf78_1+3  0.296954  0.150892  0.031378  0.364807   
3378   NON-MUSK-286          286_2+13  0.067308  0.216117  0.194231  0.020000   
4001   NON-MUSK-362           362_1+2  0.000000  0.000000  0.000000  0.174678   

            f5        f6   