In [14]:
# Equal-frequency discretization divides the values of a variable into intervals that each carry
# approximately the same proportion of observations. The interval boundaries are determined by
# quantiles, and therefore different intervals may have different widths.
# This discretization technique is particularly useful for
# skewed variables, as it spreads the observations evenly across the bins.
# Turn off Jedi-backed tab completion in IPython's completer.
# NOTE(review): presumably a workaround for slow or broken autocompletion —
# confirm it is still needed with the IPython version in use.
%config Completer.use_jedi = False
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from feature_engine.discretisation import EqualFrequencyDiscretiser

In [15]:
# Load the Boston housing dataset and assemble one frame: a column per
# predictor (named after `feature_names`) with the target appended as 'MEDV'.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs against an older scikit-learn release.
boston_dataset = load_boston()
data = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(MEDV=boston_dataset.target)

In [16]:
# Hold out 30% of the rows for testing. The predictors are every column except
# 'MEDV'; 'MEDV' itself is the target. `random_state=0` fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'),
    data['MEDV'],
    test_size=0.3,
    random_state=0,
)

In [17]:
# Cut LSTAT into 10 quantile-based (equal-frequency) bins using the training
# set only. With the default `labels=None` each value is labelled with its
# interval; `retbins=True` additionally returns the array of bin edges.
lstat_binned, intervals = pd.qcut(X_train['LSTAT'], q=10, retbins=True)

# Attach the binned column with `assign`, which builds a new frame instead of
# setting a column on the object returned by train_test_split. That avoids a
# possible SettingWithCopyWarning on pandas versions without copy-on-write,
# while leaving X_train with the same 'lstat_disc' column as before.
X_train = X_train.assign(lstat_disc=lstat_binned)

In [18]:
# Show the bin edges returned by pd.qcut (retbins=True): 11 edges delimit the
# 10 quantile bins.
print(intervals)

[ 1.73   4.623  6.202  7.528  9.5   11.16  13.26  15.565 18.06  22.453
 36.98 ]


In [19]:
# Compare the raw LSTAT values with the interval each one was assigned to
# (first 10 training rows).
print(X_train.loc[:, ['LSTAT', 'lstat_disc']].head(10))

     LSTAT       lstat_disc
141  34.41  (22.453, 36.98]
272   7.73     (7.528, 9.5]
135  16.96  (15.565, 18.06]
298   4.97   (4.623, 6.202]
122  17.93  (15.565, 18.06]
22   18.72  (18.06, 22.453]
68   13.09   (11.16, 13.26]
20   21.02  (18.06, 22.453]
437  26.45  (22.453, 36.98]
14   10.26     (9.5, 11.16]


In [20]:
# Fraction of training rows falling into each bin; with 10 equal-frequency
# bins every value should be close to 0.10.
X_train['lstat_disc'].value_counts().div(len(X_train))

(7.528, 9.5]       0.104520
(1.729, 4.623]     0.101695
(13.26, 15.565]    0.101695
(15.565, 18.06]    0.101695
(22.453, 36.98]    0.101695
(4.623, 6.202]     0.098870
(6.202, 7.528]     0.098870
(11.16, 13.26]     0.098870
(9.5, 11.16]       0.096045
(18.06, 22.453]    0.096045
Name: lstat_disc, dtype: float64

In [32]:
# Same equal-frequency binning, this time with scikit-learn:
# 10 bins per feature, edges placed at quantiles, and each value encoded as
# the ordinal index of the bin it falls into.
disc = KBinsDiscretizer(
    n_bins=10,
    strategy='quantile',
    encode='ordinal',
)


In [33]:
# Learn the quantile bin edges for the three selected variables from the
# training set only.
disc.fit(X_train.loc[:, ['LSTAT', 'DIS', 'RM']])

KBinsDiscretizer(encode='ordinal', n_bins=10)

In [34]:
# Apply the edges learned on the training set to both splits (train first,
# then test), keeping only the three columns the discretizer was fitted on.
train_t, test_t = (
    disc.transform(split[['LSTAT', 'DIS', 'RM']])
    for split in (X_train, X_test)
)

In [36]:
# Show the learned bin edges: one array of edges per column passed to fit
# (LSTAT, DIS, RM).
print(disc.bin_edges_)

[array([ 1.73 ,  4.623,  6.202,  7.528,  9.5  , 11.16 , 13.26 , 15.565,
       18.06 , 22.453, 36.98 ])
 array([ 1.1742 ,  1.66132,  1.9793 ,  2.26121,  2.64774,  3.2157 ,
        3.7965 ,  4.45352,  5.40702,  6.8147 , 12.1265 ])
 array([3.561 , 5.5783, 5.8532, 5.9628, 6.1048, 6.2155, 6.3946, 6.5632,
       6.794 , 7.185 , 8.78  ])]


In [37]:
# Wrap the transformed NumPy array in a DataFrame so the columns are named again.
# NOTE(review): no `index` is passed, so the frame gets a fresh 0..n-1 index
# rather than X_train's original row labels — confirm that is intended before
# aligning it with X_train / y_train.
train_t = pd.DataFrame(
    data=train_t,
    columns=['LSTAT', 'DIS', 'RM'],
)

In [38]:
# Preview the first five rows of ordinal bin indices.
train_t.head(n=5)

Unnamed: 0,LSTAT,DIS,RM
0,9.0,0.0,0.0
1,3.0,6.0,6.0
2,7.0,2.0,5.0
3,1.0,9.0,5.0
4,7.0,2.0,2.0
