In [22]:
# In equal-width discretization, the variable values are sorted into intervals of the same
# width. The number of intervals is decided arbitrarily and the width is determined by the
# range of values of the variable and the number of bins to create
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from feature_engine.discretisation import EqualWidthDiscretiser

In [23]:
boston_dataset = load_boston()
data = pd.DataFrame(boston_dataset.data,
columns=boston_dataset.feature_names)
data['MEDV'] = boston_dataset.target

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
data.drop('MEDV', axis=1), data['MEDV'], test_size=0.3,
random_state=0)

In [50]:
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,lstat_disc
141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41,"(34.0, 37.0]"
272,0.1146,20.0,6.96,0.0,0.464,6.538,58.7,3.9175,3.0,223.0,18.6,394.96,7.73,"(7.0, 10.0]"
135,0.55778,0.0,21.89,0.0,0.624,6.335,98.2,2.1107,4.0,437.0,21.2,394.67,16.96,"(16.0, 19.0]"
298,0.06466,70.0,2.24,0.0,0.4,6.345,20.1,7.8278,5.0,358.0,14.8,368.24,4.97,"(4.0, 7.0]"
122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93,"(16.0, 19.0]"


In [25]:
lstat_range = X_train['LSTAT'].max() - X_train['LSTAT'].min()

In [26]:
print(lstat_range)

35.25


In [27]:
# calculate width for 10 equal bins
inter_width = int(lstat_range / 10)
print(inter_width)

3


In [28]:
min_value = int(np.floor( X_train['LSTAT'].min()))
max_value = int(np.ceil( X_train['LSTAT'].max()))
print(min_value, max_value, inter_width)

1 37 3


In [29]:
intervals = [i for i in range(min_value, max_value + inter_width,
inter_width)]

In [30]:
print(intervals)

[1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37]


In [31]:
X_train['lstat_disc'] = pd.cut(x=X_train['LSTAT'], bins=intervals,
include_lowest=True)

In [32]:
print(X_train[['LSTAT', 'lstat_disc']].head(10))

     LSTAT    lstat_disc
141  34.41  (34.0, 37.0]
272   7.73   (7.0, 10.0]
135  16.96  (16.0, 19.0]
298   4.97    (4.0, 7.0]
122  17.93  (16.0, 19.0]
22   18.72  (16.0, 19.0]
68   13.09  (13.0, 16.0]
20   21.02  (19.0, 22.0]
437  26.45  (25.0, 28.0]
14   10.26  (10.0, 13.0]


In [33]:
X_train.groupby('lstat_disc')['LSTAT'].count()

lstat_disc
(0.999, 4.0]    28
(4.0, 7.0]      67
(7.0, 10.0]     63
(10.0, 13.0]    49
(13.0, 16.0]    44
(16.0, 19.0]    45
(19.0, 22.0]    21
(22.0, 25.0]    17
(25.0, 28.0]     7
(28.0, 31.0]     9
(31.0, 34.0]     0
(34.0, 37.0]     4
Name: LSTAT, dtype: int64

In [34]:
X_test['lstat_disc'] = pd.cut(x=X_test['LSTAT'], bins=intervals,
include_lowest=True)

In [44]:
# using feature engine
disc = EqualWidthDiscretiser(bins=10, variables = ['LSTAT', 'DIS',
'RM'])

In [45]:
disc.fit(X_train)

EqualWidthDiscretiser(variables=['LSTAT', 'DIS', 'RM'])

In [46]:
train_t = disc.transform(X_train)
test_t = disc.transform(X_test)

In [47]:
# using sklearn
disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')

In [48]:
disc.fit(X_train[['LSTAT', 'DIS', 'RM']])

KBinsDiscretizer(encode='ordinal', n_bins=10, strategy='uniform')

In [49]:
train_t = disc.transform(X_train[['LSTAT', 'DIS', 'RM']])
test_t = disc.transform(X_test[['LSTAT', 'DIS', 'RM']])