# 离散化和分箱

In [1]:
import numpy as np
import pandas as pd

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [3]:
bins = [18, 25, 35, 60, 100]

In [4]:
# Assign each age to one of the intervals delimited by the explicit edges in
# `bins`; the result is a Categorical whose intervals are closed on the right.
cats = pd.cut(ages, bins)

In [5]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [7]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [8]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [9]:
# Count how many ages fall into each bin, sorted by count (descending).
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [10]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [11]:
# Same binning as above, but each bin is named by the matching entry of
# `group_names` instead of being shown in interval notation.
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

## 等宽的箱(传入箱的个数,按数据的最小值和最大值等分区间)

In [17]:
data = np.random.rand(200)

In [18]:
# Passing an integer instead of explicit edges asks pd.cut for that many
# equal-width bins spanning the data's min..max; precision=2 limits the
# decimal precision used when storing and displaying the bin labels.
cats2 = pd.cut(data, 4, precision=2)

In [19]:
cats2.value_counts()

(-0.0008, 0.25]    45
(0.25, 0.49]       49
(0.49, 0.74]       49
(0.74, 0.99]       57
dtype: int64

## 等频的箱(按样本分位数分箱,每个箱中的样本数相同)

In [20]:
data = np.random.randn(1000) # 1000 samples drawn from the standard normal distribution

In [21]:
# qcut bins by sample quantiles rather than by value range: asking for 4 bins
# cuts at the quartiles, so every bin receives the same number of samples.
cats3 = pd.qcut(data, 4)

In [23]:
cats3

[(0.713, 3.082], (-0.633, -0.0176], (0.713, 3.082], (-2.854, -0.633], (0.713, 3.082], ..., (-0.0176, 0.713], (-0.633, -0.0176], (-2.854, -0.633], (-0.633, -0.0176], (-2.854, -0.633]]
Length: 1000
Categories (4, interval[float64]): [(-2.854, -0.633] < (-0.633, -0.0176] < (-0.0176, 0.713] < (0.713, 3.082]]

In [22]:
cats3.value_counts()

(-2.854, -0.633]     250
(-0.633, -0.0176]    250
(-0.0176, 0.713]     250
(0.713, 3.082]       250
dtype: int64

## 自定义的分位数

In [24]:
# Custom quantile edges (fractions between 0 and 1, inclusive): the bins hold
# the bottom 10%, the next 40%, the next 40%, and the top 10% of the samples.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.28, 3.082], (-1.201, -0.0176], (1.28, 3.082], (-1.201, -0.0176], (-0.0176, 1.28], ..., (-0.0176, 1.28], (-1.201, -0.0176], (-2.854, -1.201], (-1.201, -0.0176], (-1.201, -0.0176]]
Length: 1000
Categories (4, interval[float64]): [(-2.854, -1.201] < (-1.201, -0.0176] < (-0.0176, 1.28] < (1.28, 3.082]]