In [1]:
# 导入包
import numpy as np
import pandas as pd

In [2]:
# 创建实验数据
data = pd.DataFrame({"RawData": np.random.randn(8)})
data

Unnamed: 0,RawData
0,0.10202
1,-0.113755
2,1.080326
3,-1.945521
4,-0.194798
5,-0.613031
6,-1.958032
7,-1.484649


## 一、离散化

In [3]:
# 离散化处理成大小序列
data["Discret"] = data["RawData"].argsort() + 1
data

Unnamed: 0,RawData,Discret
0,0.10202,7
1,-0.113755,4
2,1.080326,8
3,-1.945521,6
4,-0.194798,5
5,-0.613031,2
6,-1.958032,1
7,-1.484649,3


## 二、面元划分

### 2.1 cut函数

In [4]:
# 查看data
data

Unnamed: 0,RawData,Discret
0,0.10202,7
1,-0.113755,4
2,1.080326,8
3,-1.945521,6
4,-0.194798,5
5,-0.613031,2
6,-1.958032,1
7,-1.484649,3


In [6]:
# 设定面元划分标准
bins = [-2, -1.5, 0.5, 1.5]
# 进行面元划分
cats = pd.cut(data["RawData"], bins)
# pd.cut(data["RawData"], [-2, -1.5, 0.5, 1.5])
cats

0     (-1.5, 0.5]
1     (-1.5, 0.5]
2      (0.5, 1.5]
3    (-2.0, -1.5]
4     (-1.5, 0.5]
5     (-1.5, 0.5]
6    (-2.0, -1.5]
7     (-1.5, 0.5]
Name: RawData, dtype: category
Categories (3, interval[float64, right]): [(-2.0, -1.5] < (-1.5, 0.5] < (0.5, 1.5]]

In [7]:
# 统计每个面元区间里分别有多少个元素
pd.value_counts(cats)

(-1.5, 0.5]     5
(-2.0, -1.5]    2
(0.5, 1.5]      1
Name: RawData, dtype: int64

In [8]:
# 进行面元划分，设置左闭右开
pd.cut(data["RawData"], bins, right=False)

0     [-1.5, 0.5)
1     [-1.5, 0.5)
2      [0.5, 1.5)
3    [-2.0, -1.5)
4     [-1.5, 0.5)
5     [-1.5, 0.5)
6    [-2.0, -1.5)
7     [-1.5, 0.5)
Name: RawData, dtype: category
Categories (3, interval[float64, left]): [[-2.0, -1.5) < [-1.5, 0.5) < [0.5, 1.5)]

In [9]:
# 进行面元划分,设置面元标签
# 设定面元标签
labels = ["低区间", "中区间", "高区间"]
# 设定面元划分标准
bins = [-2, -1.5, 0.5, 1.5]
# 进行面元划分
pd.cut(data["RawData"], bins, labels=labels)
# pd.cut(data["RawData"], bins, labels=["低区间", "中区间", "高区间"])

0    中区间
1    中区间
2    高区间
3    低区间
4    中区间
5    中区间
6    低区间
7    中区间
Name: RawData, dtype: category
Categories (3, object): ['低区间' < '中区间' < '高区间']

In [10]:
# 进行面元划分,分成4个面元
# 数据区间均分为4个面元
# 2位有效数字
pd.cut(data["RawData"], 4, precision=2)

0    (-0.44, 0.32]
1    (-0.44, 0.32]
2     (0.32, 1.08]
3    (-1.96, -1.2]
4    (-0.44, 0.32]
5    (-1.2, -0.44]
6    (-1.96, -1.2]
7    (-1.96, -1.2]
Name: RawData, dtype: category
Categories (4, interval[float64, right]): [(-1.96, -1.2] < (-1.2, -0.44] < (-0.44, 0.32] < (0.32, 1.08]]

In [12]:
# 进行面元划分,分成4个面元
# 数据均匀分布在4个面元
# 2位有效数字
pd.qcut(data["RawData"], 4, precision=2)

0    (-0.06, 1.08]
1    (-0.4, -0.06]
2    (-0.06, 1.08]
3    (-1.97, -1.6]
4    (-0.4, -0.06]
5     (-1.6, -0.4]
6    (-1.97, -1.6]
7     (-1.6, -0.4]
Name: RawData, dtype: category
Categories (4, interval[float64, right]): [(-1.97, -1.6] < (-1.6, -0.4] < (-0.4, -0.06] < (-0.06, 1.08]]

In [13]:
# cut均衡划分数据区间
pd.value_counts(pd.cut(data["RawData"], 4, precision=2))

(-1.96, -1.2]    3
(-0.44, 0.32]    3
(-1.2, -0.44]    1
(0.32, 1.08]     1
Name: RawData, dtype: int64

### 2.2 qcut函数

In [14]:
# qcut均匀划分数据分布
pd.value_counts(pd.qcut(data["RawData"], 4, precision=2))

(-1.97, -1.6]    2
(-1.6, -0.4]     2
(-0.4, -0.06]    2
(-0.06, 1.08]    2
Name: RawData, dtype: int64

In [17]:
# qcut指定面元分界
pd.value_counts(pd.qcut(data["RawData"], [0, 0.5, 0.75, 1], precision=2))

(-1.97, -0.4]    4
(-0.4, -0.06]    2
(-0.06, 1.08]    2
Name: RawData, dtype: int64