### 类别型特征的处理

- 基于 pandas 的编码方法（字符串替换、标签编码、one-hot 编码、自定义二分类）
- 基于 sklearn 的编码方法

#### pandas 的处理方法

In [55]:
import pandas as pd
import numpy as np
import pandas_profiling

In [2]:
# Column names for the imports-85 data set; the raw file has no header row.
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read from the official UCI archive over HTTPS (rather than a plain-HTTP
# third-party mirror) and treat the "?" placeholder as missing (NaN).
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(DATA_URL, header=None, names=headers, na_values="?")

# Preview: first five rows, first ten columns.
df.iloc[:5, :10]

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4


In [56]:
# Render the pandas-profiling overview report for the whole DataFrame.
df.profile_report()



In [3]:
# Inspect the dtype of every column.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [4]:
# Only the categorical (object-dtype) features matter here; work on an
# independent copy so the original frame is left untouched.
df2 = df.select_dtypes(include=['object']).copy()
df2.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [59]:
# Alternative way to pick the object-dtype columns: build a boolean mask
# over df.dtypes and index the column labels with it.
category_feature_mask = (df.dtypes == object)
category_cols = list(df.columns[category_feature_mask])
df3 = df.loc[:, category_cols].copy()
df3.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [60]:
# Simplest missing-value handling: drop every row that contains a NaN.
# Rebind the name instead of using inplace=True, so the frame displayed by
# earlier cells is not silently mutated.
df2 = df2.dropna()

##### 方法1: 替换字符串

最简单的处理方法，就是直接用整数替换类别型特征，即给每个字符串一个编号，用编号替换字符串

In [6]:
# Count how often each distinct value occurs in the chosen column;
# the index of the result holds the distinct strings.
col = 'body_style'
strs = df2[col].value_counts()
strs

sedan          94
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body_style, dtype: int64

In [7]:
# Map every distinct string to an integer, numbered in the order the
# strings appear in the value_counts index.
value_map = {label: code for code, label in enumerate(strs.index)}
value_map

{'sedan': 0, 'hatchback': 1, 'wagon': 2, 'hardtop': 3, 'convertible': 4}

In [8]:
# Swap each string for its integer code with replace() (result is not stored).
df2[col].replace(value_map).head()

0    4
1    4
2    1
3    0
4    0
Name: body_style, dtype: int64

##### 方法2:标签编码

标签编码也是将每个字符串转化为数字，只是编号按类别名称的字典序分配（而不是方法1中按出现频次排序）。如刚刚选择的 body_style 列，可以转化如下所示：

- convertible -> 0
- hardtop -> 1
- hatchback -> 2
- sedan -> 3
- wagon -> 4

In [9]:
# First convert the column to the pandas `category` dtype.
bs = df2['body_style'].astype('category')
bs.head()

0    convertible
1    convertible
2      hatchback
3          sedan
4          sedan
Name: body_style, dtype: category
Categories (5, object): [convertible, hardtop, hatchback, sedan, wagon]

In [10]:
# Then use the integer category codes directly as the encoded data.
bs.cat.codes.head()

0    0
1    0
2    2
3    3
4    3
dtype: int8

##### 方法3: one-hot 编码

当类别型特征有多个取值、且这些取值之间没有顺序关系时，标签编码并不合适，因为算法可能误以为编号之间存在大小关系。one-hot 编码为每个取值生成一个 0/1 列，从而避免这个问题。

In [61]:
# One-hot encode both columns: one indicator column per distinct value.
# dtype=int keeps the indicators as 0/1 integers; recent pandas versions
# otherwise return boolean True/False columns.
pd.get_dummies(df[['drive_wheels', 'body_style']], dtype=int).head()

Unnamed: 0,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon
0,0,0,1,1,0,0,0,0
1,0,0,1,1,0,0,0,0
2,0,0,1,0,0,1,0,0
3,0,1,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0


##### 方法4:自定义二分类

In [12]:
# Frequency of each engine_type value in df2.
df2['engine_type'].value_counts()

ohc      146
ohcf      15
ohcv      13
l         12
dohc      12
rotor      4
dohcv      1
Name: engine_type, dtype: int64

In [13]:
# If all that matters is whether the engine type mentions "ohc", collapse the
# feature into a binary flag: 1 when the string contains "ohc", 0 otherwise.
has_ohc = df2['engine_type'].str.contains('ohc')
has_ohc.astype('int64').value_counts()

1    187
0     16
Name: engine_type, dtype: int64

#### sklearn 的处理方法

sklearn 中有三种处理方法，对应三个编码器

- DictVectorizer
- LabelEncoder
- OneHotEncoder

##### DictVectorizer

In [16]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
# Distinct values of the drive_wheels feature.
df['drive_wheels'].unique()

array(['rwd', 'fwd', '4wd'], dtype=object)

In [26]:
# Distinct values of the body_style feature.
df['body_style'].unique()

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

In [20]:
# DictVectorizer works on dicts, so the DataFrame slice has to be converted first.
x = df.loc[:, ['drive_wheels', 'body_style']].copy()
# orient='records' produces one {column: value} dict per row.
x_dict = x.to_dict(orient='records')
x_dict[0:5]

[{'drive_wheels': 'rwd', 'body_style': 'convertible'},
 {'drive_wheels': 'rwd', 'body_style': 'convertible'},
 {'drive_wheels': 'rwd', 'body_style': 'hatchback'},
 {'drive_wheels': 'fwd', 'body_style': 'sedan'},
 {'drive_wheels': '4wd', 'body_style': 'sedan'}]

In [21]:
# With sparse=False the vectorizer returns a dense array rather than a sparse matrix.
dv_encoder = DictVectorizer(sparse=False)

In [22]:
# Fit the vectorizer on the row dicts and encode them in one step.
x_encoded = dv_encoder.fit_transform(x_dict)
x_encoded

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
# The two features have 8 distinct values in total (3 + 5), so one-hot
# encoding yields an 8-dimensional feature vector per row.
x_encoded.shape

(205, 8)

##### LabelEncoder

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# A single LabelEncoder instance, reused (re-fitted) for each column below.
le = LabelEncoder()

In [31]:
# Same two features as before: 'drive_wheels' and 'body_style'.
col = ['drive_wheels', 'body_style']
# Re-fit the encoder on each column in turn and overwrite the strings in x
# with the resulting integer codes.
x[col] = x[col].apply(le.fit_transform)
x[col].head()

Unnamed: 0,drive_wheels,body_style
0,2,0
1,2,0
2,2,2
3,1,3
4,0,3


##### OneHotEncoder

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
# Boolean mask flagging which of the two columns are object dtype in the
# original frame. NOTE(review): the cells shown here never use this mask.
col_mask = df[['drive_wheels', 'body_style']].dtypes == object

In [40]:
# Display the mask.
col_mask

drive_wheels    True
body_style      True
dtype: bool

In [43]:
# Create the one-hot encoder (dense output) and fit it on x.
# NOTE(review): newer scikit-learn releases (>= 1.2) rename `sparse` to
# `sparse_output` — confirm against the installed version before re-running.
one_encoder = OneHotEncoder(sparse=False)
one_encoder.fit(x)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [53]:
# The encoder is fed the LabelEncoder result here (x holds integer codes).
# fit_transform re-fits the encoder and returns the encoded array in one call.
x_one = one_encoder.fit_transform(x)
x_one[:5]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.]])

In [45]:
# (rows, one-hot columns) of the encoded array.
x_one.shape

(205, 8)

In [49]:
# The width of each feature's one-hot block can also be fixed explicitly,
# e.g. 5 columns per feature. List the integer categories 0..4 for every
# column of x via `categories`; this replaces the deprecated `n_values`
# argument, which later scikit-learn releases no longer accept.
one_enc2 = OneHotEncoder(categories=[list(range(5))] * x.shape[1], sparse=False)

In [50]:
# fit_transform both fits the encoder and encodes x, so a separate
# fit(x) call beforehand is redundant.
x_one2 = one_enc2.fit_transform(x)
x_one2.shape

(205, 10)

In [54]:
# First five encoded rows: two blocks of 5 one-hot columns each.
x_one2[:5]

array([[0., 0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])