In [31]:
from sklearn import preprocessing
import numpy as np

# Encoder with default settings; the categories of each feature are
# determined from the data passed to fit().
enc = preprocessing.OneHotEncoder()

# Two samples with three categorical features each: gender, location, browser.
X = np.array([
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
])
X

array([['male', 'from US', 'uses Safari'],
       ['female', 'from Europe', 'uses Firefox']], dtype='<U12')

In [32]:
# fit(X) -- X : array-like, shape [n_samples, n_features]
#     The data to determine the categories of each feature.
enc.fit(X)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [33]:
# One-hot encode two new samples whose values were all seen during fit;
# .toarray() turns the sparse result into a dense array for display.
enc.transform([
    ['female', 'from US', 'uses Safari'],
    ['male', 'from Europe', 'uses Safari'],
]).toarray()

array([[1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1.]])

In [34]:
enc.categories_ # by default, inferred automatically from the dataset
# How to read the encoding (one group of columns per feature):
# 'female'       is encoded as 1, 0
# 'male'         is encoded as 0, 1
# 'from Europe'  is encoded as 1, 0
# 'from US'      is encoded as 0, 1
# 'uses Firefox' is encoded as 1, 0
# 'uses Safari'  is encoded as 0, 1

[array(['female', 'male'], dtype='<U12'),
 array(['from Europe', 'from US'], dtype='<U12'),
 array(['uses Firefox', 'uses Safari'], dtype='<U12')]

In [35]:
# Full category list for each feature, in the order the columns should appear.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']

# Specify the categories explicitly through the `categories` parameter
# instead of letting the encoder infer them from the data.
enc1 = preprocessing.OneHotEncoder(
    categories=[genders, locations, browsers],
)

In [36]:
# Training samples for the encoder with explicit categories; they cover only
# some of the values listed in genders / locations / browsers.
X1 = np.array([
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
])
enc1.fit(X1)

OneHotEncoder(categories=[['female', 'male'],
                          ['from Africa', 'from Asia', 'from Europe',
                           'from US'],
                          ['uses Chrome', 'uses Firefox', 'uses IE',
                           'uses Safari']],
              drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
              sparse=True)

In [37]:
# Encode two samples with the explicit-category encoder: each row now has
# 2 + 4 + 4 = 10 columns, one per listed category.
enc1.transform([
    ['female', 'from US', 'uses Safari'],
    ['male', 'from Europe', 'uses Safari'],
]).toarray()

array([[1., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 1.]])

In [38]:
enc1.categories_
# How to read the encoding (one group of columns per feature):
# 'female'       is encoded as 1, 0
# 'male'         is encoded as 0, 1
# 'from Africa'  is encoded as 1, 0, 0, 0
# 'from Asia'    is encoded as 0, 1, 0, 0
# 'from Europe'  is encoded as 0, 0, 1, 0
# 'from US'      is encoded as 0, 0, 0, 1
# 'uses Chrome'  is encoded as 1, 0, 0, 0
# 'uses Firefox' is encoded as 0, 1, 0, 0
# 'uses IE'      is encoded as 0, 0, 1, 0
# 'uses Safari'  is encoded as 0, 0, 0, 1

[array(['female', 'male'], dtype='<U12'),
 array(['from Africa', 'from Asia', 'from Europe', 'from US'], dtype='<U12'),
 array(['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari'],
       dtype='<U12')]

In [39]:
# If the training data may be missing some categorical values, it is usually
# best to specify handle_unknown='ignore'; otherwise an error is raised.
enc2 = preprocessing.OneHotEncoder(handle_unknown='ignore')

# Built with np.array, like X and X1, so every encoder in this notebook is
# fit on the same kind of input.
X2 = np.array([['male', 'from US', 'uses Safari'],
               ['female', 'from Europe', 'uses Firefox']])
# Fit on X2 defined in this cell, so the cell does not depend on X from an
# earlier cell.
enc2.fit(X2)

# 'from Asia' and 'uses Chrome' were not in the training data, so their
# column groups are encoded as 0, 0 and 0, 0.
enc_output = enc2.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
enc_output

array([[1., 0., 0., 0., 0., 0.]])

In [40]:
# Categories learned during fit; the unknown values passed to transform
# ('from Asia', 'uses Chrome') do not appear here.
enc2.categories_

[array(['female', 'male'], dtype='<U12'),
 array(['from Europe', 'from US'], dtype='<U12'),
 array(['uses Firefox', 'uses Safari'], dtype='<U12')]

In [41]:
# Convert the encoded data back to its original representation; the features
# whose columns are all zero (the ignored unknown values) come back as None.
enc2.inverse_transform(enc_output)

array([['female', None, None]], dtype=object)