# One-hot encoding: converting categorical features to numerical features

![one-hot](one-hot.png)

In [1]:
from sklearn.feature_extraction import DictVectorizer

# Training samples written as (interest, occupation) pairs and expanded into
# the list-of-dicts form that DictVectorizer consumes.
X_dict = [
    {'interest': interest, 'occupation': occupation}
    for interest, occupation in [
        ('tech', 'professional'),
        ('fashion', 'student'),
        ('fashion', 'professional'),
        ('sports', 'student'),
        ('tech', 'student'),
        ('tech', 'retired'),
        ('sports', 'professional'),
    ]
]

In [2]:
# sparse=False asks the vectorizer for a dense NumPy array instead of a SciPy sparse matrix.
dict_one_hot_encoder = DictVectorizer(sparse=False)
# Learn the "feature=value" columns from X_dict and encode X_dict in one step.
X_encoded = dict_one_hot_encoder.fit_transform(X_dict)
print(X_encoded) # one row per sample; first row: interest=tech -> 1, occupation=professional -> 1, all other columns 0

[[ 0.  0.  1.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  1.]
 [ 1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  1.]
 [ 0.  0.  1.  0.  1.  0.]
 [ 0.  1.  0.  1.  0.  0.]]


每个列索引对应的特征（`特征名=取值`）可以通过 `vocabulary_` 查看：

In [4]:
# Mapping from each "feature=value" name to its column index in the encoded matrix.
dict_one_hot_encoder.vocabulary_

{'interest=fashion': 0,
 'interest=sports': 1,
 'interest=tech': 2,
 'occupation=professional': 3,
 'occupation=retired': 4,
 'occupation=student': 5}

编码器已经通过 `fit_transform` 拟合过，因此对于新数据只需调用 `.transform`：

In [5]:
# A single new sample whose values were both present in the training data.
new_data = [{'interest': 'sports', 'occupation': 'retired'}]
# The encoder is already fitted, so only transform() is called here (no refit).
new_encoded = dict_one_hot_encoder.transform(new_data)
print(new_encoded)

[[ 0.  1.  0.  0.  1.  0.]]


逆变换（将编码后的向量还原为特征字典）：

In [6]:
# inverse_transform maps each encoded row back to a dict of its non-zero "feature=value" entries.
print(dict_one_hot_encoder.inverse_transform(new_encoded))

[{'interest=sports': 1.0, 'occupation=retired': 1.0}]


### 如果特征不是字典格式，而是字符串（string）对象，我们可以先用 LabelEncoder 将其转换为整数标签，再用 OneHotEncoder 将整数特征转换为二值（one-hot）编码：

In [20]:
import numpy as np
# The same samples as plain string rows: no {feature: value} dicts, only the
# bare values, one row per sample.
X_str = np.array([
    ('tech', 'professional'),
    ('fashion', 'student'),
    ('fashion', 'professional'),
    ('sports', 'student'),
    ('tech', 'student'),
    ('tech', 'retired'),
    ('sports', 'professional'),
])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Fit a single LabelEncoder over every string in the table (flattened to 1-D),
# then restore the original 2-D layout. reshape() accepts the shape tuple
# directly, which is equivalent to unpacking it with *X_str.shape.
label_encoder = LabelEncoder()
X_int = label_encoder.fit_transform(X_str.ravel()).reshape(X_str.shape)
print(X_int)

[[5 1]
 [0 4]
 [0 1]
 [3 4]
 [5 4]
 [5 2]
 [3 1]]


In [25]:
# One-hot encode the integer labels. OneHotEncoder returns a SciPy sparse
# matrix by default, so fit once, show the sparse result, and then densify
# that same result with toarray() instead of fitting a second time.
one_hot_encoder = OneHotEncoder()
X_encoded_sparse = one_hot_encoder.fit_transform(X_int)
print(X_encoded_sparse)  # sparse (row, column) -> value listing
X_encoded = X_encoded_sparse.toarray()
print(X_encoded)

  (0, 3)	1.0
  (0, 2)	1.0
  (1, 5)	1.0
  (1, 0)	1.0
  (2, 3)	1.0
  (2, 0)	1.0
  (3, 5)	1.0
  (3, 1)	1.0
  (4, 5)	1.0
  (4, 2)	1.0
  (5, 4)	1.0
  (5, 2)	1.0
  (6, 3)	1.0
  (6, 1)	1.0
[[ 0.  0.  1.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.  1.]
 [ 1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  1.]
 [ 0.  0.  1.  0.  1.  0.]
 [ 0.  1.  0.  1.  0.  0.]]


#### 最后需要注意的是：如果新数据中出现了训练数据中没有的类别取值，它应当被忽略。DictVectorizer 默认就会这样处理（未见过的取值不会在任何一列上置 1）：

In [22]:
# Each sample contains one category value that never appeared in the training data.
new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'},
            {'interest': 'tech', 'occupation': 'unseen_occupation'}]
# Unseen values have no column in the fitted vocabulary, so they add no 1s to the row.
new_encoded = dict_one_hot_encoder.transform(new_dict)
print(new_encoded)

[[ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.  0.]]


但是，不同于 DictVectorizer，LabelEncoder 不会自动处理未见过的类别取值（对其调用 `transform` 会报错）。最简单的方式是先将字符串数据转换为字典对象，再使用 DictVectorizer。为此可以定义一个转换函数：

In [27]:
# String rows containing category values that were absent from the training data.
new_str = np.array([
    ('unknown_interest', 'retired'),
    ('tech', 'unseen_occupation'),
    ('unknown_interest', 'unseen_occupation'),
])

In [30]:
def string_to_dict(columns, data_str):
    """Convert rows of bare feature values into {column: value} dicts.

    Args:
        columns: Sequence of feature names, in the same order as the values
            within each row of ``data_str``.
        data_str: Iterable of rows (for example a 2-D array or a list of
            lists); each row holds one value per entry in ``columns``.

    Returns:
        A list with one dict per row, mapping each column name to that row's
        value. An empty ``data_str`` yields an empty list.
    """
    # Use the caller-supplied column names; the previous version overwrote
    # them with a hard-coded list, so the argument was silently ignored.
    return [dict(zip(columns, sample_str)) for sample_str in data_str]

In [31]:
# Name the string columns, turn each row into a {column: value} dict, and
# reuse the already-fitted DictVectorizer so unseen values are ignored.
columns = ['interest', 'occupation']
new_dicts = string_to_dict(columns, new_str)
new_encoded = dict_one_hot_encoder.transform(new_dicts)
print(new_encoded)

[[ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]]
