## 离散值处理

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the video-game sales table, then preview rows 1-6 of a few descriptive columns.
preview_columns = ['Name', 'Platform', 'Year', 'Genre', 'Publisher']
vg_df = pd.read_csv('data/vgsales.csv', encoding='ISO-8859-1')
vg_df.loc[:, preview_columns].iloc[1:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


机器学习模型无法直接处理字符串类型的数据，需要先将其编码为数值。

In [3]:
# Collect the distinct genre names; there are only a handful of them.
genre_column = vg_df['Genre']
genres = np.unique(genre_column.values)
genres

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

## LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the Genre column and convert every genre string
# into its integer code in one step.
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])

# Reverse lookup (integer code -> genre name) built from the fitted classes.
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [5]:
# Store the encoded labels as a new column and show them next to the original genre.
vg_df['GenreLabel'] = genre_labels
label_preview_columns = ['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']
vg_df.iloc[1:7][label_preview_columns]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel
1,Super Mario Bros.,NES,1985.0,Platform,4
2,Mario Kart Wii,Wii,2008.0,Racing,6
3,Wii Sports Resort,Wii,2009.0,Sports,10
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,7
5,Tetris,GB,1989.0,Puzzle,5
6,New Super Mario Bros.,DS,2006.0,Platform,4


## Map
自己构建一个从类别字符串到数值的映射字典，再用 `map` 完成转换。

In [14]:
# Lookup table from genre name to its position in the fitted encoder's class list.
gen_ord_map = dict(zip(gle.classes_, range(len(gle.classes_))))
gen_ord_map

{'Action': 0,
 'Adventure': 1,
 'Fighting': 2,
 'Misc': 3,
 'Platform': 4,
 'Puzzle': 5,
 'Racing': 6,
 'Role-Playing': 7,
 'Shooter': 8,
 'Simulation': 9,
 'Sports': 10,
 'Strategy': 11}

In [15]:
# Translate every genre through the lookup table and store the result as a new column.
genre_codes = vg_df['Genre'].map(gen_ord_map)
vg_df['GenreMap'] = genre_codes

# The preview shows the values produced by our own mapping alongside GenreLabel.
comparison_columns = ['Name', 'Genre', 'GenreLabel', 'GenreMap']
vg_df.iloc[1:7][comparison_columns]

Unnamed: 0,Name,Genre,GenreLabel,GenreMap
1,Super Mario Bros.,Platform,4,4
2,Mario Kart Wii,Racing,6,6
3,Wii Sports Resort,Sports,10,10
4,Pokemon Red/Pokemon Blue,Role-Playing,7,7
5,Tetris,Puzzle,5,5
6,New Super Mario Bros.,Platform,4,4


## One-Hot Encoder
对于离散型特征，基于树的模型（例如随机森林）通常不需要one-hot编码；而基于距离或对数值大小敏感的模型（例如KNN、SVM、线性模型、神经网络等）通常需要one-hot编码，否则整数编码会引入并不存在的大小和顺序关系。

In [26]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the genre: every distinct category becomes its own 0/1 column,
# holding 1 on the rows that carry that category and 0 everywhere else.
#
# The encoder is fitted on the raw 'Genre' strings rather than on the integer
# 'GenreLabel' column. OneHotEncoder handles string categories itself, and
# scikit-learn warns that a LabelEncoder pre-step is unnecessary. The column
# order is unchanged because both encoders sort the categories.
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(vg_df[['Genre']]).toarray()  # sparse matrix -> dense ndarray
gen_feature_arr

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# Distinct genre names, used as the column labels of the one-hot matrix.
genres = np.unique(vg_df['Genre'])

# Wrap the one-hot array in a DataFrame. Reusing vg_df's index keeps every
# encoded row attached to the row it was computed from, so a column-wise
# combination with vg_df stays aligned even when vg_df does not carry a
# default 0..n-1 index (e.g. after filtering or sorting).
gen_features = pd.DataFrame(gen_feature_arr, columns=genres, index=vg_df.index)
gen_features.head()

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [28]:
# Keep only two of the original columns so the combined result is easy to read;
# in a real scenario the full data set would be merged with the encoded columns.
vg_df_2 = vg_df.loc[:, ['Name', 'Genre']]
vg_df_2.head()

Unnamed: 0,Name,Genre
0,Wii Sports,Sports
1,Super Mario Bros.,Platform
2,Mario Kart Wii,Racing
3,Wii Sports Resort,Sports
4,Pokemon Red/Pokemon Blue,Role-Playing


In [29]:
# Put the one-hot columns side by side with the original Name/Genre columns.
vg_df_ohe = pd.concat([vg_df_2, gen_features], axis='columns')

# In the preview, the second row has 1.0 in the Platform column, matching the
# 'Platform' string in that row's Genre column.
vg_df_ohe.head()

Unnamed: 0,Name,Genre,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,Wii Sports,Sports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Super Mario Bros.,Platform,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mario Kart Wii,Racing,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Wii Sports Resort,Sports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Pokemon Red/Pokemon Blue,Role-Playing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
