In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [2]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [3]:
pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


数据归一化

In [4]:
normal = MinMaxScaler()  # 实例化
result = normal.fit_transform(data)  # 数据归一化   x-min()/max()-min()

In [5]:
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [6]:
#逆转归一化
normal.inverse_transform(result)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [7]:
# 使用numpy来实现归一化
import numpy as np
X = np.array([[-1, 2], [-0.5, 6], [0, 10], [1, 18]])
#归一化
X_nor = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_nor

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [8]:
#逆转归一化
X_returned = X_nor * (X.max(axis=0) - X.min(axis=0)) + X.min(axis=0)
X_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

数据标准化 
当数据(x)按均值(μ)中心化后，再按标准差(σ)缩放，数据就会服从为均值为0，方差为1的正态分布（即标准正态分布）

In [9]:
# 数据标准化
from sklearn.preprocessing import StandardScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = StandardScaler() #实例化
scaler.fit(data) #fit，本质是生成均值和方差
print(scaler.mean_) #查看均值的属性mean_
print(scaler.var_) #查看方差的属性var_
x_std = scaler.transform(data) #通过接口导出结果

[-0.125  9.   ]
[ 0.546875 35.      ]


In [10]:
x_std.mean() #导出的结果是一个数组，用mean()查看均值

0.0

In [11]:
x_std.std() #用std()查看方差

1.0

In [12]:
scaler.fit_transform(data)          # 使用fit_transform(data)一步达成结果
scaler.inverse_transform(x_std)     # 使用inverse_transform逆转标准化

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

处理分类型特征：编码与哑变量

In [14]:
# preprocessing.LabelEncoder：标签专用，能够将分类转换为分类数值
from sklearn.preprocessing import LabelEncoder
import pandas as pd
data = pd.read_csv(r"./data/Narrativedata.csv",index_col=0)

In [15]:
data.sample(10)

Unnamed: 0,Age,Sex,Embarked,Survived
409,,female,S,No
833,23.0,male,S,No
617,26.0,female,S,No
819,10.0,male,S,No
705,39.0,male,S,No
287,22.0,male,S,No
207,26.0,male,C,Yes
760,,male,S,No
601,,male,S,No
437,24.0,female,S,Yes


In [16]:
y = data.iloc[:,-1]                         #要输入的是标签，不是特征矩阵，所以允许一维

In [17]:
le = LabelEncoder()

In [18]:
label = le.fit_transform(y)
label

array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,
       2, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1,
       2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2,
       2, 0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 1, 0,
       0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0,
       1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 0,
       2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1,
       0, 0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0,
       2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2,

In [19]:
le.inverse_transform(label)  # 使用inverse_transform可以逆转

array(['No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'Unknown', 'Yes', 'No', 'No', 'No', 'Unknown', 'No', 'Yes', 'No',
       'Yes', 'Unknown', 'Yes', 'Yes', 'Yes', 'No', 'Unknown', 'No', 'No',
       'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Unknown', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Unknown', 'No',
       'Unknown', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Unknown', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No',
       'Yes', 'Yes', 'No', 'Unknown', 'No', 'No', 'Yes', 'No', 'N

In [20]:
# 让标签等于我们运行出来的结果
data.iloc[:,-1] = LabelEncoder().fit_transform(data.iloc[:,-1])

In [21]:
data.sample(10)

Unnamed: 0,Age,Sex,Embarked,Survived
670,40.0,female,S,2
662,47.0,male,S,0
603,44.0,male,S,0
835,39.0,female,C,2
6,54.0,male,S,0
505,18.0,male,C,1
748,19.0,male,S,0
1,38.0,female,C,2
700,18.0,female,C,2
699,42.0,male,S,0


In [22]:
# preprocessing.OrdinalEncoder：特征专用，能够将分类特征转换为分类数值
from sklearn.preprocessing import OrdinalEncoder
#接口categories_对应LabelEncoder的接口classes_，一模一样的功能
data_ = data.copy()
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [23]:
data_.loc[:,"Age"] = data_.loc[:,"Age"].fillna(data_.loc[:,"Age"].median())
#.fillna 在DataFrame里面直接进行填补
data_.dropna(axis=0,inplace=True)
#.dropna(axis=0)删除所有有缺失值的行，.dropna(axis=1)删除所有有缺失值的列
#参数inplace，为True表示在原数据集上进行修改，为False表示生成一个复制对象，不修改原数据，默认False
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [24]:
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0


In [25]:
# preprocessing.OneHotEncoder 独热编码
from sklearn.preprocessing import OneHotEncoder
data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
#.fillna 在DataFrame里面直接进行填补
data.dropna(axis=0,inplace=True)

X = data.iloc[:,1:-1]
 
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [26]:
#axis=1,表示跨行进行合并，也就是将两表左右相连，如果是axis=0，就是将量表上下相连
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
 
newdata.head()
 
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)
 
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]
 
newdata.sample(10)

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
365,30.0,0.0,1.0,0.0,1.0,0.0,0.0
523,44.0,2.0,0.0,1.0,1.0,0.0,0.0
84,17.0,2.0,1.0,0.0,0.0,0.0,1.0
856,45.0,2.0,1.0,0.0,1.0,0.0,0.0
458,50.0,2.0,0.0,1.0,0.0,1.0,0.0
853,16.0,2.0,1.0,0.0,0.0,0.0,1.0
472,33.0,2.0,1.0,0.0,1.0,0.0,0.0
680,28.0,0.0,0.0,1.0,1.0,0.0,0.0
357,38.0,0.0,1.0,0.0,0.0,1.0,0.0
64,28.0,0.0,0.0,1.0,1.0,0.0,0.0


特征选择

In [27]:
#导入数据，让我们使用digit recognizor数据来一展身手
import pandas as pd

data = pd.read_csv(r"./data/digit recognizor.csv") 

In [28]:
X = data.iloc[:,1:]
y = data.iloc[:,0] 
X.shape

(42000, 784)

In [29]:
X.sample(5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
9515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23754,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31703,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# 方差过滤  消除方差为0
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()                      #实例化，不填参数默认方差为0
X_var0 = selector.fit_transform(X)                  #获取删除不合格特征之后的新特征矩阵
 
#也可以直接写成 X = VairanceThreshold().fit_transform(X)
 
X_var0.shape#(42000, 708)

(42000, 708)

In [32]:
# 望留下一半的特征，那可以设定一个让特征总数减半的方差阈值，只要找到特征方差的中位数，再将这个中位数作为参数threshold的值输入
import numpy as np

# X.var()#每一列的方差
# print(X.var().values)
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
 

np.median(X.var().values)
 
X_fsvar.shape#(42000, 392)

(42000, 392)

In [35]:
# 随机森林方差过滤前
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

cross_val_score(RFC(n_estimators=10,random_state=0),X,y,cv=5).mean()

0.9373571428571429

In [38]:
#  随机森林方差过滤后

cross_val_score(RFC(n_estimators=10,random_state=0),X_fsvar,y,cv=5).mean()

0.9390476190476191

为什么随机森林运行如此之快？为什么方差过滤对随机森林没很大的有影响？这是由于两种算法的原理中涉及到的
计算量不同。最近邻算法KNN，单棵决策树，支持向量机SVM，神经网络，回归算法，都需要遍历特征或升维来进
行运算，所以他们本身的运算量就很大，需要的时间就很长，因此方差过滤这样的特征选择对他们来说就尤为重
要。但对于不需要遍历特征的算法，比如随机森林，它随机选取特征进行分枝，本身运算就非常快速，因此特征选
择对它来说效果平平。这其实很容易理解，无论过滤法如何降低特征的数量，随机森林也只会选取固定数量的特征
来建模；而最近邻算法就不同了，特征越少，距离计算的维度就越少，模型明显会随着特征的减少变得轻量。因
此，过滤法的主要对象是：需要遍历特征或升维的算法们，而过滤法的主要目的是：在维持算法表现的前提下，帮
助算法们降低计算成本

In [39]:
# 卡方过滤
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
 
#假设在这里我一直我需要300个特征
X_fschi = SelectKBest(chi2, k=300).fit_transform(X_fsvar, y)
X_fschi.shape

(42000, 300)

In [40]:
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()

0.9344761904761905

In [None]:
# 挑选最佳k值
import matplotlib.pyplot as plt

score = []
for i in range(390,200,-10):
    X_fschi = SelectKBest(chi2, k=i).fit_transform(X_fsvar, y)
    once = cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()
    score.append(once)
plt.plot(range(390,200,-10),score)

plt.show()

In [41]:
# chi2实例化后的模型中获得各个特征所对应的卡方值和P值
chivalue, pvalues_chi = chi2(X_fsvar,y)

In [43]:
#k取多少？我们想要消除所有p值大于设定值，比如0.05或0.01的特征：
k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()

In [44]:
k

392

In [48]:
# embedded 嵌入法
from sklearn.feature_selection import SelectFromModel

RFC_ = RFC(n_estimators =10,random_state=0)
 
X_embedded = SelectFromModel(RFC_,threshold=0.001).fit_transform(X,y)
 
#在这里我只想取出来有限的特征。0.005这个阈值对于有780个特征的数据来说，是非常高的阈值，因为平均每个特征
# 只能够分到大约0.001的feature_importances_
 
X_embedded.shape

(42000, 279)

In [49]:
cross_val_score(RFC_,X_embedded,y,cv=5).mean()

0.9386904761904763

In [50]:
X_embedded = SelectFromModel(RFC_,threshold=0.000564).fit_transform(X,y)
X_embedded.shape
#我们可能已经找到了现有模型下的最佳结果，如果我们调整一下随机森林的参数呢？
cross_val_score(RFC(n_estimators=100,random_state=0),X_embedded,y,cv=5).mean()

0.9634285714285715

In [51]:
# wrapper 包装法
from sklearn.feature_selection import RFE

RFC_ = RFC(n_estimators =10,random_state=0)
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(X, y)

selector.support_.sum()#340

selector.ranking_

X_wrapper = selector.transform(X)

cross_val_score(RFC_,X_wrapper,y,cv=5).mean()

0.9379761904761905