## 加载实例数据

In [81]:
import numpy as np
import urllib.request
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.request.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
X = dataset[:,0:7]
y = dataset[:,8]
X 

array([[   6.   ,  148.   ,   72.   , ...,    0.   ,   33.6  ,    0.627],
       [   1.   ,   85.   ,   66.   , ...,    0.   ,   26.6  ,    0.351],
       [   8.   ,  183.   ,   64.   , ...,    0.   ,   23.3  ,    0.672],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,  112.   ,   26.2  ,    0.245],
       [   1.   ,  126.   ,   60.   , ...,    0.   ,   30.1  ,    0.349],
       [   1.   ,   93.   ,   70.   , ...,    0.   ,   30.4  ,    0.315]])

## 数据标准化

In [88]:
from sklearn import preprocessing
# normalize the data attributes
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)
standardized_X
standardized_X

array([[ 0.63994726,  0.84832379,  0.14964075, ..., -0.69289057,
         0.20401277,  0.46849198],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.69289057,
        -0.68442195, -0.36506078],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -0.69289057,
        -1.10325546,  0.60439732],
       ..., 
       [ 0.3429808 ,  0.00330087,  0.14964075, ...,  0.27959377,
        -0.73518964, -0.68519336],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.69289057,
        -0.24020459, -0.37110101],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.69289057,
        -0.20212881, -0.47378505]])

## 特征的选取

In [86]:
# from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.12248003  0.27614495  0.12606338  0.0928807   0.08633828  0.14807645
  0.1480162 ]


In [103]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# create the RFE model and select 3 attributes，and the 3 means it will have 3 feature remains
rfe = RFE(model, 3)
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


## 逻辑回归

In [106]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768

[[447  53]
 [120 148]]


## 以下为了看懂上面的所有点，进行的实践测试。

### 标准化（意义：http://blog.csdn.net/liuweiyuxiang/article/details/77559781）

In [75]:
from sklearn import preprocessing
import numpy as np
X=np.array([[1,-1,2],
            [2,0,0],
            [0,1,-1]])
X_scaled = preprocessing.scale(X)
X_scaled.mean(axis=0)
X_scaled.std(axis=0)
X_scaled



array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [76]:
X=  [[ 1., -1.,  2.],
    [ 2.,  0.,  0.],
    [ 0.,  1., -1.]]
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized
# 按行算：
# [1,-1,2]的L2范数是(1^2+(-1)^2+2^2)^(1/2)=6^(1/2)=2.45
# 第一行的每个元素除以L2范数，得到：
# [1/2.45, -1/2.45, 2/2.45] = [0.4, -0.4, 0.8..]

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

### feature_selection 中的RFE的介绍

In [98]:
#这是官网上的没有说 RFE的support 和ranking
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
n_features_to_select = 4
#递归特征消除法，返回特征选择后的数据
#参数estimator为基模型
#参数n_features_to_select为选择的特征个数
RFE(model, n_features_to_select).fit_transform(iris.data, iris.target)
# fit_transform 先拟合后标准化


##这个比较好理解（http://blog.csdn.net/fontthrone/article/details/79004874）
# print("N_features %s" % selector.n_features_) # 保留的特征数
# print("Support is %s" % selector.support_) # 是否保留
# print("Ranking %s" % selector.ranking_) # 重要程度排名

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

### 理解逻辑回归 LogisticRegression