## 加载实例数据

In [41]:
import numpy as np
import urllib.request
# Location of the Pima Indians diabetes CSV in the UCI repository.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# Download and parse inside a `with` block so the HTTP response is closed
# as soon as the CSV has been read into a numpy matrix (the original left
# the connection open for the lifetime of the kernel).
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=",")
# Separate the data from the target attribute (column 8).
# NOTE(review): the slice 0:7 keeps columns 0-6 only, so column 7 ends up in
# neither X nor y. If every non-target column is meant to be a feature this
# should be 0:8 -- confirm before changing, since all outputs recorded below
# were produced with the 7-column X.
X = dataset[:, 0:7]
y = dataset[:, 8]
# Bare last expression so the notebook displays the (abbreviated) matrix.
X

array([[   6.   ,  148.   ,   72.   , ...,    0.   ,   33.6  ,    0.627],
       [   1.   ,   85.   ,   66.   , ...,    0.   ,   26.6  ,    0.351],
       [   8.   ,  183.   ,   64.   , ...,    0.   ,   23.3  ,    0.672],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,  112.   ,   26.2  ,    0.245],
       [   1.   ,  126.   ,   60.   , ...,    0.   ,   30.1  ,    0.349],
       [   1.   ,   93.   ,   70.   , ...,    0.   ,   30.4  ,    0.315]])

## 数据标准化

In [46]:
from sklearn import preprocessing
# Two independent rescalings of the raw feature matrix X; neither call
# modifies X itself, so their order does not matter.
# Standardised copy (preprocessing.scale works per feature column by default).
standardized_X = preprocessing.scale(X)
# Normalised copy (preprocessing.normalize rescales each sample row to unit
# norm by default).
normalized_X = preprocessing.normalize(X)
# Show the normalised matrix as the cell's output.
normalized_X


array([[ 0.03494617,  0.86200564,  0.41935409, ...,  0.        ,
         0.19569858,  0.00365188],
       [ 0.00872683,  0.74178025,  0.57597054, ...,  0.        ,
         0.23213358,  0.00306312],
       [ 0.04093566,  0.93640332,  0.32748532, ...,  0.        ,
         0.11922512,  0.0034386 ],
       ..., 
       [ 0.02727338,  0.66001582,  0.39273669, ...,  0.61092373,
         0.14291252,  0.0013364 ],
       [ 0.0070043 ,  0.8825414 ,  0.42025781, ...,  0.        ,
         0.21082934,  0.0024445 ],
       [ 0.00804902,  0.74855891,  0.56343144, ...,  0.        ,
         0.24469022,  0.00253544]])

## 特征的选取

In [47]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees is a randomised ensemble: without a fixed seed the importances
# below change on every run. Pin random_state so "Restart & Run All"
# reproduces the same numbers.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)
# display the relative importance of each attribute (one value per column of X)
print(model.feature_importances_)

[ 0.13021495  0.26037193  0.11960457  0.09117502  0.08822158  0.16829358
  0.14211838]


In [48]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Base estimator whose coefficients drive the recursive elimination.
model = LogisticRegression()
# Create the RFE model and select 3 attributes. The count is passed by
# keyword: recent scikit-learn releases make n_features_to_select
# keyword-only, so the positional form RFE(model, 3) raises a TypeError
# there, while the keyword form works on old and new versions alike.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X, y)
# Summarize the selection: support_ is a boolean mask over the columns of X,
# ranking_ assigns rank 1 to every selected column.
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


## 逻辑回归

In [49]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the full feature matrix;
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(X, y)
print(model)
# Predict on the same X the model was fitted on, so the figures below
# describe the fit to the training data, not held-out performance.
expected, predicted = y, model.predict(X)
# Per-class precision/recall/F1 first, then the raw confusion matrix.
report = metrics.classification_report(expected, predicted)
print(report)
confusion = metrics.confusion_matrix(expected, predicted)
print(confusion)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768

[[447  53]
 [120 148]]
