##  一、特征工程

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Column labels for the data file: a sample id first, nine feature columns,
# and the 'Class' label in the last position.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Location of the raw data file, kept in one place so it is easy to swap for a
# local copy.
DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'breast-cancer-wisconsin/breast-cancer-wisconsin.data')

# Passing names= makes pandas treat every line of the file as data (no header
# row is consumed) and label the columns with column_names.
data = pd.read_csv(DATA_URL, names=column_names)

# Single-argument print(...) behaves the same on Python 2 and Python 3, whereas
# the bare `print x` statement is a SyntaxError on Python 3.
# DataFrame.info() writes its report itself and returns None, which is why a
# trailing "None" appears in the cell output.
print(data.info())
print(data.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code number             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB
None
   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             10162

In [4]:
# Replace the '?' placeholder with NaN, the standard missing-value marker,
# so that pandas' missing-data tools can see those entries.
data = data.replace(to_replace='?', value=np.nan)
# Function-call form of print works on both Python 2 and Python 3.
# info() prints its own report and returns None, hence the trailing "None".
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code number             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    683 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB
None


In [5]:
# Drop every row that contains at least one missing value.
data = data.dropna(how='any')
# Function-call form of print works on both Python 2 and Python 3.
# info() prints its own report and returns None, hence the trailing "None".
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
Sample code number             683 non-null int64
Clump Thickness                683 non-null int64
Uniformity of Cell Size        683 non-null int64
Uniformity of Cell Shape       683 non-null int64
Marginal Adhesion              683 non-null int64
Single Epithelial Cell Size    683 non-null int64
Bare Nuclei                    683 non-null object
Bland Chromatin                683 non-null int64
Normal Nucleoli                683 non-null int64
Mitoses                        683 non-null int64
Class                          683 non-null int64
dtypes: int64(10), object(1)
memory usage: 64.0+ KB
None


In [9]:
# Randomly split the data set into a training set and a test set.

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Columns 1..9 are the features (index 0, the sample id, is left out);
# column 10 ('Class') is the target.
tumor_feature = data[column_names[1:10]]
tumor_target = data[column_names[10]]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tumor_feature, tumor_target, test_size=0.25, random_state=33)

## 二、建模
使用逻辑回归与随机梯度参数估计两种方法对上述数据进行训练

In [12]:
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [14]:
# Display the training feature frame (still unscaled at this point) as a quick visual check.
X_train

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
662,1,1,3,1,2,1,2,1,1
282,1,4,3,10,4,10,5,6,1
542,5,3,1,1,2,1,1,1,1
301,1,1,1,1,2,1,3,1,1
95,1,1,1,1,2,1,3,1,1
591,2,5,7,6,4,10,7,6,1
356,5,3,3,1,3,3,3,3,3
17,4,1,1,1,2,1,3,1,1
582,6,10,5,5,4,10,6,10,1
460,5,1,1,3,2,1,1,1,1


In [15]:
# Standardize the data so that every dimension has zero mean and unit variance;
# this keeps features with large raw values from dominating the prediction.

# Learn the scaling statistics from the training split only ...
ss = StandardScaler().fit(X_train)

# ... then apply that same fitted scaler to both splits.
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [17]:
# Initialise LogisticRegression and SGDClassifier.
lr = LogisticRegression()
# SGD shuffles the training data between epochs, so without a seed every run
# produces a slightly different model and different metrics. Reuse the seed
# already used for the train/test split to make the results reproducible.
sgdc = SGDClassifier(random_state=33)

# Train the logistic-regression model and predict on the held-out test set.
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

# Train the SGD classifier and predict on the same test set.
sgdc.fit(X_train, y_train)
sgdc_predict = sgdc.predict(X_test)

## 三、模型评估

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [22]:
# Confusion matrices for both models: rows are the true classes, columns the
# predicted classes. Single-argument print(...) is used so the cell runs on
# both Python 2 and Python 3.
print("lr 混淆矩阵")
lr_confusion = confusion_matrix(y_test, lr_predict)
print(lr_confusion)
print("SGD 混淆矩阵")
sgdc_confusion = confusion_matrix(y_test, sgdc_predict)
print(sgdc_confusion)

lr 混淆矩阵
[[99  1]
 [ 1 70]]
SGD 混淆矩阵
[[98  2]
 [ 1 70]]


In [25]:
# Overall accuracy of the logistic-regression predictions on the test set.
# Single-argument print(...) runs on both Python 2 and Python 3.
print(accuracy_score(y_test, lr_predict))
print('每个分类的准确率、召回率、f1-score')
# Per-class precision, recall and f1-score. target_names are matched to the
# sorted class labels, so 'Benign' is assumed to be the smaller class code --
# TODO confirm against the dataset description.
print(classification_report(y_test, lr_predict, target_names=['Benign', 'Malignant']))

0.988304093567
每个分类的准确率、召回率、f1-score
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171



In [26]:
# Overall accuracy of the SGD classifier's predictions on the test set.
# Single-argument print(...) runs on both Python 2 and Python 3.
print(accuracy_score(y_test, sgdc_predict))
print('每个分类的准确率、召回率、f1-score')
# Per-class precision, recall and f1-score. target_names are matched to the
# sorted class labels, so 'Benign' is assumed to be the smaller class code --
# TODO confirm against the dataset description.
print(classification_report(y_test, sgdc_predict, target_names=['Benign', 'Malignant']))

0.982456140351
每个分类的准确率、召回率、f1-score
             precision    recall  f1-score   support

     Benign       0.99      0.98      0.98       100
  Malignant       0.97      0.99      0.98        71

avg / total       0.98      0.98      0.98       171

