In [9]:
import pandas as pd
import numpy as np

In [10]:
# Names assigned to the 11 columns of the data file, in file order.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]


In [11]:
# Load the data file, supplying the column names explicitly via `names`.
# Missing entries are written as '?' in the file; declaring that through
# `na_values` makes pandas parse them as NaN at load time, so the affected
# column is read as numeric rather than as strings.
data = pd.read_csv('dataset/breast-cancer-wisconsin.data', names=column_names, na_values='?')


In [12]:
# Turn every '?' placeholder into NaN, the standard missing-value marker.
data = data.replace('?', np.nan)

In [13]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [14]:
# Show the (rows, columns) dimensions of the frame after the cleaning steps.
data.shape

(683, 11)

切分数据集为训练集和测试集

In [15]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# X: columns 1-9 selected by name (column 0, the sample code number, is dropped).
# y: column 10, the class label.
# 25% of the rows are held out for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)




In [16]:
# NOTE(review): y_train presumably already is a Series when it comes out of
# train_test_split, which would make this wrap a no-op — confirm before removing.
y_train = pd.Series(y_train)
# Size and class distribution of the training set (label value: count)
y_train.value_counts()

2    344
4    168
Name: Class, dtype: int64

In [17]:
# Class distribution of the test set (label value: count).
# NOTE(review): y_test presumably already is a Series here, so the wrap looks
# redundant — confirm before removing.
y_test = pd.Series(y_test)
y_test.value_counts()

2    100
4     71
Name: Class, dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier # 随机梯度下降

In [19]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied to both splits, so each training
# feature ends up with zero mean and unit variance.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [20]:
# Create the two classifiers that are compared below: logistic regression and
# a linear model trained with stochastic gradient descent.
lr = LogisticRegression()
# SGD training is randomised, so without a fixed seed its scores change from
# run to run. Pin the seed (same value as the train/test split) so the
# reported metrics are reproducible.
sgdc = SGDClassifier(random_state=33)
lr.fit(X_train, y_train)  # train logistic regression on the training split

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
lr_y_predict =lr.predict(X_test)    # predict on the test set and keep the predictions

In [22]:
# Repeat the train-then-predict steps with the SGD classifier: fit on the
# training split, then store its predictions for the test split.
sgdc_y_predict = sgdc.fit(X_train, y_train).predict(X_test)



进行预测结果的性能分析

In [23]:
from sklearn.metrics import classification_report # 度量库

In [24]:
# Accuracy on the test split, from the logistic regression model's own
# score method.
print('Accuracy of LR Classifier:', lr.score(X=X_test, y=y_test))
# The remaining metrics (precision, recall, F1) per class come from
# classification_report.
print(
    classification_report(
        y_true=y_test,
        y_pred=lr_y_predict,
        target_names=['Benign', 'Malignant'],
    )
)


Accuracy of LR Classifier: 0.9883040935672515
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171



In [25]:
# Accuracy on the test split, from the SGD model's own score method.
print('Accuracy of SGD Classifier: ',sgdc.score(X_test,y_test))
# Per-class precision, recall and F1. The second class label was misspelled
# 'Maligant'; it now reads 'Malignant', matching the logistic regression report.
print(classification_report(y_test,sgdc_y_predict,target_names=['Benign','Malignant']))

Accuracy of SGD Classifier:  0.9005847953216374
             precision    recall  f1-score   support

     Benign       0.85      1.00      0.92       100
   Maligant       1.00      0.76      0.86        71

avg / total       0.92      0.90      0.90       171

