In [33]:
# In this notebook I am going to use the following data
# http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data

# The name of this data set is: Breast Cancer Wisconsin (Diagnostic) Data Set
# The columns are the following

#    #  Attribute                     Domain
#    -- -----------------------------------------
#    1. Sample code number            id number
#    2. Clump Thickness               1 - 10
#    3. Uniformity of Cell Size       1 - 10
#    4. Uniformity of Cell Shape      1 - 10
#    5. Marginal Adhesion             1 - 10
#    6. Single Epithelial Cell Size   1 - 10
#    7. Bare Nuclei                   1 - 10
#    8. Bland Chromatin               1 - 10
#    9. Normal Nucleoli               1 - 10
#   10. Mitoses                       1 - 10
#   11. Class:                        (2 for benign, 4 for malignant)

import pandas as pd
import seaborn as sns
import numpy as np

# df = pd.read_csv('../data/wdbc.csv', names=['ID', 'Diagnosis', 'radius', 'texture', 'perimeter', 'area', 'smoothness', 
#                                            'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension'],
#                header=None);
header_names = ['ID', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 
                'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
df = pd.read_csv('../data/wdbc.csv', header=None, names=header_names, na_values=["?"])
df['Class'] = df['Class'].replace(2, 0)
df['Class'] = df['Class'].replace(4, 1)
def pr(T):
    any(x.name == "?" for x in T)
# Left off here - some rows have a '?' which can't be trained by sklearn - remove rows with these question marks
# df[(df['Clump Thickness'] != '?') & (df['Uniformity of Cell Size'] != '?') & (df['Uniformity of Cell Shape'] != '?') &
#    (df['Marginal Adhesion'] != '?') & (df['Single Epithelial Cell Size'] != '?') & (df['Bare Nuclei'] != '?') & 
#    (df['Bland Chromatin'] != '?') & (df['Normal Nucleoli'] != '?') & (df['Mitoses'] != '?') & 
#    (df['Class'] != '?')]
df.describe()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0,699.0
mean,1071704.098712,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,0.344778
std,617095.729819,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078,0.475636
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,1.0
max,13454352.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


In [45]:
# preprocessing 
from sklearn import cross_validation
from sklearn.preprocessing import Imputer

# 1 - shuffle and then split data into training and testing data
shuffled = df.iloc[np.random.permutation(len(df))]

# 2 - using Imputer, replace NaN with average for the column
imp = Imputer(missing_values='NaN', strategy='median', axis=1)
fit_data = imp.fit(shuffled)
clean_data = imp.transform(shuffled)
clean_df = pd.DataFrame(clean_data, columns=header_names)

features = clean_df.drop(['Class', 'ID'], 1)
labels = clean_df['Class']
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.4, random_state=0)

array([[ 10.,   8.,   4., ...,   3.,  10.,   4.],
       [  3.,   1.,   4., ...,   1.,   1.,   1.],
       [  2.,   1.,   1., ...,   3.,   1.,   1.],
       ..., 
       [  9.,  10.,  10., ...,  10.,  10.,  10.],
       [  5.,   8.,   7., ...,   5.,   7.,   1.],
       [  3.,   1.,   1., ...,   1.,   1.,   1.]])

In [48]:
# Use support vector machine to classify data
from sklearn import svm

clf1 = svm.SVC()
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)

0.94285714285714284

In [49]:
# ordinary least squares: http://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares
from sklearn import linear_model

clf2 = linear_model.LinearRegression()
clf2.fit(X_train, y_train)
clf2.score(X_train, y_train)

0.86959137564921929

In [50]:
# Bayesian Ridge Regression: http://scikit-learn.org/stable/modules/linear_model.html#bayesian-regression

clf3 = linear_model.BayesianRidge()
clf3.fit(X_train, y_train)
clf3.score(X_train, y_train)

0.8695655381723647

In [51]:
# Automatic relevance determination: http://scikit-learn.org/stable/modules/linear_model.html#automatic-relevance-determination-ard
# http://papers.nips.cc/paper/3372-a-new-view-of-automatic-relevance-determination

clf4 = linear_model.ARDRegression()
clf4.fit(X_train, y_train)
clf4.score(X_train, y_train)

0.86717601125567323

In [52]:
# logisitic regression: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

clf4 = linear_model.LogisticRegression()
clf4.fit(X_train, y_train)
clf4.score(X_train, y_train)

0.97374701670644392

In [53]:
# Stochastic Gradient Descent: http://scikit-learn.org/stable/modules/linear_model.html#stochastic-gradient-descent-sgd

clf5 = linear_model.SGDClassifier()
clf5.fit(X_train, y_train)
clf5.score(X_train, y_train)

0.85202863961813846

In [54]:
# Nearest Neighbor Classifiers: 
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
from sklearn import neighbors

clf6 = neighbors.KNeighborsClassifier()
clf6.fit(X_train, y_train)
clf6.score(X_train, y_train)

0.9880668257756563