Load in data and check class ratio

In [194]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, Imputer
from sklearn import svm
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [149]:
canc = pd.read_csv("cancer_uci.csv", index_col=0)
canc.head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,Benign
1,1002945,5,4,4,5,7,10,3,2,1,Benign
2,1015425,3,1,1,1,2,2,3,1,1,Benign
3,1016277,6,8,8,1,3,4,3,7,1,Benign
4,1017023,4,1,1,3,2,1,3,1,1,Benign


In [150]:
canc.Class = canc.Class.map({'Benign':0,'Malignant':1})
canc.Class.value_counts()

0    458
1    241
Name: Class, dtype: int64

In [151]:
# Separate your two classes:
mal_example = canc[canc.Class == 1] # give me the dataframe when Class = 1 
benign_example = canc[canc.Class == 0]

# Oversample the malignant class to have a 50/50 ratio:
mal_over_example = mal_example.sample(458,replace=True)

# Recombine the two frames:
over_sample = pd.concat([mal_over_example,benign_example])

# Sanity check the length:
print len(over_sample)

916


In [152]:
# getting class column into numpy array
y = over_sample.Class.values
# taking class column out of pandas dataframe
over_sample = over_sample.drop('Class')

Obviously the data is not normalized, as there are values that are outside of the 0-1 range. 

In [153]:
# Normalize data
# taking out ID column
over_sample = over_sample.drop('Sample_code_number', axis=1)
over_sample.head()

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
86,3,3,6,4,5,8,4,4,1,1
457,8,8,9,6,6,3,10,10,1,1
184,8,7,6,4,4,10,5,1,1,1
344,7,6,4,8,10,10,9,5,3,1
73,9,4,5,10,6,10,4,8,1,1


In [154]:
over_sample.dtypes

Clump_Thickness                 int64
Uniformity_of_Cell_Size         int64
Uniformity_of_Cell_Shape        int64
Marginal_Adhesion               int64
Single_Epithelial_Cell_Size     int64
Bare_Nuclei                    object
Bland_Chromatin                 int64
Normal_Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [155]:
#for row in over_sample.Bare_Nuclei:
    #print type(row)
    # all rows are strings

In [156]:
#over_sample.Bare_Nuclei.astype(long)
h = over_sample.Bare_Nuclei[over_sample.Bare_Nuclei == '?']

In [157]:
# converting the '?'s to NaNs
over_sample.ix[over_sample.Bare_Nuclei == '?', 'Bare_Nuclei'] = np.nan

In [158]:
# converting to Numpy array
X = over_sample.values

In [159]:
# performing imputation
# creating imputor object
imp = Imputer()
# imputing the data using mean imputation
X = imp.fit_transform(X)

In [160]:
# normalizing data
X = normalize(X)

In [161]:
X[:2,:]

array([[ 0.21594473,  0.21594473,  0.43188945,  0.2879263 ,  0.35990788,
         0.5758526 ,  0.2879263 ,  0.2879263 ,  0.07198158,  0.07198158],
       [ 0.36066785,  0.36066785,  0.40575134,  0.27050089,  0.27050089,
         0.13525045,  0.45083482,  0.45083482,  0.04508348,  0.04508348]])

In [164]:
# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Using SVM algorithm

In [195]:
# getting the SVM algorithm object
model = svm.SVC(kernel='linear')
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

0.88737352370879619

In [196]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [197]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[128  15]
 [ 12 148]]
The accuracy is 0.910891089109


In [180]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       0.91      0.90      0.90       143
          1       0.91      0.93      0.92       160

avg / total       0.91      0.91      0.91       303



In [182]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

0.91005244755244763

Decision Tree algorithm

In [198]:
# getting the Decision tree algorithm object
model = DecisionTreeClassifier()
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

1.0

In [199]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [200]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[143   0]
 [  0 160]]
The accuracy is 1.0


In [201]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       143
          1       1.00      1.00      1.00       160

avg / total       1.00      1.00      1.00       303



In [202]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

1.0

Since the Decision Tree had 100% accuracy, the results are better than the SVM with a linear kernel which had a 91% accuracy.

Random Forest Algorithm

In [203]:
# getting the Random Forest algorithm object
model = RandomForestClassifier()
# performing 10-fold cross-validation (training 10 models) and getting their accuracy (score)
scores = cross_val_score(model, X_train, y_train, cv=10)
# the average accuracy score
scores.mean()

1.0

In [204]:
# fitting on the training set
model.fit(X_train,y_train)
# predicting the test set
predicted = model.predict(X_test)

In [205]:
# getting the confusion matrix
print confusion_matrix(y_test, predicted)
print 'The accuracy is ' + str(accuracy_score(y_test, predicted))

[[143   0]
 [  0 160]]
The accuracy is 1.0


In [206]:
# classification report
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       143
          1       1.00      1.00      1.00       160

avg / total       1.00      1.00      1.00       303



In [207]:
# AUC (area under the curve)
roc_auc_score(y_test, predicted)

1.0

The Random Forest also had an accuracy of 100%, which is again better than the SVM with a linear kernel.