In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
%matplotlib inline

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import ensemble

## Model Cancer Diagnosis

I want to make a model that can predict a whether or not a tumor is benign, based on certain measurements. I will use data from the Wisconson Breat Cancer Research Association to train my model. The data contains measures on the tumors of 699 subjects and whether or not they were malignant.

In [2]:
cancer = pd.read_csv('C:\Code\Data\\breast_cancer.csv')
display(cancer.head())
print(cancer.shape)

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


(698, 11)


They data apparently needs much cleaning, having what should be one of the rows as the column names. Also, I will need my data to be in all numeric format to be able to properly work with it.

In [29]:
for i in range(0,len(cancer.columns)):
    try:
        cancer.iloc[:,i].astype(float)
    except:
        print(cancer.columns[i])

1.3


In [31]:
print(cancer['1.3'].value_counts())

1     401
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: 1.3, dtype: int64


It seems the feature mislabeled '1.3', which should be called 'Bare Nuclei', cannot be converted into a float type because it has many values marked with a '?', 16 in fact.  With how little data I have, I am hesitant to throw away any rows, but at the same time, I do not want to throw away any features, either.

To make the best model I can, I will test how my model will function when I drop the problem data points to preserve the most features, versus when I drop the whole problem feature column, preserving the most data points.

## Making a dataset dropping problem data points

I will call this data set the 'dropped row' or 'dr' version of my data

In [5]:
null_index = cancer[cancer['1.3']=='?'].index
null_index = list(null_index)
cancer_dr = cancer.drop(null_index)

In [6]:
newrow_dr = cancer_dr.columns

In [7]:
cancer_dr.columns = ['IDNum','Thickness','Size_Uniformity','Shape_Uniformity','Adhesion','Epithelial_Size',
                     'Bare Nuclei','Chromatin','Normal_Nucleoli','Mitoses','Class']

In [8]:
newrow_dr = pd.Series(newrow_dr)
newrow_dr = newrow_dr.astype(float)
newrow_dr.index = ['IDNum','Thickness','Size_Uniformity','Shape_Uniformity','Adhesion','Epithelial_Size',
                     'Bare Nuclei','Chromatin','Normal_Nucleoli','Mitoses','Class']
cancer_dr = cancer_dr.append(newrow_dr, ignore_index=True)

In [9]:
cancer_dr.Class = np.where(cancer_dr.Class==4,1,0)

In [10]:
display(cancer_dr.head())

Unnamed: 0,IDNum,Thickness,Size_Uniformity,Shape_Uniformity,Adhesion,Epithelial_Size,Bare Nuclei,Chromatin,Normal_Nucleoli,Mitoses,Class
0,1002945.0,5.0,4.0,4.0,5.0,7.0,10,3.0,2.0,1.0,0
1,1015425.0,3.0,1.0,1.0,1.0,2.0,2,3.0,1.0,1.0,0
2,1016277.0,6.0,8.0,8.0,1.0,3.0,4,3.0,7.0,1.0,0
3,1017023.0,4.0,1.0,1.0,3.0,2.0,1,3.0,1.0,1.0,0
4,1017122.0,8.0,10.0,10.0,8.0,7.0,10,9.0,7.0,1.0,1


In [32]:
print(cancer_dr['Class'].value_counts())

0    444
1    239
Name: Class, dtype: int64


There are now only 683 data points in this new data set, but it has kept more of its variance this way.

## Make a dataset dropping problem feature

I will call this data set the 'dropped column' or 'dc' version of my data

In [12]:
cancer_dc = cancer.drop('1.3',axis=1)

In [13]:
newrow_dc = cancer_dc.columns

In [14]:
cancer_dc.columns = ['IDNum','Thickness','Size_Uniformity','Shape_Uniformity','Adhesion',
                  'Epithelial_Size','Chromatin','Normal_Nucleoli','Mitoses','Class']

In [15]:
newrow_dc = pd.Series(newrow_dc)
newrow_dc = newrow_dc.astype(float)
newrow_dc.index = ['IDNum','Thickness','Size_Uniformity','Shape_Uniformity','Adhesion',
                  'Epithelial_Size','Chromatin','Normal_Nucleoli','Mitoses','Class']
cancer_dc = cancer_dc.append(newrow_dc, ignore_index=True)

In [16]:
cancer_dc.Class = np.where(cancer_dc.Class==4,1,0)

In [17]:
display(cancer_dc.head())

Unnamed: 0,IDNum,Thickness,Size_Uniformity,Shape_Uniformity,Adhesion,Epithelial_Size,Chromatin,Normal_Nucleoli,Mitoses,Class
0,1002945.0,5.0,4.0,4.0,5.0,7.0,3.0,2.0,1.0,0
1,1015425.0,3.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,0
2,1016277.0,6.0,8.0,8.0,1.0,3.0,3.0,7.0,1.0,0
3,1017023.0,4.0,1.0,1.0,3.0,2.0,3.0,1.0,1.0,0
4,1017122.0,8.0,10.0,10.0,8.0,7.0,9.0,7.0,1.0,1


In [18]:
X_dr = cancer_dr.drop(['IDNum','Class'], axis=1)
Y_dr = cancer_dr.Class

X_dc = cancer_dc.drop(['IDNum','Class'], axis=1)
Y_dc = cancer_dc.Class

This data set has the same amount of data points as the original data set, but now it only has 10 features instead of 11.

## Lasso Logistic Regression

I will use Lasso Logistic Regression Modeling to predict wether or not a tumor is malignant.

In [19]:
grid = [.01,.1, 1, 10,100,200,300,500,700, 1000, 10000] 
out = [] 
for c in grid: 
    lrl = linear_model.LogisticRegression(penalty='l1',C=c) 
    lrl.fit(X_dr, Y_dr) 
    scores = cross_val_score(lrl, X_dr, Y_dr) 
    out.append(scores.mean()) 
    bestc = grid[out.index(max(out))]

lrl = linear_model.LogisticRegression(penalty='l1',C=bestc) 
lrl.fit(X_dr,Y_dr)
print('The Model was fit with C = ',bestc)

Y_predlrl = lrl.predict(X_dr)

The Model was fit with C =  1


In [20]:
ben, mal = confusion_matrix(Y_dr, Y_predlrl)
if mal[0] == 0:
    type2 = 0
else:
    type2 = mal[0]/(mal[0]+mal[1])
print("Dropping Problem Rows")
print('\nLasso Logistic Error Score:')
print(lrl.score(X_dr,Y_dr))
print('\nType II Error Percentage:')
print(round(type2*100,2),"%")
print('\nConfusion Matrix:')
print(confusion_matrix(Y_dr, Y_predlrl))
cvscore = cross_val_score(lrl, X_dr, Y_dr)
print('\nCross Validation Score:')
print('{}% +/- {}%'.format(round(cvscore.mean(),2),round(cvscore.std()*2,2)))

Dropping Problem Rows

Lasso Logistic Error Score:
0.9707174231332357

Type II Error Percentage:
4.18 %

Confusion Matrix:
[[434  10]
 [ 10 229]]

Cross Validation Score:
0.96% +/- 0.04%


In [33]:
grid = [.01,.1, 1, 10,100,200,300,500,700, 1000, 10000] 
out = [] 
for c in grid: 
    lrl = linear_model.LogisticRegression(penalty='l1',C=c) 
    lrl.fit(X_dc, Y_dc) 
    scores = cross_val_score(lrl, X_dc, Y_dc) 
    out.append(scores.mean()) 
    bestc = grid[out.index(max(out))]

lrl = linear_model.LogisticRegression(penalty='l1',C=bestc) 
lrl.fit(X_dc,Y_dc)
print('The Model was fit with C = ',bestc)

Y_predlrl = lrl.predict(X_dc)

The Model was fit with C =  0.1


In [35]:
ben, mal = confusion_matrix(Y_dc, Y_predlrl)
if mal[0] == 0:
    type2 = 0
else:
    type2 = mal[0]/(mal[0]+mal[1])
print("Dropping Problem Columns")
print('\nLasso Logistic Error Score:')
print(lrl.score(X_dc,Y_dc))
print('\nType II Error Percentage:')
print(round(type2*100,2),"%")
print('\nConfusion Matrix:')
print(confusion_matrix(Y_dc, Y_predlrl))
cvscore = cross_val_score(lrl, X_dc, Y_dc)
print('\nCross Validation Score:')
print('{}% +/- {}%'.format(round(cvscore.mean(),2),round(cvscore.std()*2,2)))

Dropping Problem Columns

Lasso Logistic Error Score:
0.9613733905579399

Type II Error Percentage:
5.39 %

Confusion Matrix:
[[444  14]
 [ 13 228]]

Cross Validation Score:
0.95% +/- 0.05%


The Lasso Logistic Regression model appears to perform ever so slightly better when dropping rows than columns from the data.

# Random Forest

I will use Random Forest Regression modeling to predict whether or not a tumor is malignant.

In [25]:
rfc = ensemble.RandomForestClassifier()

rfc.fit(X_dr,Y_dr)

Y_predrfc = rfc.predict(X_dr)

In [26]:
ben, mal = confusion_matrix(Y_dr, Y_predrfc)
if mal[0] == 0:
    type2 = 0
else:
    type2 = mal[0]/(mal[0]+mal[1])
print("Dropping Problem Rows")
print('\nRandom Forest Error Score:')
print(rfc.score(X_dr,Y_dr))
print('\nType II Error Percentage:')
print(round(type2*100,2),"%")
print('\nConfusion Matrix:')
print(confusion_matrix(Y_dr, Y_predrfc))
cvscore = cross_val_score(rfc, X_dr, Y_dr)
print('\nCross Validation Score:')
print('{}% +/- {}%'.format(round(cvscore.mean(),2),round(cvscore.std()*2,2)))

Dropping Problem Rows

Random Forest Error Score:
0.9956076134699854

Type II Error Percentage:
0 %

Confusion Matrix:
[[441   3]
 [  0 239]]

Cross Validation Score:
0.96% +/- 0.04%


In [27]:
rfc = ensemble.RandomForestClassifier()

rfc.fit(X_dc,Y_dc)

Y_predrfc = rfc.predict(X_dc)

In [28]:
ben, mal = confusion_matrix(Y_dc, Y_predrfc)
if mal[0] == 0:
    type2 = 0
else:
    type2 = mal[0]/(mal[0]+mal[1])
print("Dropping Problem Columns")
print('Random Forest Error Score:')
print(rfc.score(X_dc,Y_dc))
print('\nType II Error Percentage:')
print(round(type2*100,2),"%")
print('\nConfusion Matrix:')
print(confusion_matrix(Y_dc, Y_predrfc))
cvscore = cross_val_score(rfc, X_dc, Y_dc)
print('\nCross Validation Score:')
print('{}% +/- {}%'.format(round(cvscore.mean(),2),round(cvscore.std()*2,2)))

Dropping Problem Columns
Random Forest Error Score:
0.9971387696709585

Type II Error Percentage:
0 %

Confusion Matrix:
[[456   2]
 [  0 241]]

Cross Validation Score:
0.95% +/- 0.05%


The Random Forest model also appears to perform ever so slightly better when problem rows are dropped over problem columns. Also, the Random Forest model simply performs better overall than the Lasso Logistic Regression model.

Both models appear to work best when dropping the problem data rows than when dropping the features. This is intuitive because when dropping the problem data rows in this instance, only 16 * 11 = 176 data points are lost. But, when dropping the problem feature 699 * 1 = 699 data points are lost. 

http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29