### Exploring the Efficacy of Different Machine Learning Models in Detecting Breast Cancer Malignancy Based on Cellular Characteristics
Colin Dailey

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Read in CSV and observe data

In [4]:
# Location of the raw Breast Cancer Wisconsin (Original) data, relative to the notebook
DATA_PATH = 'breast_cancer_wisconsin.csv'
df = pd.read_csv(DATA_PATH)

Wolberg, William. (1992). Breast Cancer Wisconsin (Original). UCI Machine Learning Repository. https://doi.org/10.24432/C5HP4Z.

### Data Dictionary:
1. Sample code number:            ID number
2. Clump Thickness:               1 - 10
3. Uniformity of Cell Size:       1 - 10
4. Uniformity of Cell Shape:      1 - 10
5. Marginal Adhesion:             1 - 10
6. Single Epithelial Cell Size:   1 - 10
7. Bare Nuclei:                   1 - 10
8. Bland Chromatin:               1 - 10
9. Normal Nucleoli:               1 - 10
10. Mitoses:                      1 - 10
11. Class:                        (2 for benign, 4 for malignant)

(Wolberg, 1992)

In [5]:
# Preview the raw frame exactly as loaded from the CSV (columns are still unnamed/numbered)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [6]:
# Summary statistics for the numeric columns; non-numeric (object) columns are omitted by default
df.describe()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [7]:
# Check column dtypes and non-null counts to spot columns that need cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       699 non-null    int64 
 1   1       699 non-null    int64 
 2   2       699 non-null    int64 
 3   3       699 non-null    int64 
 4   4       699 non-null    int64 
 5   5       699 non-null    int64 
 6   6       699 non-null    object
 7   7       699 non-null    int64 
 8   8       699 non-null    int64 
 9   9       699 non-null    int64 
 10  10      699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


Columns will be renamed for supervised learning based on the data dictionary. Also, column 6, though numerical in nature, is an object data type indicating that cleaning is needed.

## Cleaning and Validation...

In [8]:
# Count duplicated rows. With no `subset`, DataFrame.duplicated() compares every
# column (not just the ID column '0') and flags each repeat after the first occurrence.
dup_mask = df.duplicated()
print(f'There are {dup_mask.sum()} duplicates')

There are 8 duplicates


In [9]:
# Remove fully duplicated rows, reporting the row count on either side of the drop
print(f'There are {len(df)} rows before')
df = df.drop_duplicates()
print(f'There are {len(df)} rows after')

There are 699 rows before
There are 691 rows after


In [10]:
# List the distinct values of column '6' to see why it was read as object dtype.
# Column 6 contains '?' values that must be removed
df['6'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [11]:
# Keep only the rows whose column '6' entry is not the '?' placeholder
has_known_value = df['6'].ne('?')
df = df[has_known_value]

Rows whose column 6 value is "?" were dropped. This was done in order to make column 6 consistent with one data type. Rows were dropped rather than having the "?" replaced with a median or special value because of the categorical, ordinal nature of the feature columns.

In [12]:
# Re-check row counts and dtypes after removing duplicates and '?' rows
# (column '6' is still object dtype at this point)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       675 non-null    int64 
 1   1       675 non-null    int64 
 2   2       675 non-null    int64 
 3   3       675 non-null    int64 
 4   4       675 non-null    int64 
 5   5       675 non-null    int64 
 6   6       675 non-null    object
 7   7       675 non-null    int64 
 8   8       675 non-null    int64 
 9   9       675 non-null    int64 
 10  10      675 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 63.3+ KB


In [13]:
# Remove the sample ID column ('0'): it identifies a sample rather than describing it
# (may cause ML models to identify non-existent patterns)
df = df.drop(columns=['0'])

In [14]:
# Confirm the ID column ('0') is gone
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


Rename all columns for supervised learning: copy each numbered column under its data-dictionary name, then drop the numbered originals...

In [15]:
# Create labeled columns: copy each numbered column into a new column carrying its
# data-dictionary name. The numbered originals are left in place by this cell.
labeled_columns = {
    '1': 'Clump Thickness',
    '2': 'Uniformity of Cell Size',
    '3': 'Uniformity of Cell Shape',
    '4': 'Marginal Adhesion',
    '5': 'Single Epithelial Cell Size',
    '6': 'Bare Nuclei',
    '7': 'Bland Chromatin',
    '8': 'Normal Nucleoli',
    '9': 'Mitoses',
    '10': 'Class',
}
for numbered, labeled in labeled_columns.items():
    df[labeled] = df[numbered]

In [16]:
# Show the frame with both the numbered and the newly labeled columns side by side
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4,4,8,6,4,3,4,10,6,1,4


In [17]:
# Drop original unlabeled columns ('1' through '10'), keeping only the labeled copies
numbered_columns = [str(n) for n in range(1, 11)]
df = df.drop(columns=numbered_columns)

In [18]:
# Confirm only the labeled columns remain
df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [19]:
# Replace the numeric class codes in the Class column with readable labels
# (2 -> 'benign', 4 -> 'malignant'); any other value is left unchanged
class_labels = {2: 'benign', 4: 'malignant'}
df['Class'] = df['Class'].replace(class_labels)

In [20]:
# Confirm the Class column now holds 'benign' / 'malignant' labels
df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,benign
1,5,4,4,5,7,10,3,2,1,benign
2,3,1,1,1,2,2,3,1,1,benign
3,6,8,8,1,3,4,3,7,1,benign
4,4,1,1,3,2,1,3,1,1,benign
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,benign
695,2,1,1,1,2,1,1,1,1,benign
696,5,10,10,3,7,3,8,10,2,malignant
697,4,8,6,4,3,4,10,6,1,malignant


In [21]:
# Cast 'Bare Nuclei' from object (digit strings) to an integer column;
# every other column keeps its current dtype
df = df.astype({'Bare Nuclei': int})

In [22]:
# Final check: all feature columns should now be integer dtype, with Class as object
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Clump Thickness              675 non-null    int64 
 1   Uniformity of Cell Size      675 non-null    int64 
 2   Uniformity of Cell Shape     675 non-null    int64 
 3   Marginal Adhesion            675 non-null    int64 
 4   Single Epithelial Cell Size  675 non-null    int64 
 5   Bare Nuclei                  675 non-null    int64 
 6   Bland Chromatin              675 non-null    int64 
 7   Normal Nucleoli              675 non-null    int64 
 8   Mitoses                      675 non-null    int64 
 9   Class                        675 non-null    object
dtypes: int64(9), object(1)
memory usage: 58.0+ KB
