# Cancer Detection Project

In [34]:
import numpy as np 
import pandas as pd 

In [35]:
col = [
    'id', 'clump thickness', 'uniformity of cell size', 'uniformity of cell shape', 'marginal adhesion', 'single epithelial cell size',
    'bare nuclei', 'bland chromatin', 'normal nucleoli', 'mitoses', 'class'
]
df = pd.read_csv('breast-cancer-wisconsin.data', names=col, header=None)

In [36]:
df.head()

Unnamed: 0,id,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## Preprocessing

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   clump thickness              699 non-null    int64 
 2   uniformity of cell size      699 non-null    int64 
 3   uniformity of cell shape     699 non-null    int64 
 4   marginal adhesion            699 non-null    int64 
 5   single epithelial cell size  699 non-null    int64 
 6   bare nuclei                  699 non-null    object
 7   bland chromatin              699 non-null    int64 
 8   normal nucleoli              699 non-null    int64 
 9   mitoses                      699 non-null    int64 
 10  class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [38]:
# look at single object datatype
df['bare nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare nuclei, dtype: int64

In [39]:
# rm ?
df[df['bare nuclei'] == '?']

Unnamed: 0,id,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [40]:
df['bare nuclei'].replace('?', np.NaN, inplace=True)
df = df.dropna()
df['bare nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: bare nuclei, dtype: int64

In [41]:
# 2 is benign, 4 is malignant --> change to 0 or 1
df['class'] = df['class'] / 2 - 1

In [42]:
df['class'].value_counts()

0.0    444
1.0    239
Name: class, dtype: int64

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           683 non-null    int64  
 1   clump thickness              683 non-null    int64  
 2   uniformity of cell size      683 non-null    int64  
 3   uniformity of cell shape     683 non-null    int64  
 4   marginal adhesion            683 non-null    int64  
 5   single epithelial cell size  683 non-null    int64  
 6   bare nuclei                  683 non-null    object 
 7   bland chromatin              683 non-null    int64  
 8   normal nucleoli              683 non-null    int64  
 9   mitoses                      683 non-null    int64  
 10  class                        683 non-null    float64
dtypes: float64(1), int64(9), object(1)
memory usage: 64.0+ KB


In [44]:
x = df.drop(['id', 'class'], axis=1)

In [45]:
x_col = x.columns

In [46]:
y = df['class']

In [47]:
from sklearn.preprocessing import StandardScaler

In [48]:
X = StandardScaler().fit_transform(x.values)

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
df1 = pd.DataFrame(X, columns=x_col)

In [51]:
df1.head()

Unnamed: 0,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [52]:
X_train, X_test, y_train, y_test = train_test_split(df1, y, train_size=0.8, random_state=42)

In [53]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id', 'class'], axis=1).values), columns=x_col).head()

Unnamed: 0,clump thickness,uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


In [54]:
from sklearn.neighbors import KNeighborsClassifier

In [55]:
knn = KNeighborsClassifier(
    n_neighbors=5, p=2, metric='minkowski'
)

In [56]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [57]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn import preprocessing

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [58]:
printScore(knn, X_train, X_test, y_train, y_test)
printScore(knn, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.97

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion Matrix: 
 [[358   7]
 [  8 173]] 

ROC AUC: 0.9683

Test Results:

Accuracy: 0.96

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[78  1]
 [ 5 53]] 

ROC AUC: 0.9506



# Grid Search

In [59]:
from sklearn.model_selection import GridSearchCV

In [60]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [61]:
params = {
    'n_neighbors' : list(range(1,11))
}

In [62]:
grid_search_cv = GridSearchCV(
    KNeighborsClassifier(), params, n_jobs=-1, verbose=1, cv=10
)

In [63]:
grid_search_cv.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             verbose=1)

In [64]:
grid_search_cv.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 7,
 'p': 2,
 'weights': 'uniform'}

In [65]:
printScore(grid_search_cv, X_train, X_test, y_train, y_test)
printScore(grid_search_cv, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.97

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion Matrix: 
 [[358   7]
 [  8 173]] 

ROC AUC: 0.9683

Test Results:

Accuracy: 0.96

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion Matrix: 
 [[78  1]
 [ 5 53]] 

ROC AUC: 0.9506

