In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.optimizers import Adam

In [2]:
# Read the cancer dataset CSV (the copy that still contains missing values) into a DataFrame.
cancer_dataset = pd.read_csv(filepath_or_buffer='../Dataset/dataset_with_missing_values.csv')

In [3]:
# Preview the first five rows to check how the columns were parsed.
cancer_dataset.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,,0.1096,0.1599,0.1974,,...,23.57,25.53,,1709.0,0.1444,0.4245,,0.243,0.3613,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Remove the 'id' column from the frame.
# `columns=` already selects the axis, so the redundant `axis=1` is gone;
# errors='ignore' lets this cell be re-run after 'id' has already been removed
# instead of failing with a KeyError.
cancer_dataset = cancer_dataset.drop(columns='id', errors='ignore')

In [5]:
# Display (n_rows, n_columns) of the frame at this point.
cancer_dataset.shape

(569, 31)

In [6]:
# Count the missing entries in each column.
cancer_dataset.isna().sum()

diagnosis                   0
radius_mean                 0
texture_mean                2
perimeter_mean              0
area_mean                   3
smoothness_mean             7
compactness_mean            3
concavity_mean              5
concave points_mean         6
symmetry_mean               3
fractal_dimension_mean      2
radius_se                   4
texture_se                  3
perimeter_se                4
area_se                     3
smoothness_se              10
compactness_se              6
concavity_se                1
concave points_se           6
symmetry_se                 4
fractal_dimension_se        2
radius_worst                3
texture_worst               1
perimeter_worst             3
area_worst                  4
smoothness_worst            0
compactness_worst           6
concavity_worst             3
concave points_worst        3
symmetry_worst              1
fractal_dimension_worst     3
dtype: int64

In [7]:
# Keep only the rows that have at least (number of columns - 7) non-null
# entries; rows with more gaps than that are discarded.
cancer_dataset = cancer_dataset.dropna(axis='index', thresh=len(cancer_dataset.columns) - 7)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
cancer_dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,567.0,566.0,567.0,565.0,560.0,564.0,562.0,562.0,565.0,565.0,...,565.0,566.0,565.0,563.0,567.0,562.0,565.0,564.0,567.0,565.0
mean,14.125136,19.276802,91.954621,653.818938,0.096334,0.104314,0.088814,0.048951,0.181248,0.062763,...,16.278246,25.666007,107.195929,876.07762,0.132349,0.254885,0.273076,0.114653,0.29012,0.083926
std,3.51779,4.307084,24.256876,351.063762,0.014059,0.052937,0.079881,0.03868,0.027405,0.007052,...,4.828349,6.157515,33.494994,562.794824,0.022867,0.157834,0.208558,0.065642,0.061771,0.018121
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.705,16.17,75.19,420.3,0.08629,0.064815,0.029565,0.020413,0.162,0.05769,...,13.03,21.065,84.16,514.65,0.1166,0.1463,0.1167,0.06473,0.25045,0.07127
50%,13.37,18.825,86.24,551.1,0.095895,0.092525,0.0614,0.0336,0.1793,0.06149,...,14.97,25.37,97.66,684.5,0.1313,0.21485,0.2282,0.100015,0.2822,0.07993
75%,15.78,21.795,103.95,782.6,0.1053,0.1304,0.13035,0.07391,0.1957,0.06612,...,18.79,29.705,125.1,1077.0,0.14605,0.3397,0.3829,0.161675,0.31775,0.09209
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [9]:
# Display (n_rows, n_columns) again to see how many rows remain.
cancer_dataset.shape

(567, 31)

In [10]:
# Encode the diagnosis labels as integers ('B' -> 0, 'M' -> 1); only the
# 'diagnosis' column is touched by the nested mapping.
cancer_dataset = cancer_dataset.replace({'diagnosis': {'B': 0, 'M': 1}})

In [11]:
# Preview the first five rows of the frame with the encoded diagnosis column.
cancer_dataset.head(n=5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [12]:
# Number of rows per diagnosis value, to see how balanced the two classes are.
cancer_dataset['diagnosis'].value_counts()

0    356
1    211
Name: diagnosis, dtype: int64

In [13]:
# Print the per-column non-null counts and dtypes.
cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 567 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                567 non-null    int64  
 1   radius_mean              567 non-null    float64
 2   texture_mean             566 non-null    float64
 3   perimeter_mean           567 non-null    float64
 4   area_mean                565 non-null    float64
 5   smoothness_mean          560 non-null    float64
 6   compactness_mean         564 non-null    float64
 7   concavity_mean           562 non-null    float64
 8   concave points_mean      562 non-null    float64
 9   symmetry_mean            565 non-null    float64
 10  fractal_dimension_mean   565 non-null    float64
 11  radius_se                564 non-null    float64
 12  texture_se               564 non-null    float64
 13  perimeter_se             563 non-null    float64
 14  area_se                  5

In [14]:
# Per-class mean of every feature, grouped by the diagnosis value.
cancer_dataset.groupby(by='diagnosis').mean()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.153152,17.909073,78.120253,463.253239,0.092516,0.080101,0.046136,0.025905,0.174337,0.062809,...,13.390393,23.484592,87.09293,559.001408,0.124941,0.182839,0.16726,0.074583,0.270462,0.079411
1,17.452275,21.595429,115.296019,975.965714,0.102795,0.145443,0.160896,0.087877,0.192843,0.062686,...,21.12327,29.336161,141.179571,1417.241346,0.144847,0.375647,0.450607,0.1822,0.323289,0.091558


In [15]:
# Pairwise Pearson correlation between all columns (diagnosis included).
correlation = cancer_dataset.corr(method='pearson')

In [16]:
# Display the correlation matrix computed from cancer_dataset.
correlation

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,1.0,0.728787,0.413824,0.741467,0.706396,0.353597,0.596668,0.694958,0.775041,0.326928,...,0.775392,0.459928,0.781034,0.736685,0.421163,0.591503,0.657765,0.793252,0.41375,0.324246
radius_mean,0.728787,1.0,0.323519,0.99784,0.987254,0.162324,0.504372,0.677046,0.820347,0.141947,...,0.969361,0.3013,0.964764,0.942853,0.118526,0.410081,0.52368,0.743098,0.158451,0.006475
texture_mean,0.413824,0.323519,1.0,0.329304,0.319688,-0.021691,0.237553,0.303559,0.29791,0.070474,...,0.352338,0.912833,0.355365,0.35298,0.078868,0.280768,0.302181,0.296187,0.105268,0.121171
perimeter_mean,0.741467,0.99784,0.329304,1.0,0.986404,0.198197,0.555489,0.716347,0.849221,0.177472,...,0.969411,0.307323,0.970072,0.9438,0.149583,0.452641,0.560881,0.770306,0.183813,0.050719
area_mean,0.706396,0.987254,0.319688,0.986404,1.0,0.167951,0.497992,0.68589,0.820687,0.149067,...,0.962439,0.289758,0.958712,0.960629,0.119792,0.386768,0.50756,0.71861,0.139671,0.003685
smoothness_mean,0.353597,0.162324,-0.021691,0.198197,0.167951,1.0,0.656557,0.515317,0.545982,0.552162,...,0.207529,0.03582,0.231102,0.204462,0.805856,0.470356,0.430621,0.501229,0.386135,0.498895
compactness_mean,0.596668,0.504372,0.237553,0.555489,0.497992,0.656557,1.0,0.883564,0.830838,0.601063,...,0.536654,0.25083,0.586436,0.518553,0.565717,0.865796,0.815208,0.816448,0.508063,0.690124
concavity_mean,0.694958,0.677046,0.303559,0.716347,0.68589,0.515317,0.883564,1.0,0.920926,0.49686,...,0.692535,0.302449,0.727797,0.683032,0.448863,0.753469,0.883264,0.860722,0.405008,0.515918
concave points_mean,0.775041,0.820347,0.29791,0.849221,0.820687,0.545982,0.830838,0.920926,1.0,0.457589,...,0.828901,0.299109,0.854211,0.813932,0.455229,0.667398,0.751943,0.910103,0.371521,0.372041
symmetry_mean,0.326928,0.141947,0.070474,0.177472,0.149067,0.552162,0.601063,0.49686,0.457589,1.0,...,0.182104,0.091923,0.20233,0.177698,0.426636,0.469969,0.429003,0.426422,0.698398,0.440149


In [17]:
# Convert the DataFrame into a plain NumPy array (rows x columns).
data = cancer_dataset.to_numpy()

In [18]:
# Display (n_rows, n_columns) of the array form of the dataset.
data.shape

(567, 31)

In [19]:
# Indices of every column except column 0 (the label column).
ix = list(range(1, data.shape[1]))

In [20]:
# Feature matrix: all columns listed in ix (everything but column 0).
x = data[:, ix]
# Target vector: column 0, which holds the encoded diagnosis.
y = data[:, 0]

In [21]:
# Report the total number of NaN entries in the feature matrix.
# ndarray.sum() performs the reduction inside NumPy; the built-in sum() would
# walk the flattened array one Python object at a time. The printed text is unchanged.
print('Missing: %d' % np.isnan(x).sum())

Missing: 85


In [22]:
# Define the imputer: KNNImputer configured with n_neighbors=6.
imputer = KNNImputer(n_neighbors=6)

In [23]:
# Fit the imputer on the feature matrix x.
# NOTE(review): x still holds every row at this point -- this fit runs before the
# train/test split further down, so rows that later land in the test set also
# inform the imputation. Consider fitting on the training rows only.
imputer.fit(x)

KNNImputer(n_neighbors=6)

In [24]:
# Apply the fitted imputer to x; xtrans is the feature matrix with the gaps filled in.
xtrans = imputer.transform(x)

In [25]:
# Report the total number of NaN entries left after imputation.
# ndarray.sum() performs the reduction inside NumPy; the built-in sum() would
# walk the flattened array one Python object at a time. The printed text is unchanged.
print('Missing: %d' % np.isnan(xtrans).sum())

Missing: 0


In [26]:
# Hold out 20% of the rows as a test set (fixed random_state for a repeatable split).
# The split is made on the raw features `x`, and the imputer is then re-fitted on
# the training rows only, so nothing from the test rows leaks into the values
# used to fill gaps. The row partition is the same as when splitting the
# pre-imputed matrix, because it depends only on the row count and random_state.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
imputer.fit(x_train)
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)

In [27]:
# Print the shapes of the full feature matrix and of each split, one per line.
print(x.shape, x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

(567, 30)
(453, 30)
(114, 30)
(453,)
(114,)


In [28]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [29]:
# Append a trailing channel axis so each sample has shape (n_features, 1), the
# layout given as input_shape to the first Conv1D layer.
# Row and feature counts are taken from the arrays themselves rather than
# hard-coded, so the cell keeps working if the dataset size or the split ratio
# changes, and it can be re-run on arrays that already have the extra axis.
x_train = x_train.reshape(x_train.shape[0], -1, 1)
x_test = x_test.reshape(x_test.shape[0], -1, 1)

In [30]:
# Number of passes over the training data used when fitting.
epochs = 50

# 1-D convolutional network for binary classification: two
# Conv1D -> BatchNormalization -> Dropout stages, then a dense head that ends
# in a single sigmoid unit.
model = Sequential([
    # Stage 1: 32 filters, kernel size 2; each sample is 30 steps with 1 channel.
    Conv1D(32, 2, activation='relu', input_shape=(30, 1)),
    BatchNormalization(),
    Dropout(0.2),

    # Stage 2: 64 filters, kernel size 2, with heavier dropout.
    Conv1D(64, 2, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # Classification head.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    # Single sigmoid output for the two-class target.
    Dense(1, activation='sigmoid'),
])

In [31]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 29, 32)            96        
_________________________________________________________________
batch_normalization (BatchNo (None, 29, 32)            128       
_________________________________________________________________
dropout (Dropout)            (None, 29, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 28, 64)            4160      
_________________________________________________________________
batch_normalization_1 (Batch (None, 28, 64)            256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 28, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 1792)              0

In [32]:
# Configure training: Adam with a learning rate of 1e-5, binary cross-entropy
# as the loss, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-5),
    metrics=['accuracy'],
)

In [33]:
# Train for `epochs` passes over the training split; after each epoch the
# metrics are also evaluated on (x_test, y_test). The returned object is kept
# in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
