# Importing Dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.optimizers import Adam

In [3]:
# Load the cancer dataset (the variant that contains missing values).
# The path is relative to the directory the notebook is run from.
cancer_dataset = pd.read_csv(
    filepath_or_buffer='../Dataset/dataset_with_missing_values.csv'
)

In [4]:
# Preview the first five rows to check that the file parsed as expected.
cancer_dataset.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Remove the `id` column; only `diagnosis` and the measurement columns are used below.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
cancer_dataset = cancer_dataset.drop(columns='id')

# Checking The Shape Of The Dataset

In [6]:
# (rows, columns) of the frame after `id` has been dropped.
cancer_dataset.shape

(5121, 31)

In [7]:
# Number of missing cells in each column.
cancer_dataset.isna().sum()

diagnosis                   0
radius_mean                 2
texture_mean                4
perimeter_mean              3
area_mean                   4
smoothness_mean            12
compactness_mean            5
concavity_mean             16
concave points_mean        12
symmetry_mean               9
fractal_dimension_mean      7
radius_se                  19
texture_se                  4
perimeter_se                7
area_se                     4
smoothness_se              14
compactness_se             12
concavity_se                7
concave points_se          11
symmetry_se                 7
fractal_dimension_se       11
radius_worst                2
texture_worst               5
perimeter_worst             7
area_worst                  2
smoothness_worst            4
compactness_worst           9
concavity_worst             4
concave points_worst        8
symmetry_worst              5
fractal_dimension_worst     6
dtype: int64

# Dropping Rows With Too Many Missing Values (keep rows with at least ~70% non-missing columns, i.e. 22 of 31)

In [8]:
# Keep only the rows in which at least ~70% of the columns are populated.
# The cut-off is derived from an explicit ratio instead of a bare column offset:
# with 31 columns, ceil(0.7 * 31) = 22 non-null values, which is the same
# threshold as `shape[1] - 9` for this frame.
min_non_null = int(np.ceil(0.7 * cancer_dataset.shape[1]))
cancer_dataset = cancer_dataset.dropna(axis=0, thresh=min_non_null)

In [9]:
# Shape after the row filter; the recorded outputs show 5121 -> 5112 rows (9 removed).
cancer_dataset.shape

(5112, 31)

In [10]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output a few maxima sit far above the 75th
# percentile (e.g. radius_mean 96.57 vs 15.85, smoothness_mean 0.9682 vs 0.1053)
# - possible outliers or corrupted entries; confirm before modelling.
cancer_dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,5112.0,5110.0,5111.0,5111.0,5105.0,5108.0,5100.0,5104.0,5107.0,5106.0,...,5111.0,5109.0,5110.0,5112.0,5110.0,5109.0,5111.0,5110.0,5111.0,5109.0
mean,14.20807,19.291579,91.988445,654.870554,0.096707,0.104426,0.089079,0.04911,0.181158,0.06279,...,16.270562,25.682664,107.273599,880.701995,0.132375,0.254426,0.272382,0.114642,0.290091,0.083963
std,4.270376,4.300208,24.601481,351.751841,0.022238,0.052789,0.080629,0.040825,0.02739,0.007094,...,4.831485,6.144758,33.592748,569.147901,0.022828,0.1573,0.208531,0.065705,0.061834,0.018055
min,6.981,9.71,10.34,143.5,0.05263,0.01938,0.0,0.0,0.106,0.01111,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.1,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.057693,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09362,0.06155,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.85,21.8075,104.1,782.7,0.1053,0.1304,0.1319,0.07404,0.1957,0.066142,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,96.57,39.28,356.1,2501.0,0.9682,0.3454,0.9559,0.95623,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [11]:
# Shape re-check; describe() does not modify the frame, so this matches the
# shape reported right after the row filter.
cancer_dataset.shape

(5112, 31)

# Categorical Encoding

B --> 0

M --> 1

In [12]:
# Encode the target label as an integer: B -> 0, M -> 1.
# A new frame is bound instead of mutating with inplace=True, so the cell is
# safe to re-run. The explicit astype pins the integer dtype rather than
# relying on replace() to downcast the object column (that implicit downcast
# is deprecated in recent pandas); it also fails loudly if a label other than
# 'B' / 'M' is present.
diagnosis_codes = {'B': 0, 'M': 1}
cancer_dataset = cancer_dataset.assign(
    diagnosis=cancer_dataset['diagnosis'].replace(diagnosis_codes).astype('int64')
)

In [13]:
# Preview the first rows again to confirm `diagnosis` now holds 0/1 codes.
cancer_dataset.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [14]:
# Class balance of the encoded target (0 = B, 1 = M); the recorded output shows
# 3206 vs 1906 rows, i.e. the classes are not evenly represented.
cancer_dataset['diagnosis'].value_counts()

0    3206
1    1906
Name: diagnosis, dtype: int64

In [15]:
# Per-column dtype and non-null count after encoding the target.
cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5112 entries, 0 to 5120
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                5112 non-null   int64  
 1   radius_mean              5112 non-null   float64
 2   texture_mean             5110 non-null   float64
 3   perimeter_mean           5111 non-null   float64
 4   area_mean                5111 non-null   float64
 5   smoothness_mean          5105 non-null   float64
 6   compactness_mean         5108 non-null   float64
 7   concavity_mean           5100 non-null   float64
 8   concave points_mean      5104 non-null   float64
 9   symmetry_mean            5107 non-null   float64
 10  fractal_dimension_mean   5106 non-null   float64
 11  radius_se                5100 non-null   float64
 12  texture_se               5109 non-null   float64
 13  perimeter_se             5109 non-null   float64
 14  area_se                 

In [16]:
# Mean of every feature within each diagnosis class (0 and 1), for a quick
# side-by-side comparison of the two groups.
cancer_dataset.groupby(by='diagnosis').mean()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.212896,17.915546,78.026168,462.596818,0.092473,0.080155,0.046142,0.026018,0.174178,0.06287,...,13.37727,23.51852,86.994353,558.659295,0.124956,0.182739,0.16635,0.074433,0.270221,0.079455
1,17.564066,21.608577,115.466459,978.455643,0.103832,0.145199,0.161152,0.088018,0.192911,0.062655,...,21.135724,29.31947,141.391701,1422.396065,0.144857,0.374894,0.450678,0.182234,0.323503,0.091537


In [17]:
# Convert the frame to a plain NumPy array; column 0 is the encoded diagnosis,
# the remaining columns are the features.
data = cancer_dataset.to_numpy()

In [18]:
# Display the array (NumPy abbreviates it); the first column holds the 0/1 diagnosis.
data

array([[ 1.     , 17.99   , 10.38   , ...,  0.2654 ,  0.4601 ,  0.1189 ],
       [ 1.     , 20.57   , 17.77   , ...,  0.186  ,  0.275  ,  0.08902],
       [ 1.     , 19.69   , 21.25   , ...,  0.243  ,  0.3613 ,  0.08758],
       ...,
       [ 1.     , 16.6    , 28.08   , ...,  0.1418 ,  0.2218 ,  0.0782 ],
       [ 1.     , 20.6    , 29.33   , ...,  0.265  ,  0.4087 ,  0.124  ],
       [ 0.     ,  7.76   , 24.54   , ...,  0.     ,  0.2871 ,  0.07039]])

In [19]:
# (rows, columns) of the array; same shape as the source frame.
data.shape

(5112, 31)

In [20]:
# Column count: 1 target column + 30 feature columns.
data.shape[1]

31

In [21]:
# Positions of the feature columns: every column except position 0 (the target).
ix = list(range(1, data.shape[1]))

In [22]:
# Split the array into the feature matrix and the target vector.
x = data[:, ix]   # all rows, feature columns only
y = data[:, 0]    # all rows, diagnosis column

# Summarizing Total Missing Values

In [23]:
# Count the NaN cells across the whole feature matrix. ndarray.sum() performs the
# reduction inside NumPy instead of iterating over every element with the
# built-in sum(); the printed text is unchanged.
print('Missing: %d' % np.isnan(x).sum())

Missing: 123


In [None]:
# Using KNNImputer to Impute Missing Values

In [24]:
# Define the imputer: each missing value is filled in from the 6 nearest rows.
# NOTE(review): `x` is passed in unscaled, and the feature magnitudes differ
# widely (area_* in the hundreds, smoothness_* around 0.1 per describe()), so
# the large-valued columns likely dominate the neighbour distances - consider
# scaling before imputing; confirm.
imputer = KNNImputer(n_neighbors=6)

In [25]:
# Fit the imputer on the full feature matrix.
# NOTE(review): this fits on every row before any train/test split visible in
# this part of the notebook; if a split happens later, test rows influence the
# imputed training values - confirm whether that is acceptable.
imputer.fit(x)

KNNImputer(n_neighbors=6)

In [26]:
# Apply the fitted imputer to the same matrix it was fitted on; `xtrans` is the
# feature matrix with its missing cells filled in.
xtrans = imputer.transform(x) 

# Summarizing Total Missing After Imputation

In [27]:
# Verify that no NaN cells remain after imputation. ndarray.sum() performs the
# reduction inside NumPy instead of iterating over every element with the
# built-in sum(); the printed text is unchanged.
print('Missing: %d' % np.isnan(xtrans).sum())

Missing: 0


In [28]:
# Shape of the imputed matrix; the recorded output shows 5112 rows x 30 feature columns.
xtrans.shape

(5112, 30)

In [40]:
# List the column names in order (target first, then the 30 features).
cancer_dataset.columns.tolist()

['diagnosis',
 'radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

In [33]:
# Rebuild a labelled feature frame from the imputed matrix.
# The column names are taken from the source frame (every column after
# position 0, mirroring the `ix` selection used to build `x`) instead of a
# hand-typed list, so names and order cannot drift from the data.
# The resulting frame gets a fresh 0..n-1 row index.
feature_names = cancer_dataset.columns[1:]
X_k = pd.DataFrame(xtrans, columns=feature_names)

In [36]:
# Wrap the target vector in a single-column frame named 'diagnosis'.
Y_k = pd.DataFrame({'diagnosis': y})