In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
cancer_dataset = pd.read_csv('../Dataset/dataset_with_missing_values.csv')

In [3]:
cancer_dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
cancer_dataset = cancer_dataset.drop(columns='id', axis=1)

In [5]:
cancer_dataset.shape

(5121, 31)

In [6]:
cancer_dataset.isnull().sum()

diagnosis                   0
radius_mean                 2
texture_mean                4
perimeter_mean              3
area_mean                   4
smoothness_mean            12
compactness_mean            5
concavity_mean             16
concave points_mean        12
symmetry_mean               9
fractal_dimension_mean      7
radius_se                  19
texture_se                  4
perimeter_se                7
area_se                     4
smoothness_se              14
compactness_se             12
concavity_se                7
concave points_se          11
symmetry_se                 7
fractal_dimension_se       11
radius_worst                2
texture_worst               5
perimeter_worst             7
area_worst                  2
smoothness_worst            4
compactness_worst           9
concavity_worst             4
concave points_worst        8
symmetry_worst              5
fractal_dimension_worst     6
dtype: int64

In [7]:
cancer_dataset = cancer_dataset.dropna(thresh=cancer_dataset.shape[1]-7, axis=0)

In [8]:
cancer_dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,5109.0,5107.0,5108.0,5108.0,5103.0,5105.0,5098.0,5102.0,5104.0,5104.0,...,5108.0,5107.0,5108.0,5109.0,5108.0,5108.0,5109.0,5109.0,5109.0,5108.0
mean,14.128766,19.294353,91.981181,654.998904,0.096363,0.104404,0.088968,0.048925,0.181164,0.062798,...,16.272436,25.686011,107.281766,880.912527,0.132371,0.254444,0.272431,0.114655,0.290109,0.083964
std,3.522671,4.29944,24.291471,351.814552,0.014062,0.052811,0.079717,0.038799,0.027389,0.007058,...,4.832268,6.143052,33.596788,569.247897,0.02283,0.15731,0.208555,0.065705,0.061823,0.018057
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.057697,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06155,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.81,104.1,782.7,0.1053,0.1304,0.1319,0.074,0.1957,0.06615,...,18.79,29.795,125.4,1084.0,0.146,0.3393,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [9]:
cancer_dataset.shape

(5109, 31)

In [10]:
cancer_dataset.replace({'diagnosis': {'B':0, 'M':1}}, inplace=True)

In [11]:
cancer_dataset.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
cancer_dataset['diagnosis'].value_counts()

0    3203
1    1906
Name: diagnosis, dtype: int64

In [13]:
cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5120
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                5109 non-null   int64  
 1   radius_mean              5109 non-null   float64
 2   texture_mean             5107 non-null   float64
 3   perimeter_mean           5108 non-null   float64
 4   area_mean                5108 non-null   float64
 5   smoothness_mean          5103 non-null   float64
 6   compactness_mean         5105 non-null   float64
 7   concavity_mean           5098 non-null   float64
 8   concave points_mean      5102 non-null   float64
 9   symmetry_mean            5104 non-null   float64
 10  fractal_dimension_mean   5104 non-null   float64
 11  radius_se                5098 non-null   float64
 12  texture_se               5107 non-null   float64
 13  perimeter_se             5107 non-null   float64
 14  area_se                 

In [14]:
cancer_dataset.groupby('diagnosis').mean()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.144787,17.918679,78.062086,462.621417,0.092461,0.080104,0.046138,0.025709,0.174181,0.062868,...,13.377549,23.522509,86.994714,558.693475,0.124945,0.182745,0.166363,0.074442,0.270238,0.079457
1,17.462807,21.608577,115.364675,978.455643,0.102925,0.145189,0.160817,0.088018,0.192911,0.06268,...,21.135724,29.31947,141.391701,1422.396065,0.144857,0.374894,0.450678,0.182234,0.323503,0.091537


In [15]:
correlation = cancer_dataset.corr()

In [16]:
correlation

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,1.0,0.730171,0.415041,0.742761,0.709111,0.35989,0.596172,0.695954,0.776544,0.330704,...,0.776555,0.456454,0.783059,0.733853,0.421827,0.590808,0.659366,0.793481,0.41672,0.323604
radius_mean,0.730171,1.0,0.323946,0.997855,0.987333,0.170744,0.505918,0.676612,0.822558,0.148158,...,0.969515,0.297278,0.96514,0.941048,0.120085,0.413547,0.526794,0.744193,0.164145,0.007344
texture_mean,0.415041,0.323946,1.0,0.329639,0.321209,-0.023494,0.235947,0.302251,0.293182,0.070917,...,0.352717,0.912186,0.358053,0.34357,0.077202,0.277963,0.300923,0.295243,0.104911,0.118547
perimeter_mean,0.742761,0.997855,0.329639,1.0,0.986484,0.207448,0.55675,0.715995,0.851027,0.18342,...,0.969448,0.303244,0.970394,0.941507,0.150991,0.455847,0.56376,0.771221,0.189293,0.051275
area_mean,0.709111,0.987333,0.321209,0.986484,1.0,0.17714,0.498257,0.685843,0.823284,0.151664,...,0.962704,0.287689,0.959107,0.959183,0.123902,0.390459,0.512457,0.721939,0.143763,0.003963
smoothness_mean,0.35989,0.170744,-0.023494,0.207448,0.17714,1.0,0.659859,0.522251,0.554423,0.558064,...,0.213368,0.03613,0.23919,0.206987,0.805518,0.47312,0.435469,0.503798,0.394946,0.50041
compactness_mean,0.596172,0.505918,0.235947,0.55675,0.498257,0.659859,1.0,0.882953,0.831118,0.602897,...,0.535012,0.247257,0.589955,0.509267,0.565719,0.865746,0.816057,0.815377,0.51018,0.687535
concavity_mean,0.695954,0.676612,0.302251,0.715995,0.685843,0.522251,0.882953,1.0,0.921269,0.500787,...,0.68794,0.299612,0.729359,0.675653,0.449318,0.75468,0.883877,0.861082,0.409473,0.514842
concave points_mean,0.776544,0.822558,0.293182,0.851027,0.823284,0.554423,0.831118,0.921269,1.0,0.463411,...,0.830355,0.292465,0.855966,0.809627,0.453098,0.667494,0.752214,0.910091,0.376109,0.368924
symmetry_mean,0.330704,0.148158,0.070917,0.18342,0.151664,0.558064,0.602897,0.500787,0.463411,1.0,...,0.185892,0.090283,0.219323,0.177456,0.426346,0.473433,0.433796,0.430858,0.699359,0.438962


In [17]:
data = cancer_dataset.values

In [18]:
data.shape

(5109, 31)

In [19]:
ix = [i for i in range(data.shape[1]) if i != 0]

In [20]:
x, y = data[:, ix], data[:, 0]

In [21]:
# summarizing total missing
print('Missing: %d' % sum(np.isnan(x).flatten()))

Missing: 96


In [22]:
# define imputer
imputer = KNNImputer()

In [23]:
#fit on the dataset
imputer.fit(x)

KNNImputer()

In [24]:
# transforming the dataset
xtrans = imputer.transform(x)

In [25]:
#summarizing total missing
print('Missing: %d' % sum(np.isnan(xtrans).flatten()))

Missing: 0


In [26]:
xtrans

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [27]:
x_train, x_test, y_train, y_test = train_test_split(xtrans, y, test_size=0.2, stratify=y, random_state=2)

In [28]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5109, 30)
(4087, 30)
(1022, 30)
(4087,)
(1022,)


In [29]:
scaler = StandardScaler()

In [30]:
scaler.fit(x_train)

StandardScaler()

In [31]:

x_train = scaler.transform(x_train)

x_test = scaler.transform(x_test)

In [32]:
print(x_train)

[[ 3.17108551e-01  2.60122545e+00  4.76857004e-01 ...  1.83692664e+00
   1.89778332e+00  3.10503164e+00]
 [-1.11735582e+00 -1.02775641e+00 -1.12270635e+00 ... -1.34534109e+00
   1.04896078e+00 -2.11492770e-01]
 [-5.97540016e-01  2.05019199e+00 -6.19574709e-01 ... -6.57197118e-01
  -7.22078738e-01 -1.16422713e-01]
 ...
 [-4.78238029e-01 -9.90867139e-01 -5.43816754e-01 ... -1.12192560e+00
   2.29238038e-03 -8.87364384e-01]
 [-1.54315029e+00 -1.12689632e+00 -1.53978683e+00 ... -1.73650779e+00
  -4.90726729e-01 -3.05470068e-01]
 [ 9.05096915e-01  6.43788601e-01  9.29757824e-01 ...  5.81067683e-01
   2.67150542e-01 -5.04352256e-01]]


In [33]:
model = LogisticRegression()

In [34]:
# training the Logistic Regression Model with Training Data
model.fit(x_train, y_train)

LogisticRegression()

In [35]:
# accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [36]:
print('Accuracy on Training data: ',training_data_accuracy)

Accuracy on Training data:  0.9880107658429166


In [37]:
# accuracy on test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [38]:
print('Accuracy on Test data: ',test_data_accuracy)

Accuracy on Test data:  0.9931506849315068
