## Breast cancer cell classification using Weighted K-Nearest Neighbor classifier. Use the dataset of file wisc_bc_data.csv and following settings to design the classifier:
-	Min-max feature normalization.
-	Randomly select 100 healthy and 100 cancerous cell samples to construct the training dataset. Use rest of the samples to estimate the accuracy of the classifier.
-	Calculate the accuracies for K = 9, 11, 13, 15, 17 and 19

In [1]:
import pandas as pd
df = pd.read_csv('wisc_bc_data.csv')

In [2]:
df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Data Dictionary
- radius_mean                569 non-null float64
- texture_mean               569 non-null float64
- perimeter_mean             569 non-null float64
- area_mean                  569 non-null float64
- smoothness_mean            569 non-null float64
- compactness_mean           569 non-null float64
- concavity_mean             569 non-null float64
- concave points_mean        569 non-null float64
- symmetry_mean              569 non-null float64
- fractal_dimension_mean     569 non-null float64
- radius_se                  569 non-null float64
- texture_se                 569 non-null float64
- perimeter_se               569 non-null float64
- area_se                    569 non-null float64
- smoothness_se              569 non-null float64
- compactness_se             569 non-null float64
- concavity_se               569 non-null float64
- concave points_se          569 non-null float64
- symmetry_se                569 non-null float64
- fractal_dimension_se       569 non-null float64
- radius_worst               569 non-null float64
- texture_worst              569 non-null float64
- perimeter_worst            569 non-null float64
- area_worst                 569 non-null float64
- smoothness_worst           569 non-null float64
- compactness_worst          569 non-null float64
- concavity_worst            569 non-null float64
- concave points_worst       569 non-null float64
- symmetry_worst             569 non-null float64
- fractal_dimension_worst    569 non-null float64

In [3]:
df.shape

(569, 32)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

In [5]:
df.isnull().sum()
df = df.drop('id', axis = True)

In [6]:
def diagnosis_mapping(diagnosis): 
    if diagnosis == 'M': 
        return 1
    else: 
        return 0
    
df['diagnosis'] = df['diagnosis'].apply(diagnosis_mapping) 

In [7]:
#import numpy as np  
#X = np.array(df.iloc[:, 1:]) 
#y = np.array(df['diagnosis']) 

In [8]:
df_M = df[df['diagnosis'] == 1]
df_B = df[df['diagnosis'] == 0]

In [9]:
df_M_train = df_M.sample(n=100)
df_M_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
91,1,15.37,22.76,100.20,728.2,0.09200,0.10360,0.1122,0.07483,0.1717,...,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
277,1,18.81,19.98,120.90,1102.0,0.08923,0.05884,0.0802,0.05843,0.1550,...,19.96,24.30,129.0,1236.0,0.1243,0.1160,0.2210,0.1294,0.2567,0.05737
368,1,21.71,17.25,140.90,1546.0,0.09384,0.08562,0.1168,0.08465,0.1717,...,30.75,26.44,199.5,3143.0,0.1363,0.1628,0.2861,0.1820,0.2510,0.06494
260,1,20.31,27.06,132.90,1288.0,0.10000,0.10880,0.1519,0.09333,0.1814,...,24.33,39.16,162.3,1844.0,0.1522,0.2945,0.3788,0.1697,0.3151,0.07999
36,1,14.25,21.72,93.63,633.0,0.09823,0.10980,0.1319,0.05598,0.1885,...,15.89,30.36,116.2,799.6,0.1446,0.4238,0.5186,0.1447,0.3591,0.10140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,1,19.55,23.21,128.90,1174.0,0.10100,0.13180,0.1856,0.10210,0.1989,...,20.82,30.44,142.0,1313.0,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
31,1,11.84,18.70,77.93,440.6,0.11090,0.15160,0.1218,0.05182,0.2301,...,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.14020
509,1,15.46,23.95,103.80,731.3,0.11830,0.18700,0.2030,0.08520,0.1807,...,17.11,36.33,117.7,909.4,0.1732,0.4967,0.5911,0.2163,0.3013,0.10670
253,1,17.30,17.08,113.00,928.2,0.10080,0.10410,0.1266,0.08353,0.1813,...,19.85,25.09,130.9,1222.0,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113


In [10]:
df_B_train = df_B.sample(n=100)
df_B_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
413,0,14.99,22.11,97.53,693.7,0.08515,0.10250,0.06859,0.03876,0.1944,...,16.76,31.55,110.20,867.1,0.10770,0.33450,0.31140,0.13080,0.3163,0.09251
354,0,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.1690,...,12.12,15.82,79.62,453.5,0.08864,0.12560,0.12010,0.03922,0.2576,0.07018
148,0,14.44,15.18,93.97,640.1,0.09970,0.10210,0.08487,0.05532,0.1724,...,15.85,19.85,108.60,766.9,0.13160,0.27350,0.31030,0.15990,0.2691,0.07683
289,0,11.37,18.89,72.17,396.0,0.08713,0.05008,0.02399,0.02173,0.2013,...,12.36,26.14,79.29,459.3,0.11180,0.09708,0.07529,0.06203,0.3267,0.06994
490,0,12.25,22.44,78.18,466.5,0.08192,0.05200,0.01714,0.01261,0.1544,...,14.17,31.99,92.74,622.9,0.12560,0.18040,0.12300,0.06335,0.3100,0.08203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,0,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,...,15.14,23.60,98.84,708.8,0.12760,0.13110,0.17860,0.09678,0.2506,0.07623
104,0,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,0.2217,...,11.54,23.31,74.22,402.8,0.12190,0.14860,0.07987,0.03203,0.2826,0.07552
165,0,14.97,19.76,95.50,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,...,15.98,25.82,102.30,782.1,0.10450,0.09995,0.07750,0.05754,0.2646,0.06085
281,0,11.74,14.02,74.24,427.3,0.07813,0.04340,0.02245,0.02763,0.2101,...,13.31,18.26,84.70,533.7,0.10360,0.08500,0.06735,0.08290,0.3101,0.06688


In [11]:
#df_train = df_M_train.merge(df_B_train, left_index=True, right_index=True, how = 'outer')
df_train = pd.concat([df_B_train, df_M_train], ignore_index=True)
df_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0,14.99,22.11,97.53,693.7,0.08515,0.10250,0.06859,0.03876,0.1944,...,16.76,31.55,110.20,867.1,0.10770,0.33450,0.31140,0.13080,0.3163,0.09251
1,0,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.1690,...,12.12,15.82,79.62,453.5,0.08864,0.12560,0.12010,0.03922,0.2576,0.07018
2,0,14.44,15.18,93.97,640.1,0.09970,0.10210,0.08487,0.05532,0.1724,...,15.85,19.85,108.60,766.9,0.13160,0.27350,0.31030,0.15990,0.2691,0.07683
3,0,11.37,18.89,72.17,396.0,0.08713,0.05008,0.02399,0.02173,0.2013,...,12.36,26.14,79.29,459.3,0.11180,0.09708,0.07529,0.06203,0.3267,0.06994
4,0,12.25,22.44,78.18,466.5,0.08192,0.05200,0.01714,0.01261,0.1544,...,14.17,31.99,92.74,622.9,0.12560,0.18040,0.12300,0.06335,0.3100,0.08203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,19.55,23.21,128.90,1174.0,0.10100,0.13180,0.18560,0.10210,0.1989,...,20.82,30.44,142.00,1313.0,0.12510,0.24140,0.38290,0.18250,0.2576,0.07602
196,1,11.84,18.70,77.93,440.6,0.11090,0.15160,0.12180,0.05182,0.2301,...,16.82,28.12,119.40,888.7,0.16370,0.57750,0.69560,0.15460,0.4761,0.14020
197,1,15.46,23.95,103.80,731.3,0.11830,0.18700,0.20300,0.08520,0.1807,...,17.11,36.33,117.70,909.4,0.17320,0.49670,0.59110,0.21630,0.3013,0.10670
198,1,17.30,17.08,113.00,928.2,0.10080,0.10410,0.12660,0.08353,0.1813,...,19.85,25.09,130.90,1222.0,0.14160,0.24050,0.33780,0.18570,0.3138,0.08113


In [12]:
#df_test = pd.merge(df, df_train, on=['radius_mean','texture_mean', 'perimeter_mean', 'area_mean'], how='left', indicator=True).query("_merge != 'both'").drop('_merge', axis=1).reset_index(drop=True)
df_test = pd.concat([df, df_train])
df_test.drop_duplicates(keep=False)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.99,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.18600,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.57,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.24300,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.91,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.25750,0.6638,0.17300
7,1,13.71,20.83,90.20,577.9,0.11890,0.16450,0.09366,0.05985,0.2196,...,17.06,28.14,110.60,897.0,0.16540,0.36820,0.2678,0.15560,0.3196,0.11510
8,1,13.00,21.82,87.50,519.8,0.12730,0.19320,0.18590,0.09353,0.2350,...,15.49,30.73,106.20,739.3,0.17030,0.54010,0.5390,0.20600,0.4378,0.10720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0,11.51,23.93,74.52,403.5,0.09261,0.10210,0.11120,0.04105,0.1388,...,12.48,37.16,82.28,474.2,0.12980,0.25170,0.3630,0.09653,0.2112,0.08732
560,0,14.05,27.15,91.38,600.4,0.09929,0.11260,0.04462,0.04304,0.1537,...,15.30,33.17,100.20,706.7,0.12410,0.22640,0.1326,0.10480,0.2250,0.08321
561,0,11.20,29.37,70.67,386.0,0.07449,0.03558,0.00000,0.00000,0.1060,...,11.92,38.30,75.19,439.6,0.09267,0.05494,0.0000,0.00000,0.1566,0.05905
563,1,20.92,25.09,143.00,1347.0,0.10990,0.22360,0.31740,0.14740,0.2149,...,24.29,29.41,179.10,1819.0,0.14070,0.41860,0.6599,0.25420,0.2929,0.09873


In [13]:
import numpy as np  
X_train = np.array(df_train.iloc[:, 1:]) 
y_train = np.array(df_train['diagnosis'])

In [14]:
import numpy as np  
X_test = np.array(df_test.iloc[:, 1:]) 
y_test = np.array(df_test['diagnosis']) 

In [15]:
from sklearn import preprocessing
import numpy as np

min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.fit_transform(X_test)


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 13) 
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9570871261378413