## Breast cancer cell classification using Weighted K-Nearest Neighbor classifier. Use the dataset of file wisc_bc_data.csv and following settings to design the classifier:
-	Min-max feature normalization.
-	Randomly select 100 healthy and 100 cancerous cell samples to construct the training dataset. Use rest of the samples to estimate the accuracy of the classifier.
-	Calculate the accuracies for K = 9, 11, 13, 15, 17 and 19

In [1]:
# Python3 program to implement the 
# weighted K nearest neighbour algorithm.  
  
import math  
  
def weightedkNN(points,p,k=3):  
    '''  
    This function finds classification of p using  
    weighted k nearest neighbour algorithm. It assumes only two  
    two classes and returns 0 if p belongs to class 0, else  
    1 (belongs to class 1).  
  
    Parameters -  
        points : Dictionary of training points having two keys - 0 and 1  
            Each key have a list of training data points belong to that  
  
        p : A tuple ,test data point of form (x,y)  
  
        k : number of nearest neighbour to consider, default is 3  
    '''
  
    distance=[]  
    for group in points:  
        for feature in points[group]:  
  
            #calculate the euclidean distance of p from training points  
            euclidean_distance = math.sqrt((feature[0]-p[0])**2 +(feature[1]-p[1])**2)  
  
            # Add a tuple of form (distance,group) in the distance list  
            distance.append((euclidean_distance,group))  
  
    # sort the distance list in ascending order  
    # and select first k distances  
    distance = sorted(distance)[:k]  
  
    freq1 = 0 # weighted sum of group 0  
    freq2 = 0 # weighted sum of group 1  
  
    for d in distance: 
        if d[1] == 0: 
            freq1 += (1 / d[0]) 
              
        elif d[1] == 1:  
            freq2 += (1 /d[0]) 
              
  
    return 0 if freq1>freq2 else 1
  
# Driver function  
def main():  
  
    # Dictionary of training points having two keys - 0 and 1  
    # key 0 have points belong to class 0  
    # key 1 have points belong to class 1  
  
    points = {0:[(0, 4),(1, 4.9),(1.6, 5.4),(2.2, 6),(2.8, 7),(3.2, 8),(3.4, 9)],  
            1:[(1.8, 1),(2.2, 3),(3, 4),(4, 4.5),(5, 5),(6, 5.5)]}  
  
    # query point p(x,y)  
    p = (2, 4)  
  
    # Number of neighbours  
    k = 5
  
    print("The value classified to query point is: {}".format(weightedkNN(points,p,k)))

In [36]:
import pandas as pd
df = pd.read_csv('wisc_bc_data.csv')

In [37]:
df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Data Dictionary
- radius_mean                569 non-null float64
- texture_mean               569 non-null float64
- perimeter_mean             569 non-null float64
- area_mean                  569 non-null float64
- smoothness_mean            569 non-null float64
- compactness_mean           569 non-null float64
- concavity_mean             569 non-null float64
- concave points_mean        569 non-null float64
- symmetry_mean              569 non-null float64
- fractal_dimension_mean     569 non-null float64
- radius_se                  569 non-null float64
- texture_se                 569 non-null float64
- perimeter_se               569 non-null float64
- area_se                    569 non-null float64
- smoothness_se              569 non-null float64
- compactness_se             569 non-null float64
- concavity_se               569 non-null float64
- concave points_se          569 non-null float64
- symmetry_se                569 non-null float64
- fractal_dimension_se       569 non-null float64
- radius_worst               569 non-null float64
- texture_worst              569 non-null float64
- perimeter_worst            569 non-null float64
- area_worst                 569 non-null float64
- smoothness_worst           569 non-null float64
- compactness_worst          569 non-null float64
- concavity_worst            569 non-null float64
- concave points_worst       569 non-null float64
- symmetry_worst             569 non-null float64
- fractal_dimension_worst    569 non-null float64

In [38]:
df.shape

(569, 32)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

In [40]:
df.isnull().sum()
df = df.drop('id', axis = True)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [41]:
def diagnosis_mapping(diagnosis): 
    if diagnosis == 'M': 
        return 1
    else: 
        return 0
    
df['diagnosis'] = df['diagnosis'].apply(diagnosis_mapping) 

In [42]:
#import numpy as np  
#X = np.array(df.iloc[:, 1:]) 
#y = np.array(df['diagnosis']) 

In [43]:
df_M = df[df['diagnosis'] == 1]
df_B = df[df['diagnosis'] == 0]

In [44]:
df_M_train = df_M.sample(n=100)
df_M_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
186,1,18.31,18.58,118.60,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,...,21.31,26.36,139.2,1410.0,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
210,1,20.58,22.14,134.70,1290.0,0.09090,0.13480,0.16400,0.09561,0.1765,...,23.24,27.84,158.3,1656.0,0.1178,0.2920,0.3861,0.1920,0.2909,0.05865
168,1,17.47,24.68,116.10,984.6,0.10490,0.16030,0.21590,0.10430,0.1538,...,23.14,32.33,155.3,1660.0,0.1376,0.3830,0.4890,0.1721,0.2160,0.09300
78,1,20.18,23.97,143.70,1245.0,0.12860,0.34540,0.37540,0.16040,0.2906,...,23.37,31.72,170.3,1623.0,0.1639,0.6164,0.7681,0.2508,0.5440,0.09964
5,1,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.12440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,1,14.95,17.57,96.85,678.1,0.11670,0.13050,0.15390,0.08624,0.1957,...,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147
64,1,12.68,23.84,82.69,499.0,0.11220,0.12620,0.11280,0.06873,0.1905,...,17.09,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.10310
365,1,20.44,21.78,133.80,1293.0,0.09150,0.11310,0.09799,0.07785,0.1618,...,24.31,26.37,161.2,1780.0,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.45,26.40,166.1,2027.0,0.1410,0.2113,0.4107,0.2216,0.2060,0.07115


In [45]:
df_B_train = df_B.sample(n=100)
df_B_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
425,0,10.03,21.28,63.19,307.3,0.08117,0.03912,0.00247,0.005159,0.1630,...,11.11,28.94,69.92,376.3,0.11260,0.07094,0.01235,0.02579,0.2349,0.08061
483,0,13.70,17.64,87.76,571.1,0.09950,0.07957,0.04548,0.031600,0.1732,...,14.96,23.53,95.78,686.5,0.11990,0.13460,0.17420,0.09077,0.2518,0.06960
375,0,16.17,16.07,106.30,788.5,0.09880,0.14380,0.06651,0.053970,0.1990,...,16.97,19.14,113.10,861.5,0.12350,0.25500,0.21140,0.12510,0.3153,0.08960
55,0,11.52,18.75,73.34,409.0,0.09524,0.05473,0.03036,0.022780,0.1920,...,12.84,22.47,81.81,506.2,0.12490,0.08720,0.09076,0.06316,0.3306,0.07036
395,0,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.032510,0.1641,...,14.92,25.34,96.42,684.5,0.10660,0.12310,0.08460,0.07911,0.2523,0.06609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,0,17.85,13.23,114.60,992.1,0.07838,0.06217,0.04445,0.041780,0.1220,...,19.82,18.42,127.10,1210.0,0.09862,0.09976,0.10480,0.08341,0.1783,0.05871
312,0,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.025480,0.1601,...,14.19,16.40,92.04,618.8,0.11940,0.22080,0.17690,0.08411,0.2564,0.08253
336,0,12.99,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.020980,0.1652,...,13.72,16.91,87.38,576.0,0.11420,0.19750,0.14500,0.05850,0.2432,0.10090
279,0,13.85,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.037110,0.2110,...,14.98,21.74,98.37,670.0,0.11850,0.17240,0.14560,0.09993,0.2955,0.06912


In [46]:
#df_train = df_M_train.merge(df_B_train, left_index=True, right_index=True, how = 'outer')
df_train = pd.concat([df_B_train, df_M_train], ignore_index=True)
df_train

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0,10.03,21.28,63.19,307.3,0.08117,0.03912,0.00247,0.005159,0.1630,...,11.11,28.94,69.92,376.3,0.1126,0.07094,0.01235,0.02579,0.2349,0.08061
1,0,13.70,17.64,87.76,571.1,0.09950,0.07957,0.04548,0.031600,0.1732,...,14.96,23.53,95.78,686.5,0.1199,0.13460,0.17420,0.09077,0.2518,0.06960
2,0,16.17,16.07,106.30,788.5,0.09880,0.14380,0.06651,0.053970,0.1990,...,16.97,19.14,113.10,861.5,0.1235,0.25500,0.21140,0.12510,0.3153,0.08960
3,0,11.52,18.75,73.34,409.0,0.09524,0.05473,0.03036,0.022780,0.1920,...,12.84,22.47,81.81,506.2,0.1249,0.08720,0.09076,0.06316,0.3306,0.07036
4,0,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.032510,0.1641,...,14.92,25.34,96.42,684.5,0.1066,0.12310,0.08460,0.07911,0.2523,0.06609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,14.95,17.57,96.85,678.1,0.11670,0.13050,0.15390,0.086240,0.1957,...,18.55,21.43,121.40,971.4,0.1411,0.21640,0.33550,0.16670,0.3414,0.07147
196,1,12.68,23.84,82.69,499.0,0.11220,0.12620,0.11280,0.068730,0.1905,...,17.09,33.47,111.80,888.3,0.1851,0.40610,0.40240,0.17160,0.3383,0.10310
197,1,20.44,21.78,133.80,1293.0,0.09150,0.11310,0.09799,0.077850,0.1618,...,24.31,26.37,161.20,1780.0,0.1327,0.23760,0.27020,0.17650,0.2609,0.06735
198,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.138900,0.1726,...,25.45,26.40,166.10,2027.0,0.1410,0.21130,0.41070,0.22160,0.2060,0.07115


In [47]:
#df_test = pd.merge(df, df_train, on=['radius_mean','texture_mean', 'perimeter_mean', 'area_mean'], how='left', indicator=True).query("_merge != 'both'").drop('_merge', axis=1).reset_index(drop=True)
df_test = pd.concat([df, df_train])
df_test.drop_duplicates(keep=False)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.24300,0.3613,0.08758
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.16250,0.2364,0.07678
6,1,18.25,19.98,119.60,1040.0,0.09463,0.10900,0.11270,0.07400,0.1794,...,22.880,27.66,153.20,1606.0,0.14420,0.25760,0.3784,0.19320,0.3063,0.08368
10,1,16.02,23.24,102.70,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,...,19.190,33.88,123.80,1150.0,0.11810,0.15510,0.1459,0.09975,0.2948,0.08452
12,1,19.17,24.80,132.40,1123.0,0.09740,0.24580,0.20650,0.11180,0.2397,...,20.960,29.94,151.70,1332.0,0.10370,0.39030,0.3639,0.17670,0.3176,0.10230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0,11.51,23.93,74.52,403.5,0.09261,0.10210,0.11120,0.04105,0.1388,...,12.480,37.16,82.28,474.2,0.12980,0.25170,0.3630,0.09653,0.2112,0.08732
560,0,14.05,27.15,91.38,600.4,0.09929,0.11260,0.04462,0.04304,0.1537,...,15.300,33.17,100.20,706.7,0.12410,0.22640,0.1326,0.10480,0.2250,0.08321
561,0,11.20,29.37,70.67,386.0,0.07449,0.03558,0.00000,0.00000,0.1060,...,11.920,38.30,75.19,439.6,0.09267,0.05494,0.0000,0.00000,0.1566,0.05905
562,1,15.22,30.62,103.40,716.9,0.10480,0.20870,0.25500,0.09429,0.2128,...,17.520,42.79,128.70,915.0,0.14170,0.79170,1.1700,0.23560,0.4089,0.14090


In [48]:
import numpy as np  
X_train = np.array(df_train.iloc[:, 1:]) 
y_train = np.array(df_train['diagnosis'])

In [49]:
import numpy as np  
X_test = np.array(df_test.iloc[:, 1:]) 
y_test = np.array(df_test['diagnosis']) 

In [50]:
from sklearn import preprocessing
import numpy as np

min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.fit_transform(X_test)


In [51]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 13) 
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9557867360208062