In [1]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')

# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')

# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)

# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']

# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 120)

# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance

# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''

# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        distance = calculate_distance(i,X_train,X_test)
        #print(distance)
        X_train.insert(0,'Dist_from_test',distance,True)
        X_train_sorted = X_train.sort_values('Dist_from_test')
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        #print(y_values)
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted

# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")

print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])

# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 5
0,

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
B 	 B
M 	 M
B 	 B
M 	 M
B 	 B
M 	 M
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
M 	 B
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
M 	 M
M 	 M
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
B 	 B
B

In [5]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')

# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')

# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)

# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']

# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 100)

# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance

# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''

# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        distance = calculate_distance(i,X_train,X_test)
        #print(distance)
        X_train.insert(0,'Dist_from_test',distance,True)
        X_train_sorted = X_train.sort_values('Dist_from_test')
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        #print(y_values)
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted

# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")

print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])

# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 10
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
M 	 B
M 	 M
M 	 M
M 	 M
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 M
B 	 B
B 	 B
B 	 B
M 	 M
M 	 B
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 B
B 	 B
B 	 B
B 	 

In [2]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')

# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')

# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)

# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']

# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 100)

# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance

# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''

# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        distance = calculate_distance(i,X_train,X_test)
        #print(distance)
        X_train.insert(0,'Dist_from_test',distance,True)
        X_train_sorted = X_train.sort_values('Dist_from_test')
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        #print(y_values)
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted

# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")

print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])

# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 20
0,

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
M 	 B
M 	 M
M 	 M
M 	 M
M 	 B
B 	 B
B 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
M 	 B
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B

In [3]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')

# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')

# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)

# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']

# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 100)

# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance

# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''

# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        distance = calculate_distance(i,X_train,X_test)
        #print(distance)
        X_train.insert(0,'Dist_from_test',distance,True)
        X_train_sorted = X_train.sort_values('Dist_from_test')
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        #print(y_values)
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted

# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")

print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])

# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 6
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
M 	 M
M 	 M
M 	 M
M 	 M
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 M
B 	 B
B 	 B
B 	 B
M 	 M
M 	 B
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 B
B 	 B
B 	 B
B 	 B

In [4]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')

# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')

# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)

# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']

# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 10)

# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance

# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''

# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        distance = calculate_distance(i,X_train,X_test)
        #print(distance)
        X_train.insert(0,'Dist_from_test',distance,True)
        X_train_sorted = X_train.sort_values('Dist_from_test')
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        #print(y_values)
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted

# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")

print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])

# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 5
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
M 	 B
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
M 	 M
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
M 	 M
B 	 M
M 	 M
B 	 B
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
M 	 M
M 	 M
B 	 B
M 	 M
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M

In [5]:
import numpy as np
import pandas as pd

# Data of patients with breast cancer-
bc_data = pd.read_csv('wisc_bc_data.csv')


# converting the "diagnosis" column into 2 categories
bc_data['diagnosis'] = bc_data.diagnosis.astype('category')


# the id isn't essential for the prediction drop the ID column
bc_WorkingData = bc_data.drop(labels=['id'],axis=1)


# to predict whether a patient has Benign or Malignant breast cancer based on the values of all the other columns in the given data
# X = (radius_mean,texture_mean,......)
# Y = diagnosis
X = bc_WorkingData.drop(labels=['diagnosis'],axis=1)
Y = bc_WorkingData['diagnosis']


# Since the ranges of the columns of X are all very different
# Normalize the columns using z-score normalization
zScoreNorm = lambda x : (x-x.mean())/x.std()
X = X.apply(zScoreNorm)

# Now the X and Y data has been prepared for the application of necessary algorithms
# for this we will divide the X and Y data into 2 sets - training set and testing set
# training set is to train the algorithm such that the X and Y values are mapped as accurately as possible
# the testing set is then used to test the correctness and accuracy of that trained algorithm on the given data


from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 100)


# function to calculate the distance between an X_test row and an X_train row
def calculate_distance (index,X_train,X_test):
    distance = list()
    columns = X_test.columns
    for i in range (0,X_train.shape[0]):
        sumDiff = 0.0
        for j in range (0,len(columns)):
            sumDiff = sumDiff + ((X_test.iloc[index][columns[j]])-(X_train.iloc[i][columns[j]]))**2
        distance.append(sumDiff**0.5)
    return distance


# function to decide on a Y-value based on majority of k-Y-values
def predict_Ytest (yList):
    Bcount = 0
    Mcount = 0
    for i in range (0,len(yList)):
        if yList[i]=='B':
            Bcount = Bcount + 1
        else:
            Mcount = Mcount + 1
    if Bcount > Mcount:
        return 'B'
    else:
        return 'M'
    return ''


# K-nearest neighbours function
def KNN (k, X_train, Y_train, X_test):
    num = len(X_test.index)
    dummy_Yval = ['Nil']*num
    Y_test_predicted = pd.Series(dummy_Yval,dtype=str)
    for i in range (0,num):
        print(i, end = ",")
        distance = list()
        # computing the distances of the ith row of X_test from each row of X_train and storing the distances in a list
        distance = calculate_distance(i,X_train,X_test)
        # inserting the distance list as a column in X_train
        X_train.insert(0,'Dist_from_test',distance,True)
        # sorting X_train rows in ascending order of their distances from the ith row of X_test
        X_train_sorted = X_train.sort_values('Dist_from_test')
        # finding the Y_train values corresponding to the first K rows of the sorted X_train dataframe
        y_values = list()
        for j in range (0,k):
            y_values.append(Y_train[X_train_sorted.index[j]])
        # finding the Y-value with maximum count and setting that as the predicted Y_test value
        Y_test_predicted.iloc[i] = predict_Ytest(y_values)
        
        # removing the column with the distances from X_train
        X_train.drop('Dist_from_test',axis=1,inplace=True)
        # emptying the sorted X_train dataframe for the next iteration
        X_train_sorted.drop(index = X_train_sorted.index, columns = X_train_sorted.columns, inplace = True)
    return Y_test_predicted



# predited Y_test values
k = int(input("Enter k value : "))
prediction = pd.Series(dtype = str)
prediction = KNN(k,X_train,Y_train,X_test)
prediction = prediction.astype("category")



print('\n')
# Comparison view of the predicted and actual Y_test values
total = len(Y_test.index)
print("Y_test\tPredicted")
for i in range (0,total):
    print(Y_test.iloc[i],"\t",prediction.iloc[i])



# calculating the score or accuracy of the prediction
matches = 0 
for i in range (0,total):
    if prediction.iloc[i] == Y_test.iloc[i]:
        matches = matches + 1

print("matches : ", matches)

KNN_Score = matches/total
print("KNN Score = ", KNN_Score)


Enter k value : 50
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,

Y_test	Predicted
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
M 	 B
M 	 M
M 	 M
M 	 M
M 	 B
B 	 B
B 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 M
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
M 	 B
M 	 B
B 	 B
B 	 B
B 	 B
B 	 B
B 	 B
M 	 M
B 	 B
M 	 B
B 	 B
B 	 B
B 	 