**Assignment:** Breast Cancer Diagnosis Prediction using K-Nearest Neighbors (KNN) and K-Means Clustering


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


## LOADING DATASET

In [None]:
# Read the raw CSV into a DataFrame; every later cell works from `data`.
# NOTE(review): this is an absolute path (looks like a Colab upload location) --
# adjust it when running the notebook elsewhere.
dataset_path = "/content/Dataset.csv"
data = pd.read_csv(dataset_path)


## ENCODE DIAGNOSIS (M = 1, B = 0)

In [None]:
# Encode the diagnosis label numerically: 'M' -> 1, 'B' -> 0.
diagnosis_codes = {'M': 1, 'B': 0}

# Only map while the column still holds the raw letters. Re-running the cell on
# already-encoded values would otherwise silently turn every label into NaN.
if not data['diagnosis'].isin(list(diagnosis_codes.values())).all():
    encoded = data['diagnosis'].map(diagnosis_codes)
    if encoded.isna().any():
        # Fail loudly instead of carrying NaN labels into training.
        unexpected = data.loc[encoded.isna(), 'diagnosis'].unique().tolist()
        raise ValueError(f"Unexpected diagnosis labels: {unexpected}")
    data['diagnosis'] = encoded.astype('int64')

# Preview a few rows rather than dumping the whole frame.
data.head()


## DROP COLUMNS (id, Unnamed: 32)

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns so they are not used as features.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
columns_to_drop = ['id', 'Unnamed: 32']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Preview a few rows rather than dumping the whole frame.
data.head()


In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

## MIN-MAX NORMALIZATION (excluding diagnosis)

In [None]:
target = "diagnosis"

# Features are every column except the label. The K-Means cell further down
# writes a "Cluster" column into `data`; drop it if present so that re-running
# this cell cannot leak cluster labels into the feature matrix.
X = data.drop(columns=[target]).drop(columns=["Cluster"], errors="ignore")
y = data[target]

# Rescale every feature to the [0, 1] range.
# NOTE: the scaler is fit on ALL rows. That is fine for unsupervised clustering,
# but a supervised evaluation should fit its scaler on the training rows only.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep rows aligned with `y` and `data`
)

X_scaled.head()


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Cluster
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0.0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0.0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0.0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0.0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0.0


## Splitting the data into training (80%) and testing (20%) sets

In [None]:
# Split the UNSCALED features first (80% train / 20% test) so that the scaler
# fitted below never sees test rows. Splitting data that was scaled with
# whole-dataset min/max values would leak test-set statistics into training.
# A "Cluster" column is dropped defensively in case X was built after the
# K-Means cell had already written cluster labels into `data`.
features = X.drop(columns=["Cluster"], errors="ignore")
Xtrain_raw, Xtest_raw, ytrain, ytest = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Learn the min-max ranges from the training rows only, then apply them to both
# sets. Test values may fall slightly outside [0, 1]; that is expected.
split_scaler = MinMaxScaler().fit(Xtrain_raw)
Xtrain = pd.DataFrame(
    split_scaler.transform(Xtrain_raw),
    columns=features.columns,
    index=Xtrain_raw.index,
)
Xtest = pd.DataFrame(
    split_scaler.transform(Xtest_raw),
    columns=features.columns,
    index=Xtest_raw.index,
)

Xtrain.shape, Xtest.shape


((455, 31), (114, 31))

In [None]:
# Show the shape of the training labels (should match the row count of Xtrain).
ytrain.shape

(455,)

##  K-MEANS CLUSTERING (k = 2)

In [None]:
# Cluster on the scaled features only. Drop a stale "Cluster" column in case
# X_scaled was rebuilt after this cell had already written labels into `data`.
cluster_features = X_scaled.drop(columns=["Cluster"], errors="ignore")

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(cluster_features)

# Store the assigned cluster next to the true label for comparison.
data["Cluster"] = clusters

# Cluster ids are arbitrary: cluster 0 is not guaranteed to correspond to
# diagnosis 0, so compare groupings rather than raw label equality.
data[["diagnosis", "Cluster"]].head(20)


Unnamed: 0,diagnosis,Cluster
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


## TRAIN KNN CLASSIFIER (k = 5)

In [None]:
# Fit a k-nearest-neighbours classifier on the training split, then predict
# labels for the held-out test split.
k_neighbors = 5
knn = KNeighborsClassifier(n_neighbors=k_neighbors).fit(Xtrain, ytrain)
ypred = knn.predict(Xtest)


## Evaluation

In [None]:
# Score the test-set predictions with each metric in turn; the four results are
# kept under their own names so later cells can reuse them.
accuracy, precision, recall, f1 = (
    metric(ytest, ypred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the metrics as a small aligned table.
print("KNN Classification Results :")
for label, value in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1-score :", f1),
):
    print(label, value)



KNN Classification Results :
Accuracy : 0.9736842105263158
Precision: 0.9761904761904762
Recall   : 0.9534883720930233
F1-score : 0.9647058823529412
