Breast Cancer Diagnosis Prediction using K-Nearest Neighbors (KNN) and K-Means Clustering 


In [7]:
# Bootstrap cell: guarantee that matplotlib is importable, installing it on demand.
import subprocess
import sys

try:
    import matplotlib
except ModuleNotFoundError:
    # Run pip through the interpreter that executes this cell so the package
    # is installed into the same environment.
    pip_install = [sys.executable, "-m", "pip", "install", "matplotlib", "-q"]
    subprocess.check_call(pip_install)
    import matplotlib


In [None]:
# Ready-to-run pipeline for:
# "Breast Cancer Diagnosis Prediction using KNN and K-Means"
# Put Dataset.csv into the same folder or upload to Colab.

import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
try:
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib", "-q"])
    import matplotlib.pyplot as plt
try:
    import nbformat as nbf
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nbformat", "-q"])
    import nbformat as nbf

# ---------- CONFIG ----------
# Candidate dataset locations, all relative to the current working directory.
# Edit this list if the notebook is started from a different folder.
DEFAULT_DATA_PATHS = [
    "Dataset.csv",                                                 # beside the notebook/process
    os.path.join(os.pardir, "Dataset", "Dataset.csv"),             # started from the Notebook/ folder
    os.path.join(os.pardir, os.pardir, "Dataset", "Dataset.csv"),  # two levels up
    os.path.join("Dataset", "Dataset.csv"),                        # Dataset/ below the working dir
]

# Output folder for generated artifacts; created now if it does not exist yet.
OUT_DIR = "breast_cancer_assignment_output"
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- 1. Load ----------
# Resolve DATA_PATH: the first default location that exists wins.
DATA_PATH = next((p for p in DEFAULT_DATA_PATHS if os.path.exists(p)), None)

# Fallback: when no default location exists, honour the DATA_PATH
# *environment variable* (the Python variable above is reset on every run,
# so assigning to it beforehand has no effect).
env_path = os.environ.get("DATA_PATH")
if DATA_PATH is None and env_path and os.path.exists(env_path):
    DATA_PATH = env_path

if DATA_PATH is None:
    tried = [os.path.abspath(p) for p in DEFAULT_DATA_PATHS]
    if env_path:
        # The environment override was set but points at nothing; report it too.
        tried.append(f"{os.path.abspath(env_path)}  (from the DATA_PATH environment variable)")
    raise FileNotFoundError(
        "Dataset.csv not found. Tried the following paths:\n" + "\n".join(tried) +
        "\n\nPlace Dataset.csv in one of those locations or set the DATA_PATH environment variable "
        "to the absolute file path before running this cell.\n"
        "Example: os.environ['DATA_PATH'] = 'C:/full/path/to/Dataset/Dataset.csv'"
    )

print(f"Using dataset at: {DATA_PATH}")

# Defensive: if pd.read_csv was accidentally overwritten in the kernel
# (surfacing as a TypeError), reload pandas and retry once.
try:
    df = pd.read_csv(DATA_PATH)
except TypeError:
    import importlib
    importlib.reload(pd)
    df = pd.read_csv(DATA_PATH)

print("Loaded dataset shape:", df.shape)

# `display` is injected by IPython/Jupyter only; fall back to plain printing
# so the same code also runs as a regular script.
try:
    display(df.head())
except NameError:
    print(df.head())

# ---------- 2. Preprocess ----------
if 'diagnosis' not in df.columns:
    raise ValueError("Column 'diagnosis' not found. Check dataset.")

# Encode diagnosis: M -> 1, B -> 0.
# Series.map turns every value missing from the dict into NaN, so validate
# the result here instead of letting NaN labels fail later in the split.
encoded_diagnosis = df['diagnosis'].map({'M': 1, 'B': 0})
unmapped = encoded_diagnosis.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'diagnosis'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'diagnosis' (expected 'M' or 'B'): {bad_values}"
    )
df['diagnosis'] = encoded_diagnosis

# Drop 'id' and 'Unnamed: 32' (either spelling) if present; absent columns are ignored.
df.drop(columns=['id', 'Unnamed: 32', 'Unnamed:32'], errors='ignore', inplace=True)

# Separate features & target
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# ---------- 3./4. Train/Test Split (80/20) + Min-Max Normalization ----------
# Split the RAW features first. Fitting the scaler before splitting would let
# the test rows influence the per-feature min/max applied to the training
# data (data leakage) and bias the reported test metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Scaler for the supervised model: fitted on the training rows only, then
# applied unchanged to the held-out rows (test values may fall slightly
# outside [0, 1]).
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full-data scaling for the unsupervised clustering step, which uses every
# row and has no held-out evaluation. A separate scaler keeps `scaler`
# train-only.
X_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(X), columns=X.columns, index=X.index
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ---------- 5. KMeans clustering (k=2) ----------
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Compare cluster assignments to true labels
cluster_df = pd.DataFrame({'diagnosis': y.values, 'cluster': clusters})
contingency = pd.crosstab(cluster_df['cluster'], cluster_df['diagnosis'])
print("\nContingency table (cluster vs diagnosis):")
print(contingency)

# Map each cluster to the majority class inside it. The contingency table
# already holds the per-cluster class counts, so the majority class is the
# column with the largest count in each row (the first such column on a tie).
# Keys and values are cast to plain ints so the printed mapping does not
# show numpy scalar reprs.
cluster_to_label = {
    int(cluster_id): int(majority_label)
    for cluster_id, majority_label in contingency.idxmax(axis=1).items()
}
print("Cluster -> majority label mapping:", cluster_to_label)

# Translate every cluster id into its mapped label, then score against the truth.
cluster_pred_label = cluster_df['cluster'].map(cluster_to_label).to_numpy()
clustering_accuracy = (cluster_pred_label == y.values).mean()
print(f"Clustering accuracy (after mapping clusters to labels): {clustering_accuracy:.4f}")

# ---------- 6. KNN classifier (k=5) ----------
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# ---------- 7. Evaluation ----------
# All four scorers share the (y_true, y_pred) call signature.
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print("\nKNN (k=5) Evaluation on test set:")
for metric_name, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion Matrix (rows=true, cols=predicted):")
cm_table = pd.DataFrame(
    cm,
    index=['True 0 (B)', 'True 1 (M)'],
    columns=['Pred 0 (B)', 'Pred 1 (M)'],
)
print(cm_table)

print("\nClassification Report:")
report_text = classification_report(
    y_test, y_pred, target_names=['Benign (0)', 'Malignant (1)']
)
print(report_text)

# ---------- 8. Save outputs ----------
# Confusion matrix figure, drawn through the object-oriented matplotlib API.
class_names = ['Benign (0)', 'Malignant (1)']
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(cm, interpolation='nearest')
ax.set_title('Confusion matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
ax.set_xticks([0, 1])
ax.set_xticklabels(class_names)
ax.set_yticks([0, 1])
ax.set_yticklabels(class_names)
# Write each count into its cell; x is the column (predicted), y the row (true).
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, count, ha='center', va='center')
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"))
plt.close(fig)

# Save preview CSV of test predictions (first 20 rows only).
test_results = X_test.assign(true=y_test.values, pred=y_pred)
preview_path = os.path.join(OUT_DIR, "test_predictions_preview.csv")
test_results.head(20).to_csv(preview_path, index=False)

# Create a simple placeholder notebook so the project can be opened in Colab.
# NOTE(review): `notebook_code` is not inserted into the generated notebook
# below; it is kept only so the name stays defined — confirm whether it
# should become a cell.
notebook_code = """# Breast Cancer Diagnosis - KNN & KMeans

# (The notebook code is the same pipeline as the script. Upload Dataset.csv to the notebook path and run.)
"""
nb = nbf.v4.new_notebook()
nb['cells'] = [
    nbf.v4.new_markdown_cell("# Breast Cancer Diagnosis Prediction\nThis notebook reproduces the preprocessing, KMeans clustering and KNN classifier evaluation."),
    nbf.v4.new_code_cell("# Paste the same pipeline code here if you want to run it step by step in Colab.")
]
nb_path = os.path.join(OUT_DIR, "breast_cancer_knn_kmeans.ipynb")
# Notebooks are UTF-8 JSON documents; do not rely on the platform's locale
# encoding (e.g. cp1252 on Windows) when writing them.
with open(nb_path, 'w', encoding='utf-8') as f:
    nbf.write(nb, f)

# README
readme_text = """# Breast Cancer Diagnosis (KNN & KMeans)

Steps & files included:
- Dataset.csv (upload to Colab / local folder)
- This notebook: breast_cancer_knn_kmeans.ipynb
- confusion_matrix.png
- test_predictions_preview.csv

How to run:
1. Open the notebook or run this script.
2. Ensure Dataset.csv is available in the working directory.
3. Run all cells or execute the script.
"""
# Explicit encoding keeps the README byte-identical across platforms.
with open(os.path.join(OUT_DIR, "README.md"), 'w', encoding='utf-8') as f:
    f.write(readme_text)

# Zip up outputs. The archive is written to the working directory (outside
# OUT_DIR), so the walk below never picks up the archive itself.
zip_path = "breast_cancer_assignment_submission.zip"
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk(OUT_DIR):
        for file in files:
            full_path = os.path.join(root, file)
            # Store paths relative to OUT_DIR: top-level files keep their bare
            # name, while files in subfolders keep their subfolder instead of
            # colliding under one flattened name.
            z.write(full_path, arcname=os.path.relpath(full_path, OUT_DIR))

# "\n" (not "\\n"): emit a real blank line rather than a literal backslash-n.
print("\nSaved outputs to folder:", OUT_DIR)
print("Zipped package:", zip_path)


Using dataset at: ..\Dataset\Dataset.csv
Loaded dataset shape: (569, 33)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


Train shape: (455, 30) Test shape: (114, 30)

Contingency table (cluster vs diagnosis):
diagnosis    0    1
cluster            
0            9  180
1          348   32
Cluster -> majority label mapping: {np.int32(0): 1, np.int32(1): 0}
Clustering accuracy (after mapping clusters to labels): 0.9279

KNN (k=5) Evaluation on test set:
Accuracy: 0.9649
Precision: 1.0000
Recall: 0.9048
F1-score: 0.9500

Confusion Matrix (rows=true, cols=predicted):
            Pred 0 (B)  Pred 1 (M)
True 0 (B)          72           0
True 1 (M)           4          38

Classification Report:
               precision    recall  f1-score   support

   Benign (0)       0.95      1.00      0.97        72
Malignant (1)       1.00      0.90      0.95        42

     accuracy                           0.96       114
    macro avg       0.97      0.95      0.96       114
 weighted avg       0.97      0.96      0.96       114

\nSaved outputs to folder: breast_cancer_assignment_output
Zipped package: breast_cancer_a