In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#Loading the breast cancer dataset from sklearn datasets

cancer = load_breast_cancer()
print(list(cancer.keys()))
print("\n")
print("The feature names are: \n", cancer.feature_names)
print("\n")
print("Number of features: {}". format(len(cancer.feature_names)))
print("The classes of data: \n", cancer.target_names)
print("Sample counts per class:\n{}".format({n: v for n, v in zip(cancer.target_names, np.bincount(cancer.target))}))

['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']


The feature names are: 
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


Number of features: 30
The classes of data: 
 ['malignant' 'benign']
Sample counts per class:
{np.str_('malignant'): np.int64(212), np.str_('benign'): np.int64(357)}


In [2]:
original_data=cancer.data
labels=cancer.target

print("The shape of the attributes data: \n", original_data.shape)
print(pd.DataFrame(original_data).iloc[:, :30].describe().loc[['mean', 'std']].round(2))

scaler_op=StandardScaler() #  Making each feature zero mean and unit variance
# PCA step 1:
scaled_data=scaler_op.fit_transform(original_data)

print(pd.DataFrame(scaled_data).iloc[:, :30].describe().loc[['mean', 'std']].round(2))

pca = PCA(n_components=5)  #keep the principal components of the data

# Steps 2 + 3 + 4
pca.fit(scaled_data) #fit PCA model to breast cancer data
# Steps 5 : Project data onto the principal components
pca_features = pca.transform(scaled_data)

print("Original shape (before PCA): {}".format(str(scaled_data.shape)))
print("Reduced shape (After PCA): {}".format(str(pca_features.shape)))

# Access the principal components
principal_components = pca.components_

# Access the explained variance (eigenvalues)
eigenvalues = pca.explained_variance_

# Print the principal components and eigenvalues
#print("Principal Components:")
#print(principal_components)
print("\nEigenvalues:")
print(eigenvalues)

The shape of the attributes data: 
 (569, 30)
         0      1      2       3     4     5     6     7     8     9   ...  \
mean  14.13  19.29  91.97  654.89  0.10  0.10  0.09  0.05  0.18  0.06  ...   
std    3.52   4.30  24.30  351.91  0.01  0.05  0.08  0.04  0.03  0.01  ...   

         20     21      22      23    24    25    26    27    28    29  
mean  16.27  25.68  107.26  880.58  0.13  0.25  0.27  0.11  0.29  0.08  
std    4.83   6.15   33.60  569.36  0.02  0.16  0.21  0.07  0.06  0.02  

[2 rows x 30 columns]
       0    1    2    3    4    5    6    7    8    9   ...   20   21   22  \
mean -0.0 -0.0 -0.0 -0.0  0.0 -0.0 -0.0  0.0 -0.0 -0.0  ... -0.0  0.0 -0.0   
std   1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  1.0  1.0  1.0   

       23   24   25   26   27   28   29  
mean  0.0 -0.0 -0.0  0.0 -0.0 -0.0  0.0  
std   1.0  1.0  1.0  1.0  1.0  1.0  1.0  

[2 rows x 30 columns]
Original shape (before PCA): (569, 30)
Reduced shape (After PCA): (569, 5)

Eigenvalues:
[13.

In [3]:
# Prepare to store results
results = []

# KNN on original data
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(original_data, labels, test_size=0.2, random_state=42)
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_orig, y_train_orig)
    y_pred = knn.predict(X_test_orig)
    accuracy = accuracy_score(y_test_orig, y_pred)
    results.append({'K': k, 'Accuracy (Original)': accuracy})

# KNN on PCA-reduced data
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(pca_features, labels, test_size=0.2, random_state=42)
for i, k in enumerate(range(1, 11)):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_pca, y_train_pca)
    y_pred = knn.predict(X_test_pca)
    accuracy = accuracy_score(y_test_pca, y_pred)
    results[i]['Accuracy (PCA)'] = accuracy

# Convert to DataFrame and display
results_df = pd.DataFrame(results)
results_df['K'] = results_df['K'].astype(int)
results_df[['Accuracy (Original)', 'Accuracy (PCA)']] = results_df[['Accuracy (Original)', 'Accuracy (PCA)']].apply(lambda x: (x*100).round(2))
print("\nKNN Accuracy Comparison Table:")
print(results_df.to_string(index=False))


KNN Accuracy Comparison Table:
 K  Accuracy (Original)  Accuracy (PCA)
 1                92.98           94.74
 2                92.98           94.74
 3                92.98           95.61
 4                93.86           94.74
 5                95.61           94.74
 6                96.49           95.61
 7                95.61           96.49
 8                95.61           95.61
 9                95.61           96.49
10                97.37           96.49
