# Week 7 Practice Solution 2: PCA on Breast Cancer Wisconsin

**Dataset**: Breast Cancer (30 features, 569 samples, 2 classes: malignant/benign)

We use PCA to analyze explained variance, choose the number of components, visualize the data in 2D and 3D, and measure reconstruction error.

## 1. Imports and Load Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Global plotting / RNG setup for the cells that follow.
# The matplotlib 'default' style is applied first, then seaborn's 'husl' palette.
# NOTE(review): presumably applying the style after the palette would override
# the palette — keep this order unless confirmed otherwise.
plt.style.use('default')
sns.set_palette('husl')
# Seed NumPy's global RNG with a fixed value so repeated top-to-bottom runs
# start from the same random state.
np.random.seed(42)

In [None]:
# Load scikit-learn's Breast Cancer dataset and unpack the pieces used below.
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target  # labels: 0 = malignant, 1 = benign
feature_names, target_names = bc.feature_names, bc.target_names

# Quick sanity check: matrix dimensions, label names, and samples per class.
class_counts = np.bincount(y)
print('Shape:', X.shape)
print('Classes:', target_names)
print('Class counts:', class_counts)

## 2. Standardize and Full PCA

In [None]:
# Standardize the features before PCA so no single feature's scale dominates.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA with no n_components argument, fitted on the standardized data.
pca_full = PCA().fit(X_scaled)

variance_ratios = pca_full.explained_variance_ratio_
n_components = len(variance_ratios)
cumvar = np.cumsum(variance_ratios)  # running total of explained variance

# Report the leading (at most ten) per-component ratios, numbered from PC1.
print('Explained variance ratio (first 10):')
for pc_number, ratio in enumerate(variance_ratios[:10], start=1):
    print(f'  PC{pc_number}: {ratio:.4f}')
print('  ...')
print(f'\nCumulative variance (first 5): {np.round(cumvar[:5], 4)}')

## 3. Variance Plots and Components for 95% / 99%

In [None]:
# Two panels: per-component variance (left) and its cumulative total (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_bar, ax_cum = axes
component_ids = range(1, n_components + 1)
every_other_tick = range(1, n_components + 1, 2)  # tick on every second component

# Left panel: bar chart of the individual explained-variance ratios.
ax_bar.bar(component_ids, pca_full.explained_variance_ratio_, alpha=0.7, color='steelblue')
ax_bar.set_xlabel('Principal Component')
ax_bar.set_ylabel('Explained Variance Ratio')
ax_bar.set_title('Explained Variance per Component')
ax_bar.set_xticks(every_other_tick)
ax_bar.grid(True, alpha=0.3, axis='y')

# Right panel: cumulative curve with the 95% and 99% reference lines.
ax_cum.plot(component_ids, cumvar, 'o-', color='coral')
for level, line_color, line_label in ((0.95, 'r', '95%'), (0.99, 'g', '99%')):
    ax_cum.axhline(level, color=line_color, linestyle='--', label=line_label)
ax_cum.set_xlabel('Number of Components')
ax_cum.set_ylabel('Cumulative Explained Variance')
ax_cum.set_title('Cumulative Explained Variance')
ax_cum.legend()
ax_cum.grid(True, alpha=0.3)
ax_cum.set_xticks(every_other_tick)
plt.tight_layout()
plt.show()

# argmax on the boolean mask gives the first index at or above the threshold;
# adding 1 converts that 0-based index into a component count.
reaches_95 = cumvar >= 0.95
reaches_99 = cumvar >= 0.99
n_95 = np.argmax(reaches_95) + 1
n_99 = np.argmax(reaches_99) + 1
print(f'Components for 95% variance: {n_95} (actual: {cumvar[n_95 - 1]:.4f})')
print(f'Components for 99% variance: {n_99} (actual: {cumvar[n_99 - 1]:.4f})')
print(f'Chosen n_components: {n_95} (95% rule).')

## 4. 2D and 3D Visualization

In [None]:
# Project the standardized data onto its first two principal components.
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)
pc1_ratio, pc2_ratio = pca_2d.explained_variance_ratio_

# Scatter of PC1 vs PC2, coloured by the class label in y.
fig_2d, ax_2d = plt.subplots(figsize=(10, 6))
class_points = ax_2d.scatter(
    X_2d[:, 0], X_2d[:, 1],
    c=y, cmap='viridis', s=50, alpha=0.7, edgecolors='k',
)
ax_2d.set_xlabel(f'PC1 ({pc1_ratio*100:.1f}%)')
ax_2d.set_ylabel(f'PC2 ({pc2_ratio*100:.1f}%)')
ax_2d.set_title('Breast Cancer: 2D PCA (colored by malignant/benign)')
fig_2d.colorbar(class_points, ax=ax_2d, label='Class (0=malignant, 1=benign)')
ax_2d.grid(True, alpha=0.3)
fig_2d.tight_layout()
plt.show()

# Share of total variance captured by the two retained components.
variance_pct_2d = pca_2d.explained_variance_ratio_.sum() * 100
print(f'2 components explain {variance_pct_2d:.2f}% of variance.')

In [None]:
# NOTE(review): Axes3D is never referenced by name below; presumably the import
# is kept for its side effect of registering the '3d' projection — confirm
# before removing.
from mpl_toolkits.mplot3d import Axes3D

# Project the standardized data onto its first three principal components.
pca_3d = PCA(n_components=3)
X_3d = pca_3d.fit_transform(X_scaled)
pc_x, pc_y, pc_z = X_3d.T  # one coordinate array per retained component

# Axis captions carry each component's share of the variance.
axis_labels = [
    f'PC{rank} ({share*100:.1f}%)'
    for rank, share in enumerate(pca_3d.explained_variance_ratio_, start=1)
]

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pc_x, pc_y, pc_z, c=y, cmap='viridis', s=50, alpha=0.7)
ax.set_xlabel(axis_labels[0])
ax.set_ylabel(axis_labels[1])
ax.set_zlabel(axis_labels[2])
ax.set_title('Breast Cancer: 3D PCA')
plt.tight_layout()
plt.show()

## 5. Reconstruction Error vs n_components

In [None]:
n_comp_list = [2, 5, 10, 15, 20, 30]


def _reconstruction_mse(data, n_keep):
    """Return the mean squared error of a PCA round trip keeping n_keep components.

    A fresh PCA is fitted on ``data``, the data is projected and mapped back
    with ``inverse_transform``, and the squared differences are averaged over
    every entry of the matrix.
    """
    model = PCA(n_components=n_keep)
    restored = model.inverse_transform(model.fit_transform(data))
    residual = data - restored
    return np.mean(residual ** 2)


# One error value per candidate component count, in the same order as n_comp_list.
mse_list = [_reconstruction_mse(X_scaled, n_keep) for n_keep in n_comp_list]

fig_err, ax_err = plt.subplots(figsize=(10, 5))
ax_err.plot(n_comp_list, mse_list, 'o-', linewidth=2, markersize=8)
ax_err.set_xlabel('Number of Components')
ax_err.set_ylabel('Mean Squared Reconstruction Error')
ax_err.set_title('Reconstruction Error vs n_components')
ax_err.set_xticks(n_comp_list)  # tick exactly at the evaluated counts
ax_err.grid(True, alpha=0.3)
fig_err.tight_layout()
plt.show()

print('Reconstruction error decreases as we add more components.')
print('Diminishing returns after ~10–15 components for this dataset.')

## 6. Summary

- **95% / 99% variance**: The required component counts are printed in Section 3 from the cumulative explained-variance curve; for this dataset, 95% typically needs ~7–10 components, and 99% needs more.
- **Choice of n_components**: 95% rule is a common choice; reconstruction error plot shows the trade-off between compression and fidelity.
- **Visualization**: 2D/3D PCA reveals separation between malignant and benign; most variance is in the first few components.
- **Reconstruction**: MSE drops quickly with first components, then levels off—consistent with explained variance being concentrated in early PCs.