Lab 3: Principal Component Analysis (PCA)
This script demonstrates PCA for dimensionality reduction.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_breast_cancer


In [None]:
def perform_pca_on_iris():
    """Project the standardized Iris data onto two principal components.

    Prints the original and reduced shapes plus the explained-variance
    figures, writes a scatter plot of the projection (one colour per target
    class) to 'lab3_iris_pca.png', and hands back the projection together
    with the fitted model.

    Returns:
        tuple: ``(X_pca, pca)`` - the two-column projected array and the
        fitted ``PCA`` instance.
    """
    banner = "=" * 50
    print(banner)
    print("PCA on Iris Dataset")
    print(banner)

    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    print(f"\nOriginal data shape: {features.shape}")
    print(f"Features: {dataset.feature_names}")

    # Standardize the features before fitting PCA.
    standardized = StandardScaler().fit_transform(features)

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(standardized)
    ratios = pca.explained_variance_ratio_

    print(f"\nReduced data shape: {X_pca.shape}")
    print(f"\nExplained variance ratio: {ratios}")
    print(f"Total variance explained: {sum(ratios):.4f}")

    # One scatter series per class so each gets its own colour and legend entry.
    fig, ax = plt.subplots(figsize=(10, 6))
    for class_id, color in enumerate(('red', 'green', 'blue')):
        members = labels == class_id
        ax.scatter(X_pca[members, 0], X_pca[members, 1],
                   c=color, label=dataset.target_names[class_id], alpha=0.6)

    ax.set_xlabel(f'PC1 ({ratios[0]:.2%} variance)')
    ax.set_ylabel(f'PC2 ({ratios[1]:.2%} variance)')
    ax.set_title('PCA of Iris Dataset')
    ax.legend()
    ax.grid(True)
    fig.savefig('lab3_iris_pca.png')
    plt.close(fig)
    print("\nPCA visualization saved as 'lab3_iris_pca.png'")

    return X_pca, pca


In [None]:
def analyze_components():
    """Report how variance is spread across PCA components of the breast-cancer data.

    Fits a PCA with all components to the standardized dataset, prints the
    variance ratio of the first five components and the number of components
    at which the cumulative ratio first reaches 95%, then saves a two-panel
    figure (bar chart of the first ten ratios, cumulative curve with a 95%
    reference line) to 'lab3_variance_analysis.png'.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Component Analysis")
    print(rule)

    features = load_breast_cancer().data
    print(f"\nOriginal data shape: {features.shape}")

    # Fit PCA with all components on the standardized features.
    pca_full = PCA()
    pca_full.fit(StandardScaler().fit_transform(features))

    ratios = pca_full.explained_variance_ratio_
    cumulative = np.cumsum(ratios)

    print("\nVariance explained by each component:")
    for rank, ratio in enumerate(ratios[:5], start=1):
        print(f"PC{rank}: {ratio:.4f}")

    # argmax over the boolean mask yields the index of the first True entry;
    # +1 converts that zero-based index into a component count.
    n_components_95 = np.argmax(cumulative >= 0.95) + 1
    print(f"\nComponents needed for 95% variance: {n_components_95}")

    fig, (ax_bar, ax_cum) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: individual ratios of the leading (at most ten) components.
    top_ratios = ratios[:10]
    ax_bar.bar(range(1, len(top_ratios) + 1), top_ratios)
    ax_bar.set_xlabel('Principal Component')
    ax_bar.set_ylabel('Variance Explained')
    ax_bar.set_title('Explained Variance by Component (Top 10)')
    ax_bar.grid(True, alpha=0.3)

    # Right panel: running total against the 95% reference line.
    ax_cum.plot(range(1, len(cumulative) + 1), cumulative, 'bo-')
    ax_cum.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
    ax_cum.set_xlabel('Number of Components')
    ax_cum.set_ylabel('Cumulative Variance Explained')
    ax_cum.set_title('Cumulative Variance Explained')
    ax_cum.legend()
    ax_cum.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('lab3_variance_analysis.png')
    plt.close(fig)
    print("\nVariance analysis saved as 'lab3_variance_analysis.png'")


In [None]:
def manual_pca_implementation():
    """Implement PCA by hand via eigendecomposition and compare it with sklearn.

    Generates 100 samples of 3 standard-normal features from a fixed seed,
    mean-centers them, eigendecomposes their covariance matrix, projects the
    centered data onto the two leading eigenvectors, and prints the first
    rows next to sklearn's ``PCA`` output for the same centered data.

    Returns:
        None. All results are printed.
    """
    print("\n" + "=" * 50)
    print("Manual PCA Implementation")
    print("=" * 50)

    # Create sample data (fixed seed so the printed numbers are reproducible)
    np.random.seed(42)
    X = np.random.randn(100, 3)

    print(f"\nOriginal data shape: {X.shape}")

    # Step 1: Center the data. Only the per-feature mean is removed; the
    # features are NOT rescaled to unit variance, so this is centering rather
    # than full standardization.
    X_centered = X - np.mean(X, axis=0)

    # Step 2: Calculate covariance matrix (np.cov expects variables in rows,
    # hence the transpose)
    cov_matrix = np.cov(X_centered.T)
    print(f"\nCovariance matrix shape: {cov_matrix.shape}")

    # Step 3: Calculate eigenvalues and eigenvectors.
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
    # eig gives no such guarantee.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Step 4: Sort by eigenvalues, largest first, keeping eigenvector columns
    # aligned with their eigenvalues
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    print(f"\nEigenvalues: {eigenvalues}")
    print(f"Variance explained: {eigenvalues / eigenvalues.sum()}")

    # Step 5: Project data onto principal components
    n_components = 2
    X_pca_manual = X_centered.dot(eigenvectors[:, :n_components])

    print(f"\nReduced data shape: {X_pca_manual.shape}")

    # Compare with sklearn, fitted on the same centered data
    pca_sklearn = PCA(n_components=n_components)
    X_pca_sklearn = pca_sklearn.fit_transform(X_centered)

    print(f"\nManual PCA first 5 rows:\n{X_pca_manual[:5]}")
    print(f"\nSklearn PCA first 5 rows:\n{X_pca_sklearn[:5]}")
    # An eigenvector is only defined up to sign, so a column may be negated
    # relative to sklearn's result.
    print("\nNote: Signs may differ but magnitudes should be similar")


In [None]:
def dimensionality_reduction_comparison():
    """Show how the number of kept components affects variance and reconstruction.

    For 1, 2 and 3 components, fits PCA to the standardized Iris data and
    prints the reduced shape, the summed explained-variance ratio, and the
    mean squared error between the standardized data and its reconstruction
    from the reduced representation.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("Dimensionality Reduction Comparison")
    print(divider)

    raw = load_iris().data
    scaled = StandardScaler().fit_transform(raw)

    print(f"\nOriginal shape: {raw.shape}")

    for n_comp in (1, 2, 3):
        model = PCA(n_components=n_comp)
        reduced = model.fit_transform(scaled)
        kept_variance = sum(model.explained_variance_ratio_)

        # Map the reduced data back to feature space and measure what was lost.
        restored = model.inverse_transform(reduced)
        mse = np.mean((scaled - restored) ** 2)

        print(f"\nWith {n_comp} component(s):")
        print(f"  Reduced shape: {reduced.shape}")
        print(f"  Variance explained: {kept_variance:.4f}")
        print(f"  Reconstruction error: {mse:.6f}")


In [None]:
def main():
    """Run every Lab 3 PCA demonstration in order, framed by banners."""
    rule = "=" * 50

    print("\n" + rule)
    print("Lab 3: Principal Component Analysis (PCA)")
    print(rule)

    # Demonstrations in presentation order: Iris projection, variance
    # analysis, hand-written PCA, then the component-count comparison.
    demos = (
        perform_pca_on_iris,
        analyze_components,
        manual_pca_implementation,
        dimensionality_reduction_comparison,
    )
    for demo in demos:
        demo()

    print("\n" + rule)
    print("Lab 3 Complete!")
    print(rule)


In [None]:
# Run the full lab only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
