In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.decomposition import NMF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np

# **Reducing Features Using Principal Components (PCA)**

In [None]:
# Fetch the digits dataset that ships with scikit-learn
digits = datasets.load_digits()

# Rescale every feature to zero mean and unit variance before running PCA
scaler = StandardScaler()
features = scaler.fit_transform(digits.data)

# A float n_components in (0, 1) tells PCA to keep however many components
# are needed to retain that share of the variance (99% here)
pca = PCA(
    n_components=0.99,
    whiten=True,
)

# Fit the PCA and project the standardized features in a single step
features_pca = pca.fit_transform(features)

# Compare the dimensionality before and after the reduction
n_features_before = features.shape[1]
n_features_after = features_pca.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 64
Reduced number of features: 54


**The output shows that PCA lets us reduce the dimensionality by 10
features while still retaining 99% of the information (variance) in the feature matrix.**

# **Reducing Features When Data Is Linearly Inseparable**

In [None]:
# Generate noisy concentric circles: data that no straight line can separate.
# Only the feature matrix is kept; the class labels are discarded.
features, _ = datasets.make_circles(
    n_samples=1000,
    random_state=1,
    noise=0.1,
    factor=0.1,
)

# Kernel PCA with a radial basis function (RBF) kernel, reduced to one component
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)

# Fit the model and project the data onto that single component
features_kpca = kpca.fit_transform(features)

# Compare the dimensionality before and after the reduction
n_features_before = features.shape[1]
n_features_after = features_kpca.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 2
Reduced number of features: 1


# **Reducing Features by Maximizing Class Separability**

In [None]:
# Load the Iris flower dataset and pull out the feature matrix and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

# LDA is supervised: fitting needs the class labels as well as the features
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(features, target)

# Project the features onto the single discriminant component that was kept
features_lda = lda.transform(features)

# Compare the dimensionality before and after the reduction
n_features_before = features.shape[1]
n_features_after = features_lda.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 4
Reduced number of features: 1


In [None]:
# Ratio of variance explained by each component kept by the fitted LDA
lda.explained_variance_ratio_

array([0.9912126])

**Specifically, we can run LinearDiscriminantAnalysis with n_components set to
None to return the ratio of variance explained by every component feature, then calculate
how many components are required to get above some threshold of variance
explained (often 0.95 or 0.99):**

In [None]:
# Fit an LDA that keeps every available component (n_components=None) so the
# complete list of explained-variance ratios can be inspected
lda = LinearDiscriminantAnalysis(n_components=None)

# fit() returns the fitted estimator, not data, so chain transform() to make
# features_lda actually hold the projected features
features_lda = lda.fit(features, target).transform(features)

# Ratio of variance explained by each component, in component order
lda_var_ratios = lda.explained_variance_ratio_

# Create function
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count how many leading components are needed to reach ``goal_var``.

    The ratios are accumulated in the order given, and counting stops at the
    first component whose running total is greater than or equal to
    ``goal_var``.

    Args:
        var_ratio: Iterable of per-component explained-variance ratios.
        goal_var: Cumulative explained variance to reach.

    Returns:
        The number of components consumed when the goal was first met. If the
        goal is never met, this is the total number of ratios supplied (0 for
        an empty iterable).
    """
    # Stays 0 when var_ratio is empty, because the loop body never runs
    count = 0
    running_total = 0.0

    for count, ratio in enumerate(var_ratio, start=1):
        running_total += ratio

        # Stop as soon as the cumulative ratio reaches the goal
        if running_total >= goal_var:
            break

    return count

# Number of LDA components needed to explain at least 95% of the variance
select_n_components(var_ratio=lda_var_ratios, goal_var=0.95)

1

# **Reducing Features Using Matrix Factorization**

In [None]:
# Fetch the digits dataset and take its feature matrix as-is; it is left
# unstandardized because NMF requires non-negative input
digits = datasets.load_digits()
features = digits.data

# Factorize into 10 components, seeded so the result is reproducible
nmf = NMF(
    n_components=10,
    random_state=1,
)

# Fit the factorization and obtain the reduced representation in one step
features_nmf = nmf.fit_transform(features)

# Compare the dimensionality before and after the reduction
n_features_before = features.shape[1]
n_features_after = features_nmf.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features:", n_features_after)

Original number of features: 64
Reduced number of features: 10




# **Reducing Features on Sparse Data**

In [None]:
# Load the data
digits = datasets.load_digits()

# Standardize the feature matrix
features = StandardScaler().fit_transform(digits.data)

# Store the standardized features in compressed sparse row (CSR) format
features_sparse = csr_matrix(features)

# Create a TSVD that keeps 10 components. TruncatedSVD uses a randomized
# solver by default, so seed it to get the same result on every run.
tsvd = TruncatedSVD(n_components=10, random_state=1)

# Fit the TSVD on the sparse matrix, then project that matrix
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)

# Show results
print("Original number of features:", features_sparse.shape[1])
print("Reduced number of features:", features_sparse_tsvd.shape[1])

Original number of features: 64
Reduced number of features: 10


In [None]:
# Total explained-variance ratio of the first three TSVD components
tsvd.explained_variance_ratio_[:3].sum()

np.float64(0.3003938538720454)

**We can automate the process by creating a function that runs TSVD with n_components
set to one less than the number of original features and then calculates the number
of components that explain a desired amount of the original data’s variance:**

In [None]:
# Create a TSVD with one fewer component than there are original features,
# seeded so the randomized solver gives the same result on every run
tsvd = TruncatedSVD(n_components=features_sparse.shape[1] - 1, random_state=1)

# Fit on the sparse matrix this section is about. fit() returns the estimator
# itself, so chain transform() to make features_tsvd hold the reduced features.
features_tsvd = tsvd.fit(features_sparse).transform(features_sparse)

# Ratio of variance explained by each component, in component order
tsvd_var_ratios = tsvd.explained_variance_ratio_

# Create a function
def select_n_components(var_ratio, goal_var):
    """Return how many leading components are needed to reach ``goal_var``.

    Walks ``var_ratio`` in order, keeping a running sum of the ratios, and
    returns the count of components seen as soon as that sum is greater than
    or equal to ``goal_var``. If the sum never gets there, the total number
    of ratios is returned (0 for an empty iterable).
    """
    n_components = 0
    explained_so_far = 0.0

    for ratio in var_ratio:
        n_components += 1
        explained_so_far += ratio

        # Goal reached: the components counted so far are enough
        if explained_so_far >= goal_var:
            return n_components

    # Goal never reached: every supplied component was counted
    return n_components

# Number of TSVD components needed to explain at least 95% of the variance
select_n_components(var_ratio=tsvd_var_ratios, goal_var=0.95)

40

# Important Notes


| Technique      | Type      | Works With        | Uses Labels? | Best For                                      | Notes                              |
| -------------- | --------- | ----------------- | ------------ | --------------------------------------------- | ---------------------------------- |
| **PCA**        | Linear    | Any numeric data  | ❌ No         | Reducing features while preserving variance   | Keeps most variance                |
| **Kernel PCA** | Nonlinear | Complex patterns  | ❌ No         | Nonlinear data (e.g., circular shapes)        | Good for curved data               |
| **LDA**        | Linear    | Classification    | ✅ Yes        | Supervised classification problems            | Maximizes class separation         |
| **NMF**        | Linear    | Non-negative data | ❌ No         | Topic modeling, interpretability (e.g., text) | Used in NLP, interpretable results |
| **TSVD**       | Linear    | Sparse data       | ❌ No         | Text data, large sparse datasets              | Efficient on big text-like data    |


* Use **PCA** if your data is numeric and you want a quick, general-purpose dimensionality reduction.

* Use **Kernel PCA** if PCA doesn’t work well and the data seems nonlinear (e.g., spiral or circular shapes).

* Use **LDA** if you're working on a classification task and want to maximize separation between classes.

* Use **NMF** if your data is non-negative, and you want meaningful components (good for NLP topics).

* Use **TSVD** if you're dealing with sparse, high-dimensional data (like bag-of-words or TF-IDF in text).