Dimensionality reduction techniques can significantly enhance the performance of machine learning models by simplifying datasets, reducing computational costs, and mitigating overfitting. Let's explore several of these techniques on an artificially widened version of the Iris dataset (the original features plus noisy copies of them) and observe their impact on a model's performance.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
from sklearn.datasets import load_iris

# Load the Iris data set bundled with scikit-learn.
data = load_iris()
X, y = data.data, data.target

# Seeded generator: without it the synthetic noise below (and therefore every
# accuracy reported later in the notebook) changes on each Restart & Run All.
rng = np.random.default_rng(42)

# Artificially create a high-dimensional dataset by appending two noisy copies
# of the original features (noise std 0.1 and 0.2), tripling the column count.
X_high_dim = np.hstack([
    X,
    X + rng.normal(0, 0.1, X.shape),
    X + rng.normal(0, 0.2, X.shape),
])

# Split data into training and test sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_high_dim, y, test_size=0.2, random_state=42)


In [3]:
# Learn the per-feature mean and variance from the training split only, so no
# test-set statistics leak into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived scaling to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Apply Dimensionality Reduction Techniques

In [4]:
# Reduce the features to two principal components. The projection is learned
# from the training split and then reused unchanged for the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train and evaluate a random forest on the 2-D PCA representation.
clf = RandomForestClassifier(random_state=42).fit(X_train_pca, y_train)
y_pred_pca = clf.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test, y_pred_pca)
print("PCA Accuracy:", pca_accuracy)


PCA Accuracy: 0.9


In [5]:
# LDA is supervised: the projection is fitted with the training labels, then
# the same projection is applied to the test split.
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Train and evaluate model.
# A fresh classifier is created here instead of re-fitting the instance from
# the PCA cell, so this cell does not depend on (or overwrite) a model fitted
# in another cell. The fixed random_state keeps the result unchanged.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_lda, y_train)
y_pred_lda = clf.predict(X_test_lda)
print("LDA Accuracy:", accuracy_score(y_test, y_pred_lda))


LDA Accuracy: 0.9666666666666667


In [7]:
# Extract two independent components; random_state fixes FastICA's
# initialisation so the decomposition is repeatable.
ica = FastICA(n_components=2, random_state=42)
X_train_ica = ica.fit_transform(X_train)
X_test_ica = ica.transform(X_test)

# Train and evaluate model.
# A fresh classifier is created here instead of re-fitting an instance left
# over from an earlier cell, so this cell is self-contained. The fixed
# random_state keeps the result unchanged.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_ica, y_train)
y_pred_ica = clf.predict(X_test_ica)
print("ICA Accuracy:", accuracy_score(y_test, y_pred_ica))


ICA Accuracy: 0.9


In [8]:
# Univariate feature selection: keep the 10 columns with the highest f_classif
# scores, computed on the training split only.
select_k_best = SelectKBest(f_classif, k=10)
X_train_kbest = select_k_best.fit_transform(X_train, y_train)
X_test_kbest = select_k_best.transform(X_test)

# Train and evaluate model.
# A fresh classifier is created here instead of re-fitting an instance left
# over from an earlier cell, so this cell is self-contained. The fixed
# random_state keeps the result unchanged.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_kbest, y_train)
y_pred_kbest = clf.predict(X_test_kbest)
print("SelectKBest Accuracy:", accuracy_score(y_test, y_pred_kbest))


SelectKBest Accuracy: 1.0


In [9]:
# Recursive feature elimination down to 10 features, ranked by its own random
# forest. That estimator is separate from the classifier evaluated below.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Train and evaluate model.
# A fresh classifier is created here instead of re-fitting an instance left
# over from an earlier cell, so this cell is self-contained. The fixed
# random_state keeps the result unchanged.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_rfe, y_train)
y_pred_rfe = clf.predict(X_test_rfe)
print("RFE Accuracy:", accuracy_score(y_test, y_pred_rfe))


RFE Accuracy: 1.0


After applying each dimensionality reduction technique, you can compare the model's performance to understand which method works best for your dataset. Here's a summary of the accuracies obtained:


In [10]:
# Test-set predictions produced by each technique's cell, keyed by the label
# used in the summary (insertion order fixes the printed order).
predictions = {
    "PCA": y_pred_pca,
    "LDA": y_pred_lda,
    "ICA": y_pred_ica,
    "SelectKBest": y_pred_kbest,
    "RFE": y_pred_rfe,
}

# Score every technique against the same held-out labels.
results = {
    technique: accuracy_score(y_test, y_pred)
    for technique, y_pred in predictions.items()
}

print(results)

{'PCA': 0.9, 'LDA': 0.9666666666666667, 'ICA': 0.9, 'SelectKBest': 1.0, 'RFE': 1.0}
