
# DR Comparison with Pandas Tables & Summary Dashboard

We evaluate **linear vs non-linear vs semi-linear** dimensionality reduction with **pandas DataFrames** for clear tabular reporting and a concise **summary dashboard**.

**Sections**
1. **Linear techniques on linear data** (10 common linear methods)
2. **Non-linear techniques on non-linear data** (5 common non-linear methods)
3. **Top-3 performers from Sections 1 & 2 on semi-linear data** (6 methods total)

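**Metrics** (computed on the residual $R = X_c - \hat{X}$, where $X_c$ is the centered data and $\hat{X}$ its reconstruction): Frobenius norm $\|R\|_F$, relative Frobenius $\|R\|_F / \|X_c\|_F$, spectral norm $\|R\|_2$, MSE (mean of $R_{ij}^2$), and reconstruction $R^2 = 1 - \|R\|_F^2 / \|X_c\|_F^2$.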


## Imports

In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import norm, pinv
from scipy.linalg import svdvals
import matplotlib.pyplot as plt
from IPython.display import display

from sklearn.decomposition import (
    PCA, IncrementalPCA, TruncatedSVD, FactorAnalysis, FastICA,
    SparsePCA, MiniBatchSparsePCA, DictionaryLearning, KernelPCA
)
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.datasets import make_swiss_roll

np.random.seed(42)

## Utilities

### Metrics

In [2]:
def frobenius_norm(R):
    """Return the Frobenius norm of the matrix R."""
    fro = norm(R, ord='fro')
    return fro

def relative_frobenius(R, Xc):
    """Return the Frobenius norm of R divided by that of Xc.

    A 1e-12 term is added to the denominator so an all-zero Xc does not
    cause a division by zero.
    """
    residual_size = frobenius_norm(R)
    data_size = norm(Xc, 'fro')
    return residual_size / (data_size + 1e-12)

def spectral_norm(R):
    """Return the first value reported by svdvals for R (its largest singular value)."""
    singular_values = svdvals(R)
    return singular_values[0]

def mse(R):
    """Return the mean of the squared entries of R."""
    squared = R ** 2
    return np.mean(squared)

def reconstruction_R2(R, Xc):
    """Return 1 - ||R||_F^2 / ||Xc||_F^2 (higher is better).

    A 1e-12 term is added to the denominator to avoid dividing by zero.
    """
    residual_energy = frobenius_norm(R) ** 2
    total_energy = norm(Xc, 'fro') ** 2
    return 1.0 - residual_energy / (total_energy + 1e-12)

# Column names of the residual metrics reported for every method; the plotting
# and ranking helpers iterate over this list in this order.
METRICS = ['Frobenius', 'RelFro', 'Spectral', 'MSE', 'R2']
# Reduced dimensionalities (k) swept by default in evaluate_methods.
TARGET_DIMS = [2, 4, 6, 8]


### Utilities for results

In [3]:
def results_to_df(results_dict):
    """Convert nested results ``{dim: [row, ...]}`` into two DataFrames.

    Rows containing an ``'Error'`` key go to the error table; all other rows
    go to the score table. The ``Dim`` column is taken from the dictionary
    key, not from the row.

    Returns:
        (df_scores, df_errors). ``df_scores`` has columns Method, Dim,
        Frobenius, RelFro, Spectral, MSE, R2; ``df_errors`` has columns
        Method, Dim, Error. Both always carry their columns, even when they
        have no rows, so callers can sort/group them without a KeyError.
    """
    metric_columns = ['Frobenius', 'RelFro', 'Spectral', 'MSE', 'R2']
    score_columns = ['Method', 'Dim'] + metric_columns
    error_columns = ['Method', 'Dim', 'Error']

    rows_scores = []
    rows_errors = []
    for d, rows in results_dict.items():
        for r in rows:
            if 'Error' in r:
                rows_errors.append({'Method': r['Method'], 'Dim': d, 'Error': r['Error']})
            else:
                score_row = {'Method': r['Method'], 'Dim': d}
                score_row.update({m: r[m] for m in metric_columns})
                rows_scores.append(score_row)

    # Explicit columns keep the schema stable when a list is empty
    # (e.g. every method failed, or none did).
    df_scores = pd.DataFrame(rows_scores, columns=score_columns)
    df_errors = pd.DataFrame(rows_errors, columns=error_columns)
    return df_scores, df_errors

def avg_metric_by_method(df_scores, metric='R2'):
    """Return the per-method mean of `metric`, sorted from largest to smallest."""
    per_method = df_scores.groupby('Method')[metric]
    averages = per_method.mean()
    return averages.sort_values(ascending=False)



# Plot trends per metric using DataFrame

def plot_metric_trends_df(df_scores, title_prefix=""):
    """Draw one figure per entry of METRICS, with one line per method over Dim.

    Prints a message and returns without plotting when df_scores is empty.
    Frobenius, Spectral and MSE plots switch to a log y-axis when the axis
    limits are positive and span more than a factor of 10.
    """
    if df_scores.empty:
        print("No successful results to plot.")
        return

    plt.style.use('seaborn-v0_8-whitegrid')
    figure_size = (10, 6)
    log_candidates = ('Frobenius', 'Spectral', 'MSE')

    for metric in METRICS:
        fig, ax = plt.subplots(figsize=figure_size)

        # One curve per method, ordered by reduced dimension.
        for method_name, method_rows in df_scores.groupby('Method'):
            ordered = method_rows.sort_values('Dim')
            ax.plot(
                ordered['Dim'],
                ordered[metric],
                marker='o',
                linewidth=2,
                markersize=6,
                label=method_name,
            )

        if metric == 'R2':
            y_label = 'Reconstruction $R^2$'
        else:
            y_label = metric
        ax.set_xlabel('Reduced dimension (k)', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(f"{title_prefix}{metric} vs. dimensionality", fontsize=14)
        ax.grid(True, alpha=0.3)

        if metric in log_candidates:
            low, high = ax.get_ylim()
            if low > 0 and high / max(low, 1e-12) > 10:
                ax.set_yscale('log')

        ax.legend(ncol=2, fontsize=9, frameon=True)
        fig.tight_layout()
        plt.show()


# Rankings as DataFrames

def rankings_df(df_scores):
    """Build ``{dim: {metric: ranking_df}}`` from the score table.

    Each ranking DataFrame has columns Rank, Method and the metric, with
    rank 1 being the best method: highest value for R2, lowest value for
    every other metric. Only dims present in df_scores appear as keys.
    """
    rankings = {}
    for dim, dim_rows in df_scores.groupby('Dim'):
        per_metric = {}
        for metric in METRICS:
            lower_is_better = metric != 'R2'
            method_means = dim_rows[['Method', metric]].groupby('Method').mean()
            ranked = method_means.sort_values(by=metric, ascending=lower_is_better)
            ranked = ranked.reset_index()
            ranked['Rank'] = np.arange(1, len(ranked) + 1)
            per_metric[metric] = ranked[['Rank', 'Method', metric]]
        rankings[dim] = per_metric
    return rankings


### Data generators

In [4]:

def generate_linear_data(n_samples=2000, n_features=12, true_dim=5):
    """Generate centered data that is a noisy linear mix of Gaussian latents.

    The matrix has ``n_features - 3`` linear mixtures of the latents, two
    columns that are noisy linear combinations of those mixtures, and one
    pure-noise column. Draws from the global NumPy RNG. With the default
    arguments the random draws and the output match the earlier hard-coded
    version (9 mixtures, latent scales 3.0, 2.5, 2.0, 1.5, 1.0).

    Args:
        n_samples: Number of rows.
        n_features: Total number of columns; must be at least 9 because the
            derived columns read mixture columns 0, 1, 3 and 5.
        true_dim: Number of latent factors (at least 1). Their scales are
            spaced evenly from 3.0 down to 1.0.

    Returns:
        (Xc, mu, S): the centered data, the column means that were removed,
        and the latent factors.

    Raises:
        ValueError: If n_features < 9 or true_dim < 1.
    """
    n_mix = n_features - 3
    if n_mix < 6:
        raise ValueError("n_features must be at least 9")
    if true_dim < 1:
        raise ValueError("true_dim must be at least 1")

    # linspace(3, 1, 5) is exactly [3.0, 2.5, 2.0, 1.5, 1.0], the original scales.
    scales = np.linspace(3.0, 1.0, true_dim)
    S = np.random.randn(n_samples, true_dim) * scales
    B = np.random.randn(true_dim, n_mix)
    Y = S @ B
    Y += 0.05 * np.random.randn(n_samples, n_mix)
    # Two correlated columns derived from the mixtures, plus one noise-only column.
    f10 = 0.6 * Y[:, 0] - 0.4 * Y[:, 3] + 0.1 * np.random.randn(n_samples)
    f11 = -0.2 * Y[:, 5] + 0.5 * Y[:, 1] + 0.1 * np.random.randn(n_samples)
    f12 = 0.5 * np.random.randn(n_samples)
    X = np.column_stack([Y, f10, f11, f12])
    X += 0.02 * np.random.randn(n_samples, X.shape[1])
    mu = X.mean(axis=0)
    Xc = X - mu
    return Xc, mu, S

def generate_nonlinear_data(n_samples=2000, n_features=12, true_dim=5):
    """Generate centered data built from non-linear functions of five latents.

    Two latents come from make_swiss_roll (the returned position `t` and the
    third coordinate, both standardized); three more are scaled Gaussians.
    Twelve feature columns are built from them, small Gaussian noise is
    added, and the column means are removed.

    Note: `n_features` and `true_dim` are accepted but never read; the
    output always has 12 columns and 5 latents.

    Returns:
        (Xc, mu, S): the centered data, the removed column means, and the
        latent matrix.
    """
    roll_points, roll_pos = make_swiss_roll(n_samples=n_samples, noise=0.05)
    roll_z = roll_points[:, 2]
    # Standardize the two swiss-roll latents (epsilon guards a zero std).
    roll_pos = (roll_pos - roll_pos.mean()) / (roll_pos.std() + 1e-12)
    roll_z = (roll_z - roll_z.mean()) / (roll_z.std() + 1e-12)
    gauss_latents = np.random.randn(n_samples, 3) * np.array([2.5, 1.5, 1.0])
    S = np.column_stack([roll_pos, roll_z, gauss_latents])

    s0, s1, s2, s3, s4 = (S[:, j] for j in range(5))
    features = [
        np.sin(s0) + 0.3*s2,
        np.cos(s1) + 0.5*s3,
        s0*s1 + 0.2*s4,
        s2**2 - 0.1*s3,
        np.tanh(0.5*s0 + 0.3*s2),
        np.exp(0.05*s1) - 1.0,
        0.7*s3*np.sin(s1),
        0.6*np.cos(s0) + 0.2*s4,
        (s2+s3)**2 * 0.05,
        0.4*s0 - 0.3*s1 + 0.1*np.sin(s4),
        0.2*np.sin(s2) + 0.3*np.cos(s3),
        0.5*np.random.randn(n_samples),  # noise-only column
    ]
    X = np.column_stack(features)
    X += 0.02 * np.random.randn(n_samples, X.shape[1])
    mu = X.mean(axis=0)
    return X - mu, mu, S

def generate_semilinear_data(n_samples=2000, n_features=12, true_dim=5):
    """Generate centered data mixing linear and non-linear functions of latents.

    Column layout: ``n_features - 7`` linear mixtures of the latents, five
    non-linear features, one column correlated with the first two linear
    mixtures, and one pure-noise column. Draws from the global NumPy RNG.

    The earlier version always built six linear columns and therefore
    returned 13 columns for ``n_features=12``; the linear block is now sized
    so the output has exactly ``n_features`` columns.

    Args:
        n_samples: Number of rows.
        n_features: Total number of columns; at least 9 (two linear columns
            are needed for the correlated column).
        true_dim: Number of latent factors; at least 5 because the non-linear
            features read latent columns 0-4. Scales are spaced evenly from
            3.0 down to 1.0.

    Returns:
        (Xc, mu, S): the centered data, the removed column means, and the
        latent factors.

    Raises:
        ValueError: If n_features < 9 or true_dim < 5.
    """
    n_nonlinear = 5
    n_lin = n_features - n_nonlinear - 2  # minus the correlated and noise columns
    if n_lin < 2:
        raise ValueError("n_features must be at least 9")
    if true_dim < 5:
        raise ValueError("true_dim must be at least 5")

    # linspace(3, 1, 5) is exactly [3.0, 2.5, 2.0, 1.5, 1.0], the original scales.
    scales = np.linspace(3.0, 1.0, true_dim)
    S = np.random.randn(n_samples, true_dim) * scales
    B_lin = np.random.randn(true_dim, n_lin)
    Y_lin = S @ B_lin + 0.05 * np.random.randn(n_samples, n_lin)
    NL = np.column_stack([
        np.sin(S[:, 0]) + 0.2 * S[:, 2],
        np.cos(S[:, 1]) + 0.3 * S[:, 3],
        S[:, 0] * S[:, 1],
        np.tanh(0.5 * S[:, 4]),
        (S[:, 2] + S[:, 3]) ** 2 * 0.05,
    ])
    corr = Y_lin[:, 0] - 0.4 * Y_lin[:, 1] + 0.1 * np.random.randn(n_samples)
    noise = 0.5 * np.random.randn(n_samples)
    X = np.column_stack([Y_lin, NL, corr, noise])
    X += 0.02 * np.random.randn(n_samples, X.shape[1])
    mu = X.mean(axis=0)
    Xc = X - mu
    return Xc, mu, S



### Reconstructions

In [5]:

def best_linear_reconstruction(Z, Xc):
    """Reconstruct Xc from the embedding Z with a linear decoder (no intercept).

    The decoder is pinv(Z) @ Xc; the return value is Z times that decoder.
    """
    decoder = pinv(Z) @ Xc
    Xhat = Z @ decoder
    return Xhat

# Linear methods (10)

def reconstruct_with_pca(Xc, k):
    """Fit a k-component PCA on Xc and return its inverse-transformed scores."""
    pca = PCA(n_components=k, random_state=42)
    scores = pca.fit_transform(Xc)
    Xhat = pca.inverse_transform(scores)
    return Xhat

def reconstruct_with_ipca(Xc, k):
    """Fit a k-component IncrementalPCA on Xc and return the round-trip reconstruction."""
    ipca = IncrementalPCA(n_components=k)
    ipca.fit(Xc)
    scores = ipca.transform(Xc)
    Xhat = ipca.inverse_transform(scores)
    return Xhat

def reconstruct_with_tsvd(Xc, k):
    """Reconstruct Xc from a k-component TruncatedSVD.

    Uses the estimator's inverse_transform when it has one; otherwise
    multiplies the scores by components_.
    """
    svd = TruncatedSVD(n_components=k, random_state=42)
    scores = svd.fit_transform(Xc)
    if not hasattr(svd, 'inverse_transform'):
        return scores @ svd.components_
    return svd.inverse_transform(scores)

def reconstruct_with_fa(Xc, k):
    """Reconstruct Xc from a k-factor FactorAnalysis model.

    Uses the estimator's inverse_transform when it has one; otherwise
    multiplies the factor scores by components_.
    """
    fa = FactorAnalysis(n_components=k, random_state=42, max_iter=1000)
    scores = fa.fit_transform(Xc)
    if not hasattr(fa, 'inverse_transform'):
        return scores @ fa.components_
    return fa.inverse_transform(scores)

def reconstruct_with_ica(Xc, k):
    """Fit a k-component FastICA on Xc and return the inverse-transformed sources."""
    ica = FastICA(
        n_components=k,
        random_state=42,
        max_iter=3000,
        tol=5e-4,
        algorithm='parallel',
        whiten='unit-variance',
    )
    sources = ica.fit_transform(Xc)
    Xhat = ica.inverse_transform(sources)
    return Xhat

def reconstruct_with_spca(Xc, k):
    """Fit a k-component SparsePCA on Xc and decode the codes with components_."""
    spca = SparsePCA(
        n_components=k,
        random_state=42,
        alpha=1e-3,
        max_iter=1000,
    )
    codes = spca.fit_transform(Xc)
    Xhat = codes @ spca.components_
    return Xhat

def reconstruct_with_mbspca(Xc, k):
    """Fit a k-component MiniBatchSparsePCA on Xc and decode the codes with components_."""
    mbspca = MiniBatchSparsePCA(
        n_components=k,
        random_state=42,
        alpha=1e-3,
        max_iter=1000,
    )
    codes = mbspca.fit_transform(Xc)
    Xhat = codes @ mbspca.components_
    return Xhat

def reconstruct_with_dictlearn(Xc, k):
    """Learn a k-atom dictionary on Xc and decode the codes with components_."""
    learner = DictionaryLearning(n_components=k, random_state=42, max_iter=1000)
    codes = learner.fit_transform(Xc)
    Xhat = codes @ learner.components_
    return Xhat

def reconstruct_with_grp(Xc, k):
    """Project Xc to k dims with GaussianRandomProjection and map back.

    The decoder is the pseudo-inverse of the transposed projection matrix.
    """
    projector = GaussianRandomProjection(n_components=k, random_state=42)
    projected = projector.fit_transform(Xc)
    decoder = pinv(projector.components_.T)
    return projected @ decoder

def reconstruct_with_srp(Xc, k):
    """Project Xc to k dims with SparseRandomProjection and map back.

    The decoder is the pseudo-inverse of the transposed projection matrix.
    The matrix is converted to a dense array first when it exposes
    ``toarray`` (SparseRandomProjection documents components_ as a sparse
    matrix), because numpy.linalg.pinv only accepts dense arrays.
    """
    model = SparseRandomProjection(n_components=k, random_state=42)
    Z = model.fit_transform(Xc)
    W = model.components_
    if hasattr(W, 'toarray'):
        W = W.toarray()
    A = pinv(W.T)
    return Z @ A

# Registry of linear methods: display name -> reconstruction function fn(Xc, k).
# evaluate_methods iterates the dict it is given, so this insertion order is
# the order in which the methods are run.
LINEAR_METHODS = {
    'PCA': reconstruct_with_pca,
    'IncrementalPCA': reconstruct_with_ipca,
    'TruncatedSVD': reconstruct_with_tsvd,
    'FactorAnalysis': reconstruct_with_fa,
    'FastICA': reconstruct_with_ica,
    'SparsePCA': reconstruct_with_spca,
    'MiniBatchSparsePCA': reconstruct_with_mbspca,
    'DictionaryLearning': reconstruct_with_dictlearn,
    'GaussianRP': reconstruct_with_grp,
    'SparseRP': reconstruct_with_srp,
}

# Non-linear methods (5)

def reconstruct_with_tsne(Xc, k):
    """Embed Xc into k dims with t-SNE and reconstruct with a linear decoder.

    Barnes-Hut is used for k <= 3 and the exact method otherwise. The
    iteration budget (1500) is passed as ``max_iter`` when the installed TSNE
    accepts that keyword and as ``n_iter`` otherwise, so the call works
    across scikit-learn versions that renamed the parameter.
    """
    import inspect

    accepted = inspect.signature(TSNE).parameters
    iter_keyword = 'max_iter' if 'max_iter' in accepted else 'n_iter'

    tsne_kwargs = dict(n_components=k, random_state=42, perplexity=30, init='pca', learning_rate='auto', metric='euclidean')
    tsne_kwargs[iter_keyword] = 1500
    if k <= 3:
        tsne_kwargs.update(method='barnes_hut', angle=0.5)
    else:
        tsne_kwargs.update(method='exact')
    Z = TSNE(**tsne_kwargs).fit_transform(Xc)
    return best_linear_reconstruction(Z, Xc)

def reconstruct_with_isomap(Xc, k):
    """Embed Xc into k dims with Isomap (12 neighbors) and decode linearly."""
    embedder = Isomap(n_neighbors=12, n_components=k)
    embedding = embedder.fit_transform(Xc)
    return best_linear_reconstruction(embedding, Xc)

def reconstruct_with_lle(Xc, k):
    """Embed Xc into k dims with standard LLE (12 neighbors) and decode linearly."""
    embedder = LocallyLinearEmbedding(
        n_neighbors=12,
        n_components=k,
        method='standard',
        random_state=42,
    )
    embedding = embedder.fit_transform(Xc)
    return best_linear_reconstruction(embedding, Xc)

def reconstruct_with_kpca(Xc, k):
    """Reconstruct Xc from a k-component RBF KernelPCA.

    gamma is 1 / (number of columns of Xc). The estimator's learned
    inverse_transform is tried first; if it raises, the linear decoder from
    best_linear_reconstruction is used instead.
    """
    n_cols = Xc.shape[1]
    kpca = KernelPCA(
        n_components=k,
        kernel='rbf',
        gamma=1.0 / n_cols,
        fit_inverse_transform=True,
        alpha=1e-3,
    )
    Z = kpca.fit_transform(Xc)
    try:
        return kpca.inverse_transform(Z)
    except Exception:
        return best_linear_reconstruction(Z, Xc)

def reconstruct_with_spectral(Xc, k):
    """Embed Xc into k dims with SpectralEmbedding (12 neighbors) and decode linearly."""
    embedder = SpectralEmbedding(n_components=k, n_neighbors=12, random_state=42)
    embedding = embedder.fit_transform(Xc)
    return best_linear_reconstruction(embedding, Xc)

# Registry of non-linear methods: display name -> reconstruction function fn(Xc, k).
# evaluate_methods iterates the dict it is given, so this insertion order is
# the order in which the methods are run.
NONLINEAR_METHODS = {
    'tSNE': reconstruct_with_tsne,
    'Isomap': reconstruct_with_isomap,
    'LLE': reconstruct_with_lle,
    'KernelPCA': reconstruct_with_kpca,
    'SpectralEmbedding': reconstruct_with_spectral,
}


### Evaluation utility

In [6]:
def evaluate_methods(Xc, methods_dict, target_dims=TARGET_DIMS):
    """Run every method at every target dimension and collect residual metrics.

    Args:
        Xc: Data matrix passed to each reconstruction function.
        methods_dict: Mapping of method name -> function ``fn(Xc, k)`` that
            returns a reconstruction of Xc.
        target_dims: Reduced dimensions to evaluate.

    Returns:
        ``{dim: [row, ...]}``. A successful row holds Method, Dim and the five
        metrics; if anything raised, the row holds Method, Dim and the
        exception message under 'Error'.
    """
    results = {d: [] for d in target_dims}
    for d in target_dims:
        rows_for_dim = results[d]
        for name, fn in methods_dict.items():
            # Best effort: a failing method is recorded, not propagated.
            try:
                residual = Xc - fn(Xc, d)
                record = {
                    'Method': name,
                    'Dim': d,
                    'Frobenius': frobenius_norm(residual),
                    'RelFro': relative_frobenius(residual, Xc),
                    'Spectral': spectral_norm(residual),
                    'MSE': mse(residual),
                    'R2': reconstruction_R2(residual, Xc),
                }
            except Exception as e:
                record = {'Method': name, 'Dim': d, 'Error': str(e)}
            rows_for_dim.append(record)
    return results

## Section 1: Linear DR Algorithms on Linear Data

In [None]:

# Section 1 driver: generate the linear dataset (500 samples here) and evaluate
# every linear method at each target dimension.
Xc_lin, mu_lin, S_lin = generate_linear_data(n_samples=500, n_features=12, true_dim=5)
print(f"[Section 1] Centered linear data shape: {Xc_lin.shape}")

results_linear = evaluate_methods(Xc_lin, LINEAR_METHODS, TARGET_DIMS)
# Split into successful scores and recorded failures.
df_linear_scores, df_linear_errors = results_to_df(results_linear)

print("\n[Section 1] Scores (DataFrame):")
display(df_linear_scores.sort_values(['Dim','Method']).reset_index(drop=True))

# Only show the error table when at least one method failed.
if not df_linear_errors.empty:
    print("\n[Section 1] Errors (DataFrame):")
    display(df_linear_errors)


[Section 1] Centered linear data shape: (500, 12)


### Trends and rankings

In [None]:
# Section 1: metric trend plots followed by per-dimension ranking tables.
plot_metric_trends_df(df_linear_scores, title_prefix='[Section 1] ')
# With no successful rows there is nothing to rank.
rankings_linear = rankings_df(df_linear_scores) if not df_linear_scores.empty else {}

print("\n[Section 1] Rankings per dimension (top 5 shown)")
for d in TARGET_DIMS:
    # rankings_df only creates keys for dims that have at least one score row,
    # so a dim where every method failed is skipped instead of raising KeyError.
    dim_rankings = rankings_linear.get(d)
    if dim_rankings is None:
        print(f"\n  k={d} | no successful results to rank")
        continue
    for m in METRICS:
        df_rank = dim_rankings[m]
        print(f"\n  k={d} | {m} {'(higher is better)' if m=='R2' else '(lower is better)'}")
        display(df_rank.head(5))


### Top-3 linear by average R2

In [None]:
# Average R2 across dims per linear method (sorted best first), then keep the top three.
avg_R2_lin = avg_metric_by_method(df_linear_scores, metric='R2')
TOP3_LINEAR = list(avg_R2_lin.head(3).items())  # list of (method, score)
print("\n[Section 1] Top-3 linear methods by average R2:")
display(pd.DataFrame(TOP3_LINEAR, columns=['Method','R2_avg']))

## Section 2: Non-linear DR Algorithms on Non-linear Data

In [None]:

# =============================
# Section 2: Non-linear on Non-linear Data
# Generate the non-linear dataset and evaluate every non-linear method at
# each target dimension.
# =============================
Xc_nonlin, mu_nonlin, S_nonlin = generate_nonlinear_data(n_samples=2000, n_features=12, true_dim=5)
print(f"[Section 2] Centered non-linear data shape: {Xc_nonlin.shape}")

results_nonlinear = evaluate_methods(Xc_nonlin, NONLINEAR_METHODS, TARGET_DIMS)
# Split into successful scores and recorded failures.
df_nonlinear_scores, df_nonlinear_errors = results_to_df(results_nonlinear)

print("\n[Section 2] Scores (DataFrame):")
display(df_nonlinear_scores.sort_values(['Dim','Method']).reset_index(drop=True))

# Only show the error table when at least one method failed.
if not df_nonlinear_errors.empty:
    print("\n[Section 2] Errors (DataFrame):")
    display(df_nonlinear_errors)


### Trends and rankings

In [None]:
# Section 2: metric trend plots followed by per-dimension ranking tables.
plot_metric_trends_df(df_nonlinear_scores, title_prefix='[Section 2] ')
# With no successful rows there is nothing to rank.
rankings_nonlinear = rankings_df(df_nonlinear_scores) if not df_nonlinear_scores.empty else {}

print("\n[Section 2] Rankings per dimension (top 5 shown)")
for d in TARGET_DIMS:
    # rankings_df only creates keys for dims that have at least one score row,
    # so a dim where every method failed is skipped instead of raising KeyError.
    dim_rankings = rankings_nonlinear.get(d)
    if dim_rankings is None:
        print(f"\n  k={d} | no successful results to rank")
        continue
    for m in METRICS:
        df_rank = dim_rankings[m]
        print(f"\n  k={d} | {m} {'(higher is better)' if m=='R2' else '(lower is better)'}")
        display(df_rank.head(5))

### Top-3 non-linear by average R2

In [None]:
# Average R2 across dims per non-linear method (sorted best first), then keep the top three.
avg_R2_nonlin = avg_metric_by_method(df_nonlinear_scores, metric='R2')
TOP3_NONLINEAR = list(avg_R2_nonlin.head(3).items())  # list of (method, score)
print("\n[Section 2] Top-3 non-linear methods by average R2:")
display(pd.DataFrame(TOP3_NONLINEAR, columns=['Method','R2_avg']))

## Section 3: Top performers on Semi-linear Data

In [None]:

# Section 3 driver: generate the semi-linear dataset and evaluate the top-3
# methods of Sections 1 and 2 on it.
Xc_semilin, mu_semilin, S_semilin = generate_semilinear_data(n_samples=2000, n_features=12, true_dim=5)
print(f"[Section 3] Centered semi-linear data shape: {Xc_semilin.shape}")

# Build combined dict of top-3 linear + top-3 non-linear
# (look each winner's function up by name in its registry).
LINEAR_METHODS_MAP = {name: LINEAR_METHODS[name] for name, _ in TOP3_LINEAR}
NONLINEAR_METHODS_MAP = {name: NONLINEAR_METHODS[name] for name, _ in TOP3_NONLINEAR}
COMBINED_TOP6 = {}
COMBINED_TOP6.update(LINEAR_METHODS_MAP)
COMBINED_TOP6.update(NONLINEAR_METHODS_MAP)

results_semilinear = evaluate_methods(Xc_semilin, COMBINED_TOP6, TARGET_DIMS)
# Split into successful scores and recorded failures.
df_semilinear_scores, df_semilinear_errors = results_to_df(results_semilinear)

print("\n[Section 3] Scores (DataFrame):")
display(df_semilinear_scores.sort_values(['Dim','Method']).reset_index(drop=True))

# Only show the error table when at least one method failed.
if not df_semilinear_errors.empty:
    print("\n[Section 3] Errors (DataFrame):")
    display(df_semilinear_errors)

# Section 3: metric trend plots followed by per-dimension ranking tables.
plot_metric_trends_df(df_semilinear_scores, title_prefix='[Section 3] ')
# With no successful rows there is nothing to rank.
rankings_semilinear = rankings_df(df_semilinear_scores) if not df_semilinear_scores.empty else {}

print("\n[Section 3] Rankings per dimension (top 5 shown)")
for d in TARGET_DIMS:
    # rankings_df only creates keys for dims that have at least one score row,
    # so a dim where every method failed is skipped instead of raising KeyError.
    dim_rankings = rankings_semilinear.get(d)
    if dim_rankings is None:
        print(f"\n  k={d} | no successful results to rank")
        continue
    for m in METRICS:
        df_rank = dim_rankings[m]
        print(f"\n  k={d} | {m} {'(higher is better)' if m=='R2' else '(lower is better)'}")
        display(df_rank.head(5))


## Summary Dashboard

### Average metrics per method (by section)

In [None]:

# Per-section dashboard tables: mean of every metric across dims for each
# method, ordered by average R2 (best first).
summary_linear = df_linear_scores.groupby('Method')[METRICS].mean().sort_values('R2', ascending=False)
summary_nonlinear = df_nonlinear_scores.groupby('Method')[METRICS].mean().sort_values('R2', ascending=False)
summary_semilinear = df_semilinear_scores.groupby('Method')[METRICS].mean().sort_values('R2', ascending=False)

print("[Dashboard] Average metrics per method — Linear section")
display(summary_linear)
print("\n[Dashboard] Average metrics per method — Non-linear section")
display(summary_nonlinear)
print("\n[Dashboard] Average metrics per method — Semi-linear section (Top-6 only)")
display(summary_semilinear)

### Consolidated Top performers table

In [None]:

# Tag each top-3 list with its section and stack them into a single table.
top3_lin_df = pd.DataFrame(TOP3_LINEAR, columns=['Method','R2_avg']).assign(Section='Linear')
top3_nonlin_df = pd.DataFrame(TOP3_NONLINEAR, columns=['Method','R2_avg']).assign(Section='Non-linear')
consolidated_top = pd.concat([top3_lin_df, top3_nonlin_df], ignore_index=True)
# Section ascending, then best average R2 first within each section.
consolidated_top = consolidated_top.sort_values(['Section','R2_avg'], ascending=[True, False])
print("\n[Dashboard] Consolidated Top-3 (Linear & Non-linear)")
display(consolidated_top)

### Bar chart comparison of R2_avg for top-3 of each section

In [None]:

# Bar chart of the average R2 of each section's top-3 methods; one bar group
# (and one legend entry) per section.
plt.figure(figsize=(8,5))
for sect_name, sect_df in (("Linear", top3_lin_df), ("Non-linear", top3_nonlin_df)):
    # Prefix the method with its section so each tick label names both.
    plt.bar([f"{sect_name}\n{m}" for m in sect_df['Method']], sect_df['R2_avg'], label=sect_name)
plt.ylabel('Average R2 (across k)')
plt.title('Top-3 R2 averages: Linear vs Non-linear sections')
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()


### Heatmap-like table: methods vs dims for R2 (semi-linear top-6)

In [None]:

# Methods as rows, reduced dimensions as columns, mean R2 in each cell.
pivot_semilin = df_semilinear_scores.pivot_table(index='Method', columns='Dim', values='R2', aggfunc='mean')
print("\n[Dashboard] Semi-linear R2 by method and dimension (Top-6)")
display(pivot_semilin)



## Notes
- Tables are presented as **pandas DataFrames** for clarity and downstream export.
- Trend plots and rankings are computed from the DataFrames.
- t‑SNE uses **Barnes–Hut** for k ≤ 3 and **Exact** for k ≥ 4 to satisfy algorithmic constraints.
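- Reconstruction depends on the method: PCA, IncrementalPCA, TruncatedSVD, FastICA and KernelPCA use the estimator's own `inverse_transform` (FactorAnalysis too when available, otherwise `Z @ components_`); SparsePCA, MiniBatchSparsePCA and DictionaryLearning decode with `Z @ components_`; the random projections use the pseudo-inverse of the projection matrix; and the manifold embeddings (t‑SNE, Isomap, LLE, SpectralEmbedding) use a **least‑squares linear decoder** from embedding to data space.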
- Metrics are simple matrix/vector quantities (norms, MSE, R²) on **centered** data.
