In [5]:
"""
mnist_pca_tsne.py

Usage:
    - Put your local MNIST CSV file path in --csv or pass it on the command line.
    - The script will:
        1. load the CSV (attempt to detect a label column)
        2. scale the features (optional)
        3. run PCA (default 50 components)
        4. run t-SNE (2D) on the PCA result
        5. plot and save the 2D embedding and export embedding CSV

Notes:
    - The code tries to be robust: if a "label" column exists it will be used for coloring the plot.
    - If no explicit label column is found, it will check whether the first column looks like labels (integers, few unique values).

Dependencies:
    numpy, pandas, scikit-learn, matplotlib

Example:
    python mnist_pca_tsne.py --csv /path/to/mnist.csv --n_pca 50 --perplexity 30
"""

import argparse
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def load_mnist_csv(path):
    """Load CSV and try to detect label column.
    Returns: X (n_samples, n_features), labels (n_samples) or None
    """
    df = pd.read_csv(path)

    # Detect label column
    label_col = None
    for candidate in ("label", "y", "target", "class"):
        if candidate in df.columns:
            label_col = candidate
            break

    if label_col is None:
        # Heuristic: if first column is integer and has <= 20 unique values, treat it as label
        first_col = df.columns[0]
        if pd.api.types.is_integer_dtype(df[first_col]) or pd.api.types.is_object_dtype(df[first_col]):
            n_unique = df[first_col].nunique()
            if n_unique <= 20:
                label_col = first_col

    if label_col is not None:
        labels = df[label_col].values
        X = df.drop(columns=[label_col]).values
    else:
        labels = None
        X = df.values

    return X, labels


def run_pipeline(csv_path, out_dir, n_pca=50, standardize=True, tsne_perplexity=30, tsne_lr=200.0, tsne_iter=1000, random_state=42):
    os.makedirs(out_dir, exist_ok=True)

    print(f"Loading: {csv_path}")
    X, labels = load_mnist_csv(csv_path)
    n_samples, n_features = X.shape
    print(f"Data shape: {X.shape}; labels: {'present' if labels is not None else 'none'}")

    # Optional standardization: many workflows use StandardScaler before PCA
    if standardize:
        print("Standardizing features (Zero mean, unit variance)")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        print("Skipping standardization (using raw pixel values)")
        X_scaled = X.astype(float)

    # PCA
    if n_pca is not None and n_pca > 0 and n_pca < n_features:
        print(f"Running PCA -> {n_pca} components")
        pca = PCA(n_components=n_pca, svd_solver='randomized', random_state=random_state)
        X_pca = pca.fit_transform(X_scaled)
        # save explained variance ratio plot
        evr = pca.explained_variance_ratio_
        cum_evr = np.cumsum(evr)
        evr_path = os.path.join(out_dir, 'pca_explained_variance_ratio.png')
        plt.figure(figsize=(6,4))
        plt.plot(np.arange(1, len(evr)+1), cum_evr, marker='o')
        plt.xlabel('Number of PCA components')
        plt.ylabel('Cumulative explained variance')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(evr_path)
        plt.close()
        print(f"Saved PCA cumulative explained variance plot -> {evr_path}")
    else:
        print("Skipping PCA; using original features for t-SNE (not recommended for very high-dimensional data)")
        X_pca = X_scaled

    # t-SNE (2D)
    print(f"Running t-SNE (perplexity={tsne_perplexity}, lr={tsne_lr}, iterations={tsne_iter})")
    tsne = TSNE(n_components=2, perplexity=tsne_perplexity, learning_rate=tsne_lr, n_iter=tsne_iter, random_state=random_state, init='pca')
    X_tsne = tsne.fit_transform(X_pca)

    # Save embedding to CSV
    emb_df = pd.DataFrame(X_tsne, columns=['tsne1','tsne2'])
    if labels is not None:
        emb_df['label'] = labels
    emb_csv = os.path.join(out_dir, 'mnist_tsne_embedding.csv')
    emb_df.to_csv(emb_csv, index=False)
    print(f"Saved t-SNE embedding -> {emb_csv}")

    # Plot
    plt.figure(figsize=(8,6))
    if labels is not None:
        # color by label
        unique_labels = np.unique(labels)
        n_labels = len(unique_labels)
        for lab in unique_labels:
            mask = (labels == lab)
            plt.scatter(X_tsne[mask,0], X_tsne[mask,1], label=str(lab), s=6, alpha=0.7)
        plt.legend(markerscale=3, fontsize='small', ncol=2)
    else:
        plt.scatter(X_tsne[:,0], X_tsne[:,1], s=6, alpha=0.7)

    plt.title('t-SNE of MNIST (after PCA)')
    plt.xlabel('tsne1')
    plt.ylabel('tsne2')
    plt.tight_layout()
    plot_path = os.path.join(out_dir, 'mnist_tsne_plot.png')
    plt.savefig(plot_path, dpi=150)
    plt.close()
    print(f"Saved t-SNE plot -> {plot_path}")

    print("Done.")


if __name__ == '__main__':
    # *** DIRECT EXECUTION MODE FOR VS CODE ***
    # Instead of passing command-line arguments, set your file name here:
    csv_path = "mnist_test_nolabels.csv"   # <--- put your file name here

    # Default parameters (modify as needed)
    out_dir = "mnist_tsne_output"
    n_pca = 50
    standardize = True  # set False to skip StandardScaler
    tsne_perplexity = 30.0
    tsne_lr = 200.0
    tsne_iter = 1000
    random_state = 42

    run_pipeline(
        csv_path,
        out_dir,
        n_pca=n_pca,
        standardize=standardize,
        tsne_perplexity=tsne_perplexity,
        tsne_lr=tsne_lr,
        tsne_iter=tsne_iter,
        random_state=random_state,
    )

Loading: mnist_test_nolabels.csv
Data shape: (10000, 783); labels: present
Standardizing features (Zero mean, unit variance)
Running PCA -> 50 components
Saved PCA cumulative explained variance plot -> mnist_tsne_output\pca_explained_variance_ratio.png
Running t-SNE (perplexity=30.0, lr=200.0, iterations=1000)




Saved t-SNE embedding -> mnist_tsne_output\mnist_tsne_embedding.csv
Saved t-SNE plot -> mnist_tsne_output\mnist_tsne_plot.png
Done.
