In [None]:
import plotly.express as px
from umap import UMAP
import pandas as pd
import numpy as np


# Kaggle MNIST UMAP Visualization
# This script demonstrates how to use UMAP for dimensionality reduction on the Kaggle MNIST datase

# Load Kaggle MNIST dataset (adjust path as needed)
df = pd.read_csv('train.csv')  # Path to Kaggle's train.csv
y = df['label'].values  # Labels (0-9)
X = df.drop('label', axis=1).values  # 784 features (28x28 pixels)

# Apply UMAP to reduce from 784 dimensions to 2
umap_2d = UMAP(n_components=2, random_state=42,n_epochs=600, verbose=True)
projections = umap_2d.fit_transform(X)

# Create DataFrame for Plotly
df_plot = pd.DataFrame(projections, columns=['UMAP1', 'UMAP2'])
df_plot['digit'] = y.astype(str)  # Convert labels to strings for coloring

fig = px.scatter(
    df_plot,
    x='UMAP1',
    y='UMAP2',
    color='digit',
    title='UMAP Projection of Kaggle MNIST (784D to 2D)',
    labels={'color': 'Digit'},
    width=800,
    height=600
)

# Show the plot
fig.show()

df.head()

df_plot.head()

  from .autonotebook import tqdm as notebook_tqdm
  warn(


UMAP(n_epochs=600, n_jobs=1, random_state=42, verbose=True)
Wed Jun 25 18:45:09 2025 Construct fuzzy simplicial set
Wed Jun 25 18:45:09 2025 Finding Nearest Neighbors
Wed Jun 25 18:45:09 2025 Building RP forest with 15 trees
Wed Jun 25 18:45:30 2025 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	Stopping threshold met -- exiting after 4 iterations
Wed Jun 25 18:46:16 2025 Finished Nearest Neighbor Search
Wed Jun 25 18:46:25 2025 Construct embedding


Epochs completed:   0%|            3/600 [00:01]

	completed  0  /  600 epochs


Epochs completed:  10%| █          62/600 [00:10]

	completed  60  /  600 epochs


Epochs completed:  20%| ██         122/600 [00:19]

	completed  120  /  600 epochs


Epochs completed:  30%| ███        182/600 [00:27]

	completed  180  /  600 epochs


Epochs completed:  40%| ████       243/600 [00:36]

	completed  240  /  600 epochs


Epochs completed:  50%| █████      302/600 [00:45]

	completed  300  /  600 epochs


Epochs completed:  60%| ██████     363/600 [00:53]

	completed  360  /  600 epochs


Epochs completed:  70%| ███████    422/600 [01:02]

	completed  420  /  600 epochs


Epochs completed:  80%| ████████   482/600 [01:10]

	completed  480  /  600 epochs


Epochs completed:  91%| █████████  544/600 [01:20]

	completed  540  /  600 epochs


Epochs completed: 100%| ██████████ 600/600 [01:27]


Wed Jun 25 18:47:57 2025 Finished embedding


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)
# Predict on the test set
y_pred = clf.predict(x_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Classifier Accuracy: {accuracy:.2f}")

xplot_train, xplot_test, yplot_train, yplot_test = train_test_split(df_plot[['UMAP1','UMAP2']], df_plot['digit'], test_size=0.2, random_state=42)
# Train a Decision Tree Classifier on UMAP projections
clf_umap = DecisionTreeClassifier(random_state=42)
clf_umap.fit(xplot_train, yplot_train)
# Predict on the test set of UMAP projections
yplot_pred = clf_umap.predict(xplot_test)
# Calculate accuracy for UMAP projections
accuracy_umap = accuracy_score(yplot_test, yplot_pred)
print(f"Decision Tree Classifier Accuracy on UMAP Projections: {accuracy_umap:.2f}")

print(df.shape)


Decision Tree Classifier Accuracy: 0.85
Decision Tree Classifier Accuracy on UMAP Projections: 0.94
(42000, 785)


In [2]:
df_plot.head()

Unnamed: 0,UMAP1,UMAP2,digit
0,-3.81347,8.009191,1
1,15.751199,1.848397,0
2,-3.854583,13.006274,1
3,-0.562138,-2.867821,4
4,16.135321,1.686615,0
