# D&C on the MNIST dataset

Comparison of SMACOF, Isomap, Local MDS and t-SNE, each combined with D&C, on the MNIST training images whose label is a digit.

In [5]:
import sys
import os

# Put the parent of the current working directory at the front of sys.path
# (only if it is not already listed) and echo the resolved path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
project_root

'/Users/airdac/Documents/Uni/Second/TFM/TFM_Adria/code'

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.colors as mcolors
import pickle
import pyreadr
from scipy.stats import gaussian_kde

# Seed NumPy's global random generator so later np.random calls are repeatable.
np.random.seed(42)

In [16]:
# Read the R workspace that holds the MNIST data (MNIST.RData)
rdata = pyreadr.read_r(os.path.join(project_root, 'd_and_c', 'MNIST.RData'))

all_data_pixels = rdata["all_data_pixels"]
# np.squeeze removes length-1 axes from the labels and the split flags
# (presumably both are stored as single-column tables -- confirm against the .RData file)
target = np.squeeze(np.array(rdata["target"]))
type_data = np.squeeze(np.array(rdata["type_data"]))

# Keep the training rows whose label parses as a number: labels that cannot be
# parsed become NaN under errors="coerce" and are excluded by the mask
numeric_target = pd.to_numeric(target, errors="coerce")
train_idx = pd.notna(numeric_target) & (type_data == "train")

MNIST_target = numeric_target[train_idx]
MNIST_pixels = all_data_pixels.loc[train_idx].values

print(f"MNIST_pixels shape: {MNIST_pixels.shape}")
print(f"MNIST_target shape: {MNIST_target.shape}")

MNIST_pixels shape: (345035, 784)
MNIST_target shape: (345035,)


In [17]:
# Sorted distinct labels of the selected training subset; the per-digit loops below iterate over them.
digits = np.unique(MNIST_target)

Isomap

In [9]:
# Read the pickled D&C Isomap result and pull out its "embedding" and "runtime" entries
Isomap_path = os.path.join(
    project_root, 'd_and_c', 'Isomap', 'pickles', 'MNIST_train_num_DC_Isomap.pkl'
)
with open(Isomap_path, "rb") as f:
    bare_data = pickle.load(f)

Isomap_embedding, Isomap_runtime = bare_data["embedding"], bare_data["runtime"]

In [18]:
# Fit one Gaussian KDE per digit on that digit's rows of the Isomap embedding.
# gaussian_kde expects one row per dimension and one column per point, hence the transpose.
Isomap_densities = {
    int(digit): gaussian_kde(Isomap_embedding[MNIST_target == digit].T)
    for digit in digits
}

Local MDS

In [19]:
# Load the pickled D&C Local MDS result; the pickle is read as a mapping with
# "embedding" and "runtime" entries.
LMDS_path = os.path.join(project_root, 'd_and_c', 'LMDS', 'pickles', 'MNIST_train_num_DC_LMDS.pkl')
# Open LMDS_path, not Isomap_path: reading the Isomap pickle here would make the
# Local MDS variables silently hold the Isomap results.
with open(LMDS_path, "rb") as f:
    bare_data = pickle.load(f)
    LMDS_embedding = bare_data["embedding"]
    LMDS_runtime = bare_data["runtime"]

In [20]:
# Fit one Gaussian KDE per digit on that digit's rows of the Local MDS embedding.
# gaussian_kde expects one row per dimension and one column per point, hence the transpose.
LMDS_densities = {
    int(digit): gaussian_kde(LMDS_embedding[MNIST_target == digit].T)
    for digit in digits
}

SMACOF

In [21]:
# Read the pickled D&C SMACOF result and pull out its "embedding" and "runtime" entries
SMACOF_path = os.path.join(
    project_root, 'd_and_c', 'SMACOF', 'pickles', 'MNIST_train_num_DC_SMACOF.pkl'
)
with open(SMACOF_path, "rb") as f:
    bare_data = pickle.load(f)

SMACOF_embedding, SMACOF_runtime = bare_data["embedding"], bare_data["runtime"]

In [22]:
# Fit one Gaussian KDE per digit on that digit's rows of the SMACOF embedding.
# gaussian_kde expects one row per dimension and one column per point, hence the transpose.
SMACOF_densities = {
    int(digit): gaussian_kde(SMACOF_embedding[MNIST_target == digit].T)
    for digit in digits
}

t-SNE

In [24]:
# Read the pickled D&C t-SNE result and pull out its "embedding" and "runtime" entries
tSNE_path = os.path.join(
    project_root, 'd_and_c', 't-SNE', 'pickles', 'MNIST_train_num_DC_tSNE.pkl'
)
with open(tSNE_path, "rb") as f:
    bare_data = pickle.load(f)

tSNE_embedding, tSNE_runtime = bare_data["embedding"], bare_data["runtime"]

In [25]:
# Fit one Gaussian KDE per digit on that digit's rows of the t-SNE embedding.
# gaussian_kde expects one row per dimension and one column per point, hence the transpose.
tSNE_densities = {
    int(digit): gaussian_kde(tSNE_embedding[MNIST_target == digit].T)
    for digit in digits
}

Plot

In [None]:
# Contour plot of the per-digit density estimates on the D&C Isomap embedding

# Palette: Set1 extended with black; colours are looked up by digit position
set1_colors = [*plt.get_cmap('Set1').colors, (0, 0, 0)]
cmap = mcolors.ListedColormap(set1_colors)

# Evaluation grid: 200 x 200 points over the embedding's bounding box,
# widened by 5% of the range on every side
embedding_x, embedding_y = Isomap_embedding[:, 0], Isomap_embedding[:, 1]
xpad = 0.05 * (embedding_x.max() - embedding_x.min())
ypad = 0.05 * (embedding_y.max() - embedding_y.min())
xmin = embedding_x.min() - xpad
xmax = embedding_x.max() + xpad
ymin = embedding_y.min() - ypad
ymax = embedding_y.max() + ypad
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 200),
                     np.linspace(ymin, ymax, 200))

fig, ax = plt.subplots(figsize=(7.5, 7))
legend_handles = []
for i, digit in enumerate(digits):
    color = cmap(i)
    # Evaluate this digit's KDE on the flattened grid, then restore the grid shape
    kde = Isomap_densities[int(digit)]
    zz = kde(np.vstack([xx.ravel(), yy.ravel()])).reshape(xx.shape)
    ax.contour(xx, yy, zz, levels=1, colors=[color], linewidths=2)
    # Line-less marker used only as the legend entry for this digit's colour
    legend_handles.append(
        mlines.Line2D([], [], linestyle='None', marker='o', markersize=7,
                      markerfacecolor=color, markeredgewidth=0,
                      label=int(digit))
    )
ax.axis('equal')
# Anchor the legend just outside the upper-right corner of the axes
ax.legend(handles=legend_handles, title="Digit",
          bbox_to_anchor=(1.01, 1.01), loc='upper left')
fig.suptitle(
    f"D&C Isomap on the {MNIST_target.shape[0]}-points numeric training dataset of MNIST\nBivariate density estimation per digit\nl={1000}, c_points={100}, runtime: {Isomap_runtime:.2f} s")
plt.tight_layout()