In [1]:
import time
import copy
import numpy as np
import pandas as pd
import seaborn as sn
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.manifold import TSNE, LocallyLinearEmbedding, Isomap, MDS, SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

from models import *
from utils import *
from datasets import *

# generate z

In [2]:
# from torch.utils.data import DataLoader, TensorDataset

# # data preprocessing
# transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
# trainTransform  = tv.transforms.Compose([tv.transforms.ToTensor(), tv.transforms.Normalize((0.1307,), (0.3081,))])
# trainset = tv.datasets.MNIST(root='./data',  train=True, download=False, transform=transform)
# testset = tv.datasets.MNIST(root='./data',  train=False, download=False, transform=transform)

# X_train = [i[0].reshape(-1) for i in trainset]
# X_train = torch.vstack(X_train)
# y_train = torch.tensor([i[1] for i in trainset])
# print("X_train.shape: {}".format(X_train.shape))        # X_train.shape: torch.Size([60000, 784])

# X_test = [i[0].reshape(-1) for i in testset]
# X_test = torch.vstack(X_test)
# y_test = torch.tensor([i[1] for i in testset])
# print("X_test.shape: {}".format(X_test.shape))          # X_test.shape: torch.Size([10000, 784])

In [3]:
# np.save("./data/MNIST/X_train.npy", X_train)
# np.save("./data/MNIST/y_train.npy", y_train)
# np.save("./data/MNIST/X_test.npy", X_test)
# np.save("./data/MNIST/y_test.npy", y_test)

In [4]:
# Read the cached MNIST feature/label arrays for both splits from disk.
X_train, y_train = (
    np.load("./data/MNIST/X_train.npy"),
    np.load("./data/MNIST/y_train.npy"),
)
X_test, y_test = (
    np.load("./data/MNIST/X_test.npy"),
    np.load("./data/MNIST/y_test.npy"),
)

# Print the array shapes of each split as a quick sanity check.
train_shapes = (X_train.shape, y_train.shape)
print("X_train: {}, y_train: {}".format(*train_shapes))
test_shapes = (X_test.shape, y_test.shape)
print("X_test: {}, y_test: {}".format(*test_shapes))

X_train: (60000, 784), y_train: (60000,)
X_test: (10000, 784), y_test: (10000,)


## others

In [5]:
def generate(X, y, model_name='t-sne', n_neighbors=10):
    """Embed ``X`` into two dimensions and compute scagnostics of the result.

    Parameters
    ----------
    X : array-like
        Samples to embed; handed to the selected estimator's ``fit_transform``.
    y : array-like
        Labels belonging to ``X``. Only the ``'lda'`` branch uses them for
        fitting; they are returned unchanged in every case.
    model_name : str
        One of ``'pca'``, ``'lle_ltsa'``, ``'lle_hessian'``, ``'lle_mod'``,
        ``'isomap'``, ``'mds'``, ``'spectralembedding'`` or ``'lda'``.
        Note that the default ``'t-sne'`` is not handled by this function and
        therefore raises ``ValueError``.
    n_neighbors : int
        Neighbourhood size for the neighbour-based methods (the LLE variants,
        Isomap and spectral embedding). Ignored by ``'pca'``, ``'mds'`` and
        ``'lda'``.

    Returns
    -------
    tuple
        ``(z, y, scags)`` where ``z`` is the 2-D embedding after ``normalise``,
        ``y`` is the input labels, and ``scags`` is a ``torch`` tensor of shape
        ``(1, k)`` built from the values returned by ``scagnostics.compute``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'pca':
        pca = PCA(n_components=2)
        z = pca.fit_transform(X)
    elif model_name == 'lle_ltsa':
        lle_ltsa = LocallyLinearEmbedding(method="ltsa", n_components=2, n_neighbors=n_neighbors)
        z = lle_ltsa.fit_transform(X)
    elif model_name == 'lle_hessian':
        lle_hessian = LocallyLinearEmbedding(method="hessian", n_components=2, n_neighbors=n_neighbors)
        z = lle_hessian.fit_transform(X)
    elif model_name == 'lle_mod':
        lle_mod = LocallyLinearEmbedding(method="modified", n_components=2, n_neighbors=n_neighbors)
        z = lle_mod.fit_transform(X)
    elif model_name == 'isomap':
        # Honour the caller's neighbourhood size (it was previously pinned to 5,
        # which made every n_neighbors setting produce the same embedding).
        isomap = Isomap(n_neighbors=n_neighbors, n_components=2, p=1)
        z = isomap.fit_transform(X)
    elif model_name == 'mds':
        mds = MDS(n_components=2)
        z = mds.fit_transform(X)
    elif model_name == 'spectralembedding':
        se = SpectralEmbedding(n_components=2, n_neighbors=n_neighbors)
        z = se.fit_transform(X)
    elif model_name == 'lda':
        # Fit on the arguments, not on the notebook-level X_train / y_train,
        # so the function works for whatever data the caller passes in.
        lda = LinearDiscriminantAnalysis(n_components=2)
        z = lda.fit_transform(X, y)
    else:
        # Raise instead of calling exit(): terminating the interpreter/kernel
        # from a helper loses all notebook state and cannot be handled.
        raise ValueError("wrong model name: {!r}".format(model_name))

    # `normalise` and `scagnostics` are not defined in this cell; presumably
    # they come from the star-imports at the top of the notebook — TODO confirm.
    z = normalise(z=z)

    # Scagnostics of the two embedding axes, packed as a single-row tensor.
    scags = scagnostics.compute(z[:, 0], z[:, 1])
    scags = torch.tensor([list(scags.values())]).view(1, -1)

    return z, y, scags

In [6]:
# Embedding methods to sweep, and the neighbourhood sizes tried for each.
model_names = ['pca', 'lle_ltsa', 'lle_hessian', 'lle_mod', 'isomap', 'mds', 'spectralembedding', 'lda']
n_neighbors = [5, 10, 15]

# Running index used as the output file name; advanced only after a
# successful embedding has been written.
cnt = 0
for model_name in model_names:
    for neighbor in n_neighbors:
        print("{}, {}".format(model_name, neighbor))
        try:
            z, y, scags = generate(X_train, y_train, model_name=model_name, n_neighbors=neighbor)
        except Exception as exc:
            # Best-effort sweep: skip combinations the estimator rejects, but
            # report why. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt working so a long run can still be stopped.
            print("wrong param: {!r}".format(exc))
            continue

        # Saving happens outside the try so that an I/O problem is not
        # misreported as a bad parameter combination.
        torch.save([model_name, neighbor, z, y, scags], "./data/pretraining/{}.pt".format(cnt))
        cnt += 1

pca, 5
pca, 10
pca, 15
lle_ltsa, 5
wrong param
lle_ltsa, 10


## t-SNE

## UMAP

# pretraining Metric-MLP

# pretraining HM