In [31]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the local project modules are picked up.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import sklearn
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import time
import sys
import math
import numpy as np
import random
import joblib
from tqdm import tqdm_notebook

import Config
import Dataloader as DL
import HD_basis as HDB
import HD_encoder as HDE
import HD_classifier as HDC

import sklearn.manifold as skm 
import sklearn.utils.random as skr # for random sampling 


In [33]:
# train data 
def train(hdc, traindata, trainlabels, testdata, testlabels, param = Config.config,
          patience = None, epsilon = 0.001):
    """Fit `hdc` epoch by epoch, evaluating on the test split after every epoch.

    Each epoch calls hdc.fit(traindata, trainlabels, param) and
    hdc.test(testdata, testlabels); both return values are recorded and
    treated as accuracies (progress lines label them "Train" / "Test").

    Args:
        hdc: classifier object exposing fit(data, labels, param) and test(data, labels).
        traindata, trainlabels: training inputs forwarded to hdc.fit.
        testdata, testlabels: evaluation inputs forwarded to hdc.test.
        param: configuration mapping; only param["epochs"] is read here, the
            whole mapping is forwarded to hdc.fit.
        patience: optional number of consecutive epochs whose test value is at
            least `epsilon` below the best seen so far before training stops.
            None (the default) disables early stopping.
        epsilon: minimum drop from the best test value that counts as a
            non-improving epoch. Only used when `patience` is set.

    Returns:
        (train_acc, test_acc) as two numpy arrays with one entry per completed epoch.
    """
    train_acc = []
    test_acc = []
    stale_epochs = 0
    for _ in tqdm_notebook(range(param["epochs"]), desc='epochs'):
        train_acc.append(hdc.fit(traindata, trainlabels, param))
        test_acc.append(hdc.test(testdata, testlabels))

        progress = "Train: %f \t \t Test: %f"%(train_acc[-1], test_acc[-1])
        converged = train_acc[-1] == 1
        periodic = len(train_acc) % 5 == 0
        # Report every 5th epoch and on convergence, but never twice for one epoch.
        if periodic or converged:
            print(progress)
        if converged:
            break

        if patience is None:
            continue
        if max(test_acc) - test_acc[-1] >= epsilon:
            stale_epochs += 1
            if stale_epochs >= patience:
                sys.stderr.write("Early stopping initiated\n")
                if not periodic:
                    print(progress)
                break
        else:
            stale_epochs = 0
    return np.asarray(train_acc), np.asarray(test_acc)

In [34]:
def dump_log(param, train_acc, test_acc, filename):
    """Persist one run's results under the path prefix `filename`.

    Writes two files:
      * filename + ".pkl": joblib dump (compressed) of (param, train_acc, test_acc).
      * filename + ".txt": one appended line with
        "<100*max train> <100*max test> <number of epochs> <1-based epoch of best test>".

    Args:
        param: configuration object stored alongside the results.
        train_acc: non-empty sequence of per-epoch training values.
        test_acc: non-empty sequence of per-epoch test values.
        filename: path prefix without extension.
    """
    # Context managers guarantee both handles are closed, even if a write fails.
    with open(filename+".pkl", "wb") as pkl_file:
        joblib.dump((param, train_acc, test_acc), pkl_file, compress=True)

    msg = str(100*max(train_acc)) + " " + str(100*max(test_acc)) + " " +\
        str(len(train_acc)) + " " + str(np.argmax(test_acc) + 1) + "\n"
    with open(filename+".txt", "a") as log_file:
        log_file.write(msg)

In [35]:
############### ISOMAP ########################
# Everyone uses a neighborhood size of 8-12.
# If your manifold is more than 2-3 dimensional, then Isomap probably won't work, 
# you would need to have a neighborhood size that is larger (more points!) so that 
# you have reasonable options to estimate your geodesic path with links to nearby neighbors.
def iso_wrapper(n_neighbors, n_components):
    """Fit Isomap on the sampled subset and embed the train/test data.

    Reads the notebook globals `manifolddata` (fit set), `traindata`,
    `testdata`, `trainlabels` and `testlabels`. The embedded data and the
    labels are also dumped to
    "./dumper/isomap_data_<n_neighbors>_<n_components>.pkl".

    Args:
        n_neighbors: neighbourhood size passed to skm.Isomap.
        n_components: embedding dimensionality passed to skm.Isomap.

    Returns:
        (trainiso, testiso, prep_time, trans_time): the embedded train and
        test data, and the fit / transform durations truncated to whole seconds.
    """
    start = time.time()
    # Keyword arguments: current scikit-learn declares these parameters keyword-only.
    isomap = skm.Isomap(n_neighbors=n_neighbors, n_components=n_components)
    isomap.fit(manifolddata)
    end = time.time()
    # Preparation time
    prep_time = int(end - start)
    print( "Prep time: ", prep_time )

    start = time.time()
    trainiso = isomap.transform(traindata)
    testiso = isomap.transform(testdata)
    end = time.time()
    # Transformation time
    trans_time = int(end-start)
    print( "Transform time: ", trans_time )

    filename = "./dumper/isomap_data_"+str(n_neighbors)+"_"+str(n_components)+".pkl"
    # Close the dump file deterministically instead of leaking the handle.
    with open(filename, "wb") as dump_file:
        joblib.dump((trainiso, trainlabels, testiso, testlabels), dump_file, compress=True)

    return trainiso, testiso, prep_time, trans_time

In [36]:
############### Modified/Hessian Locally Linear Embedding , more ########################

def lle_wrapper(n_neighbors, n_components, lle_type = "modified"):
    """Fit a Locally Linear Embedding on the sampled subset and embed train/test data.

    Reads the notebook globals `manifolddata` (fit set), `traindata`,
    `testdata`, `trainlabels` and `testlabels`. The embedded data and the
    labels are also dumped to
    "./dumper/<lle_type>_data_<n_neighbors>_<n_components>.pkl".

    Args:
        n_neighbors: neighbourhood size passed to skm.LocallyLinearEmbedding.
        n_components: embedding dimensionality passed to skm.LocallyLinearEmbedding.
        lle_type: forwarded as the `method` argument of LocallyLinearEmbedding.

    Returns:
        (trainmlle, testmlle, prep_time, trans_time): the embedded train and
        test data, and the fit / transform durations truncated to whole seconds.
    """
    start = time.time()
    # Keyword arguments: current scikit-learn declares these parameters keyword-only.
    mlle = skm.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n_components,
                                      method=lle_type)
    mlle.fit(manifolddata)
    end = time.time()
    prep_time = int(end - start)
    print( prep_time )

    start = time.time()
    trainmlle = mlle.transform(traindata)
    testmlle = mlle.transform(testdata)
    end = time.time()
    trans_time = int(end-start)
    print( trans_time )

    # The path was previously assigned to `file_name` but opened as `filename`.
    filename = "./dumper/"+lle_type+"_data_"+str(n_neighbors)+"_"+str(n_components)+".pkl"
    with open(filename, "wb") as dump_file:
        joblib.dump((trainmlle, trainlabels, testmlle, testlabels), dump_file, compress=True)

    # Return this function's own embeddings (previously the undefined/stale
    # `trainiso` / `testiso` names were returned).
    return trainmlle, testmlle, prep_time, trans_time

In [7]:
############### Spectral Embedding ########################
############### Multi-dimensional Scaling ########################
############### t-distributed Stochastic Neighbor Embedding #######
# No online learning version; has to input full data

In [8]:
# Build the project Dataloader and unpack what getParam() returns into the
# notebook-level names used by every later cell: feature count, class count,
# then the train data/labels and test data/labels.
dl = DL.Dataloader()
nFeatures, nClasses, traindata, trainlabels, testdata, testlabels = dl.getParam()

Loading dataset MNIST from MNIST
Loading train data... train data of shape (60000, 784) loaded
Loading test data...  test  data of shape (10000, 784) loaded
Data Loaded. Num of features = 784 Num of Classes = 10

In [9]:
# Data cropping: keep the first 20000 training rows and first 5000 test rows.
# The labels are cropped with the same bounds so data and labels stay aligned
# row-for-row wherever they are saved or compared together.
traindata, trainlabels = traindata[:20000], trainlabels[:20000]
testdata, testlabels = testdata[:5000], testlabels[:5000]

In [37]:
# Work on the project-wide configuration mapping (same object, not a copy)
# and record the dataset dimensions reported by the Dataloader.
param = Config.config
param.update({"nFeatures": nFeatures, "nClasses": nClasses})
print(param)

{'data_location': '../dataset/', 'directory': 'MNIST', 'dataset': 'MNIST', 'D': 500, 'vector': 'Gaussian', 'mu': 0, 'sigma': 1, 'binarize': 0, 'lr': 0.037, 'sparse': 0, 's': 0.1, 'binaryModel': 0, 'width': None, 'height': None, 'nLayers': 5, 'uniform_dim': 1, 'uniform_ker': 1, 'dArr': None, 'k': 5, 'kArr': None, 'one_shot': 0, 'data_percentages': [1.0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5], 'train_percent': 1, 'dropout': 0, 'drop_percentages': [0, 0.1, 0.2, 0.5], 'dropout_rate': 0, 'update_type': <Update_T.FULL: 1>, 'iter_per_trial': 3, 'iter_per_encoding': 5, 'epochs': 250, 'nFeatures': 784, 'nClasses': 10, 'id': '3505', 'gen_type': <Generator.Vanilla: 1>, 'manifold': 'isomap'}


In [38]:
# Run manifold learning on the data
# Random sampling for manifold 

#manifoldlist = skr.sample_without_replacement(len(traindata), 2000)
#manifolddata = np.asfarray(traindata[np.asarray(manifoldlist)])
#n_neighbors = 200
#n_components  = 25
#param["nFeatures"] = n_components
#traintrans, testtrans, prep_time, trans_time = iso_wrapper(n_neighbors, n_components)

In [39]:
# Run baseline data
# param["nFeatures"] = nFeatures
# param["manifold"] = "baseline"
# traintrans, testtrans = traindata, testdata

In [43]:
################# VANILLA #################
def vanilla_train(param):
    """Generate a Vanilla basis, encode the transformed data, and train classifiers.

    Reads the notebook globals `traintrans`, `testtrans`, `trainlabels` and
    `testlabels`. The encoded train and test sets are saved through
    HDE.saveEncoded under the basis id. A fresh HD_classifier is trained
    param["iter_per_encoding"] times on the same encoding.

    Args:
        param: configuration mapping handed to HDB.HD_basis.

    Returns:
        (train_acc, test_acc) from the last training run only.
    """
    basis_gen = HDB.HD_basis(HDB.Generator.Vanilla, param)
    basis = basis_gen.getBasis()
    bid = basis_gen.getParam()["id"]
    # From here on use the parameters as reported by the basis generator
    param = basis_gen.getParam()
    print(bid)

    encoder = HDE.HD_encoder(basis)

    def _encode_and_save(data, labels, split):
        # Encode one split and persist it under the basis id.
        encoded = encoder.encodeData(data)
        HDE.saveEncoded(encoded, labels, bid, split)
        return encoded

    # To run on the untransformed inputs, pass traindata / testdata here instead.
    trainencoded = _encode_and_save(traintrans, trainlabels, "train")
    testencoded = _encode_and_save(testtrans, testlabels, "test")

    # Should have 95%
    n_runs = param["iter_per_encoding"]
    for _ in range(n_runs):
        classifier = HDC.HD_classifier(param["D"], param["nClasses"], bid)
        train_acc, test_acc = train(classifier, trainencoded, trainlabels,
                                    testencoded, testlabels, param)

    return train_acc, test_acc


In [41]:
# Dimension tuning: override the "D" entry of the shared config. vanilla_train
# passes param["D"] as the first argument of HDC.HD_classifier
# (presumably the hypervector dimensionality; confirm against HD_classifier).
param["D"] = 500

In [None]:
for i in range(param["iter_per_trial"]):

    # Run manifold learning on the data
    # Random sampling for manifold: fit on a fresh subset of 2000 training rows
    manifoldlist = skr.sample_without_replacement(len(traindata), 2000)
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float64
    # dtype performs the same conversion on every NumPy version.
    manifolddata = np.asarray(traindata[np.asarray(manifoldlist)], dtype=np.float64)
    n_neighbors = 200
    n_components  = 25
    param["nFeatures"] = n_components
    param["manifold"] = "isomap"
    # NOTE: iso_wrapper's dump path depends only on (n_neighbors, n_components),
    # so every trial overwrites the same isomap_data pickle.
    traintrans, testtrans, prep_time, trans_time = iso_wrapper(n_neighbors, n_components)

    train_acc, test_acc = vanilla_train(param)

    # Log path: ./logfile/<manifold>_<D>_<n_components>_<n_neighbors>_<id>
    #filename = "./logfile/" + param["manifold"] + "_" +str(param["D"])+"_"+str(param["id"])
    filename = "./logfile/" + param["manifold"] + "_" +str(param["D"])+"_"+ \
        str(n_components) + "_" + str(n_neighbors) + "_" + str(param["id"])
    dump_log(param, train_acc, test_acc, filename)
    

Prep time:  18
