In [1]:
import gc
import time
import pickle
import numpy as np
import tensorflow as tf

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn import svm

In [2]:
time_checks = [10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]#, 500000, 1000000] # number of samples to be generated
sizes = [1, 2, 3, 4, 5, 6, 7, 8, 31]

# best OCSVM model
kernel = 'linear'
nu = .01
gamma = .5
clf_ocsvm = {}
for n_components in sizes:
    data = np.random.normal(size=(time_checks[0], n_components))
    clf_ocsvm[n_components] = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    clf_ocsvm[n_components].fit(data)

# best DBSCAN model
min_samples = 5
epsilon = 1.
clf_dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric='euclidean')

# data from encoder
layer = 400, 100, 40
dataset_filter = 'normal'
optimizer_name = 'SGD'
lr = 0.01
base_autoencoder_model = r'./models/{}_{}_{}_encoder_{}-model.h5'

dimensionality_reduction = ['pca', 'tsne', 'ae']
algorithm = ['ocsvm', 'dbscan']
scalability_results = {}
scalability_results_full = {}
for alg in algorithm:
    scalability_results[alg] = {}
    scalability_results_full[alg] = {}
    for dim in dimensionality_reduction:
        scalability_results[alg][dim] = {}
        for n_components in sizes:
            if n_components == 31: continue
            scalability_results[alg][dim][n_components] = {}

In [3]:
for num_samples in time_checks:
    data = np.random.normal(size=(num_samples*100, 31))
    
    start_time = time.time_ns()
    y = clf_ocsvm[31].predict(data[:num_samples])
    scalability_results_full['ocsvm'][num_samples] = time.time_ns() - start_time
    
    clf = DBSCAN(eps=epsilon, min_samples=min_samples, metric='euclidean')
    start_time = time.time_ns()
    y = clf.fit_predict(data)
    scalability_results_full['dbscan'][num_samples] = time.time_ns() - start_time
    
    for n_components in sizes:
        if n_components == 31: continue
        
        pca = PCA(n_components=n_components)
        start_time = time.time_ns()
        pca_data = pca.fit_transform(data[:num_samples])
        time_pca = time.time_ns() - start_time
        
        tsne = TSNE(n_components=n_components, verbose=0, n_iter=300, method='exact')
        start_time = time.time_ns()
        tsne_data = tsne.fit_transform(data[:num_samples])
        time_tsne = time.time_ns() - start_time
        
        layers = layer + (n_components,)
        model = tf.keras.models.load_model(base_autoencoder_model.format(dataset_filter, optimizer_name, lr, '-'.join(str(x) for x in layers)))
        output_layer = len(model.layers) // 2
        encoder = tf.keras.Model(inputs=model.input, outputs=model.layers[output_layer-1].output)
#         encoder.summary()
        start_time = time.time_ns()
        encoded_data = encoder.predict(data[:num_samples])
        time_encoder = time.time_ns() - start_time
        
        start_time = time.time_ns()
        y_pred = clf_ocsvm[n_components].predict(pca_data)
        scalability_results['ocsvm']['pca'][n_components][num_samples] = time.time_ns() - start_time + time_pca
        
        start_time = time.time_ns()
        y_pred = clf_ocsvm[n_components].predict(tsne_data)
        scalability_results['ocsvm']['tsne'][n_components][num_samples] = time.time_ns() - start_time + time_tsne
        
        start_time = time.time_ns()
        y_pred = clf_ocsvm[n_components].predict(encoded_data)
        scalability_results['ocsvm']['ae'][n_components][num_samples] = time.time_ns() - start_time + time_encoder
        
        # DBSCAN
        start_time = time.time_ns()
        pca = PCA(n_components=n_components)
        pca_data = pca.fit_transform(data)
        time_pca = time.time_ns() - start_time
        
        start_time = time.time_ns()
        tsne = TSNE(n_components=n_components, verbose=0, n_iter=300, method='exact')
        tsne_data = tsne.fit_transform(data)
        time_tsne = time.time_ns() - start_time
        
        layers = layer + (n_components,)
        model = tf.keras.models.load_model(base_autoencoder_model.format(dataset_filter, optimizer_name, lr, '-'.join(str(x) for x in layers)))
        output_layer = len(model.layers) // 2
        encoder = tf.keras.Model(inputs=model.input, outputs=model.layers[output_layer-1].output)
#         encoder.summary()
        start_time = time.time_ns()
        encoded_data = encoder.predict(data)
        time_encoder = time.time_ns() - start_time
        
        clf = DBSCAN(eps=epsilon, min_samples=min_samples, metric='euclidean')
        
        start_time = time.time_ns()
        y_pred = clf.fit_predict(pca_data)
        scalability_results['dbscan']['pca'][n_components][num_samples] = time.time_ns() - start_time + time_pca
        
        start_time = time.time_ns()
        y_pred = clf.fit_predict(tsne_data)
        scalability_results['dbscan']['tsne'][n_components][num_samples] = time.time_ns() - start_time + time_tsne
        
        start_time = time.time_ns()
        y_pred = clf.fit_predict(encoded_data)
        scalability_results['dbscan']['ae'][n_components][num_samples] = time.time_ns() - start_time + time_encoder
        
    del data, pca_data, tsne_data, encoded_data
    print('gc:', gc.collect())
    
    with open('./models/scalability_results.h5', 'wb') as file:
        pickle.dump({'scalability_results': scalability_results,
                    'scalability_results_full': scalability_results_full}, file)
    print('done', num_samples)
#     break

gc: 55185
done 10
gc: 55128
done 50
gc: 82704
done 100
gc: 68904
done 150
gc: 55104
done 200
gc: 55113
done 250
gc: 41328
done 300
gc: 27561
done 350
gc: 27561
done 400
gc: 27552
done 450


MemoryError: Unable to allocate array with shape (1249975000,) and data type float64