In [4]:
import numpy as np

In [22]:
# Open the raw MNIST and spam archives (.npz) from the shared data directory
mnist_full_data, spam_full_data = (
    np.load("../data/mnist-data.npz"),
    np.load("../data/spam-data.npz"),
)

In [42]:
# Fix NumPy's global random state so the shuffle below yields the same split on every fresh run
np.random.seed(seed=189)

In [40]:
# Pull the feature arrays and their matching labels out of each loaded archive
mnist_training_data = mnist_full_data['training_data']
mnist_data_labels = mnist_full_data['training_labels']
spam_training_data = spam_full_data['training_data']
spam_data_labels = spam_full_data['training_labels']

In [77]:
# Flatten every sample into a single row so each dataset becomes a 2-D (samples, features) matrix;
# for MNIST this turns the 28x28 matrix representation of each image into one vector.
# Resulting shapes noted for this data: MNIST (60,000, 784), spam (4171, 32)
mnist_training_data_2d = np.reshape(mnist_training_data, (mnist_training_data.shape[0], -1))
spam_training_data_2d = np.reshape(spam_training_data, (spam_training_data.shape[0], -1))

In [78]:
# Join the flattened features with their labels: the labels become the LAST column of each array,
# so a row-wise shuffle keeps every sample paired with its label.
# The joined arrays are rebuilt from the raw feature arrays instead of from their own previous
# value, so re-running this cell cannot stack a second label column onto an already-joined array.
# Expected shapes noted for this data: MNIST (60,000, 785), spam (4171, 33)
mnist_training_data_2d = np.column_stack((
    mnist_training_data.reshape(mnist_training_data.shape[0], -1),
    mnist_data_labels,
))
spam_training_data_2d = np.column_stack((
    spam_training_data.reshape(spam_training_data.shape[0], -1),
    spam_data_labels,
))

In [79]:
# Randomly reorder the rows of each joined array in place.
# MNIST is shuffled before spam so the global random stream is consumed in the same order.
for joined_rows in (mnist_training_data_2d, spam_training_data_2d):
    np.random.shuffle(joined_rows)

In [84]:
MNIST_DATA_LENGTH = 60000
def split_and_write_data(data: np.ndarray, training_percent = 0.0, training_amount = 0, data_name = None, output_dir = "../data"):
    """Split a 2-D array into training/testing parts and save them as .npy files.

    The last column of ``data`` is treated as the label column; every other
    column is a feature. The first rows form the training part and the
    remaining rows form the testing part, so ``data`` should already be
    shuffled if a random split is wanted.

    Parameters
    ----------
    data : np.ndarray
        2-D array of shape (rows, features + 1) with labels in the last column.
    training_percent : float
        Fraction of rows (0 < p <= 1) used for training. When non-zero it takes
        precedence over ``training_amount``.
    training_amount : int
        Number of leading rows used for training. Only consulted when
        ``training_percent`` is falsy (0.0 / None).
    data_name : str, optional
        Prefix of the output file names. When omitted, the name falls back to
        "mnist" if ``data`` has exactly ``MNIST_DATA_LENGTH`` rows and "spam"
        otherwise.
    output_dir : str or path-like
        Directory the four files are written into.

    Writes
    ------
    ``{output_dir}/{data_name}_training_data.npy``
    ``{output_dir}/{data_name}_training_data_labels.npy``
    ``{output_dir}/{data_name}_testing_data.npy``
    ``{output_dir}/{data_name}_testing_data_labels.npy``

    Raises
    ------
    ValueError
        If ``training_percent`` is non-zero but outside (0, 1], or if
        ``training_amount`` is negative (a negative slice bound would silently
        take rows from the end instead of the start).
    """
    data_length = data.shape[0]

    # Work out the row index separating the training rows from the testing rows.
    if training_percent:
        if not 0 < training_percent <= 1:
            raise ValueError(
                f"training_percent must be in (0, 1], got {training_percent!r}"
            )
        split_index = int(data_length * training_percent)
    else:
        if training_amount < 0:
            raise ValueError(
                f"training_amount must be non-negative, got {training_amount!r}"
            )
        split_index = training_amount

    training_rows, testing_rows = data[:split_index], data[split_index:]

    # Backward-compatible fallback: guess the dataset from its row count only
    # when the caller did not name it explicitly.
    if data_name is None:
        data_name = "mnist" if data_length == MNIST_DATA_LENGTH else "spam"

    # Features are every column but the last; labels are the last column.
    outputs = (
        ("training_data", training_rows[:, :-1]),
        ("training_data_labels", training_rows[:, -1]),
        ("testing_data", testing_rows[:, :-1]),
        ("testing_data_labels", testing_rows[:, -1]),
    )
    for suffix, array in outputs:
        np.save(f"{output_dir}/{data_name}_{suffix}.npy", array)

In [83]:
# MNIST: the first 10,000 shuffled rows are saved as training data, the remaining rows as testing data
split_and_write_data(data=mnist_training_data_2d, training_amount=10_000)
# Spam: the first 20% of the shuffled rows are saved as training data, the remaining rows as testing data
# NOTE(review): in both calls the "training" part is the smaller side of the split - confirm this is intended
split_and_write_data(data=spam_training_data_2d, training_percent=0.2)