In [4]:
import math
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Originally adapted from https://stackoverflow.com/questions/64890117/what-is-the-best-way-to-generate-all-binary-strings-of-the-given-length-in-pytho
# Now implemented with itertools.product instead of hand-written recursion.
def generate_binary(bit_count):
    """Return every binary tuple of length ``bit_count``.

    The tuples are listed in ascending binary order, e.g. for
    ``bit_count == 2``: ``[(0, 0), (0, 1), (1, 0), (1, 1)]``.

    Args:
        bit_count: Non-negative integer number of bits per tuple.

    Returns:
        A list of ``2 ** bit_count`` tuples whose entries are the ints 0 and 1.
        ``bit_count == 0`` yields ``[()]``.

    Raises:
        ValueError: If ``bit_count`` is negative or not integral. The previous
            recursive version never reached its base case for such inputs and
            recursed until ``RecursionError``.
    """
    # Local import so this definition does not depend on the notebook's import cell.
    from itertools import product

    width = int(bit_count)
    if width != bit_count or width < 0:
        raise ValueError(
            f"bit_count must be a non-negative integer, got {bit_count!r}"
        )

    # product iterates the right-most position fastest, which is the same
    # ordering the recursive '0'-then-'1' expansion produced.
    return list(product((0, 1), repeat=width))

def bitstr_to_tuple(bitstr):
    """Convert a string of digit characters into a tuple of ints.

    Every character of ``bitstr`` is passed to ``int`` on its own, so
    ``'0110'`` becomes ``(0, 1, 1, 0)`` and the empty string becomes ``()``.
    """
    return tuple(map(int, bitstr))

In [20]:
# ---- Configuration ---------------------------------------------------------
# Seed for the NumPy sampler, so that re-running the cell regenerates the same data
SEED = 42
# Number of data points drawn per concept (and per modality)
NUM_DATA = 1000
# Number of samples from the same concept Gaussian ( i.e. 'modalities' )
NUM_SAMPLES = 5
# Standard deviation of generated Gaussian distributions
SEP = 0.5 # 0.5,1.0,2.0 with 100 concepts
# Number of concepts
NUM_CONCEPTS = 32 # 2,16,32,64 with 0.5 std

# Modalities are keyed 'a', 'b', 'c', ... in the output dict; keep them within a-z.
if not 1 <= NUM_SAMPLES <= 26:
    raise ValueError(f"NUM_SAMPLES must be between 1 and 26, got {NUM_SAMPLES}")

# ---- Concept means ---------------------------------------------------------
# Means of the concepts are binary vectors (hypercube vertices), so that they're
# spaced out evenly a la sklearn.datasets.make_classification
# [I. Guyon, 'Design of experiments for the NIPS 2003 variable selection benchmark', 2003.]
# math.log2 is used instead of math.log(x, 2): the latter is a floating-point
# quotient that can land slightly above an integer and make ceil overshoot.
concept_dim = math.ceil(math.log2(NUM_CONCEPTS))
concept_means = generate_binary(concept_dim)[:NUM_CONCEPTS]

# ---- Sampling --------------------------------------------------------------
rng = np.random.default_rng(SEED)
# multivariate_normal expects a covariance matrix, i.e. variances on the
# diagonal. SEP is a standard deviation, so it has to be squared here.
concept_cov = np.eye(concept_dim) * SEP ** 2

# Nested list indexed as [modality_idx][concept_idx] -> array (NUM_DATA, concept_dim)
total_raw_data = [
    [
        rng.multivariate_normal(concept_means[concept_idx], concept_cov, size=NUM_DATA)
        for concept_idx in range(NUM_CONCEPTS)
    ]
    for modality_idx in range(NUM_SAMPLES)
]

# (modality, concept, data, dim) -> (data, concept, modality, dim), then merge the
# first two axes. Row r therefore belongs to concept r % NUM_CONCEPTS, which is
# exactly the pattern np.tile produces for the labels.
total_data = np.array(total_raw_data).transpose((2, 1, 0, 3)).reshape((-1, NUM_SAMPLES, concept_dim))
total_labels = np.tile(np.arange(NUM_CONCEPTS), NUM_DATA)
# Data shape: [datapoint_idx, modality_idx, modality_dim], Label shape: [datapoint_idx]
assert total_data.shape == (NUM_DATA * NUM_CONCEPTS, NUM_SAMPLES, concept_dim)
assert total_labels.shape == (NUM_DATA * NUM_CONCEPTS,)

# ---- Train / test split ----------------------------------------------------
# One key per modality ('a', 'b', ...), derived from NUM_SAMPLES so that changing
# the number of modalities neither drops modalities nor indexes out of range.
keys = [chr(ord('a') + i) for i in range(NUM_SAMPLES)]
data = {'train': dict(), 'test': dict()}
X_train, X_test, y_train, y_test = train_test_split(
    total_data, total_labels, test_size=0.33, random_state=42, stratify=total_labels
)
for i, k in enumerate(keys):
    data['train'][k] = X_train[:, i]
    data['test'][k] = X_test[:, i]
data['train']['label'] = y_train
data['test']['label'] = y_test

# ---- Persist ---------------------------------------------------------------
with open(f"SIMPLE_DATA_CLASS={NUM_CONCEPTS}_DIM={concept_dim}_STD={SEP}.pickle", 'wb') as f:
    pickle.dump(data, f)

In [11]:
# Inspect the concept means built by the generation cell. The bare expression is
# rendered as the cell output, which reflects the NUM_CONCEPTS value of the last run.
concept_means

[(0,), (1,)]