## Create a dataset with heterogeneous Gaussian clusters in 128D

In [None]:
import torch
import os
import sys
from torch.distributions.multivariate_normal import MultivariateNormal
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
sys.path.append('../..')

In [None]:
# CHOOSE WHERE TO SAVE DATA
# Directory the generated dataset files are written to; change to your desired directory.
dataset_dir = './data/highdgaussian_intrinsicdim/'
# exist_ok=True creates the directory (and any missing parents) only when needed,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
# --- hyperparameters ---
numclusters = 5   # number of Gaussian clusters ("concepts")
dimspace = 128    # dimension of the ambient space the points live in
numpoints_perconc = (dimspace // 2) * 100000  # points sampled per concept
# intrinsic dimensionality of each concept: 2**3 - 2, 2**4 - 2, ...
intrinsic_dims = torch.tensor([2 ** (3 + j) - 2 for j in range(numclusters)])

# --- cluster centers (means) ---
# Each coordinate is drawn uniformly from [0, Kc); the seed fixes the draw.
K = 1 / 50
Q = 300
Kc = (Q * K) / dimspace
torch.manual_seed(500)
centers = torch.rand(numclusters, dimspace) * Kc

# --- cluster variances / covariances ---
# The per-dimension variance is inversely proportional to the intrinsic dim.
Kv = K / intrinsic_dims.float()


def _subspace_variances(per_dim_variance, intrinsic_dim):
    """Return a length-`dimspace` variance vector: the first `intrinsic_dim`
    entries equal `per_dim_variance`, the remaining entries are zero."""
    diagonal = torch.zeros((dimspace,))
    diagonal[:intrinsic_dim] = per_dim_variance
    return diagonal


variances = [
    _subspace_variances(scale, n_dims)
    for scale, n_dims in zip(Kv, intrinsic_dims.tolist())
]
# Diagonal covariance per cluster; the 1e-6 term on the diagonal keeps the
# matrix positive definite along the directions whose variance is zero.
Covmats = [torch.diag(var) + 1e-6 * torch.eye(dimspace) for var in variances]

# Ground-truth generative parameters, bundled so they can be stored with the data.
truefeatures = {
    'centers': centers,
    'variances': variances,
    'intrinsic_dims': intrinsic_dims,
}
# Sample every cluster from a multivariate Gaussian with that cluster's mean
# and covariance matrix. The seed makes the draws reproducible.
torch.manual_seed(754)

# Buffers are allocated once up front; cluster k fills the contiguous row range
# [k*numpoints_perconc, (k+1)*numpoints_perconc).
data_all = torch.zeros(numclusters * numpoints_perconc, dimspace)
class_id_all = torch.zeros(numclusters * numpoints_perconc, dtype=torch.int64)
for cluster_idx in range(numclusters):
    rows = slice(cluster_idx * numpoints_perconc, (cluster_idx + 1) * numpoints_perconc)
    gaussian = MultivariateNormal(
        loc=centers[cluster_idx],
        covariance_matrix=Covmats[cluster_idx],
    )
    data_all[rows] = gaussian.sample((numpoints_perconc,))
    class_id_all[rows] = cluster_idx  # label of a point = index of its cluster
numpoints_total = data_all.shape[0]

In [None]:
#visualize 2D PCA of data
# pca = PCA(n_components=2)
# pca.fit(data_all)
# data_all_pca = pca.transform(data_all)
# for k in range(numclusters):
#     plt.scatter(data_all_pca[class_id_all==k,0], data_all_pca[class_id_all==k,1], label=f"dim={str(intrinsic_dims[k].item())}")
# plt.legend()
# plt.title(f"PCA of {dimspace}dim data")
# plt.show()

In [None]:

# Shuffle the samples; the same permutation is applied to data and labels so
# that every row keeps its class id.
torch.manual_seed(41)
shuffle_indices = torch.randperm(numpoints_total)
data_all = data_all[shuffle_indices]
class_id_all = class_id_all[shuffle_indices]

# CREATE TRAIN, TEST SPLITS
frac_train = 0.7  # 70% train, 30% test
total_points = numpoints_total
train_data_size = int(frac_train * total_points)
test_data_size = total_points - train_data_size

# A second seeded permutation decides which rows go to which split: the first
# train_data_size entries index the train set, the remainder the test set.
torch.manual_seed(4)
random_ordering = torch.randperm(total_points)
train_indices, test_indices = random_ordering.split([train_data_size, test_data_size])

train_data_all = data_all[train_indices]
train_class_id_all = class_id_all[train_indices]
test_data_all = data_all[test_indices]
test_class_id_all = class_id_all[test_indices]

In [None]:
# SAVE DATA
#location to save data
# labdir = os.environ['USERDIR']
# data_loc = labdir+'/data/'
# dataset_dir = data_loc+f'/{dimspace}dgaussian_intrinsicdim/'
dim = dimspace  # data dimension

# Write one file per split into dataset_dir. Each file is a dict holding the
# number of clusters, the data dimension, the split's points and labels, and
# the ground-truth generative parameters.
# os.path.join is used instead of string concatenation so the files land inside
# dataset_dir whether or not that path ends with a separator.
for split_filename, split_data, split_labels in (
    ('traindata.pt', train_data_all, train_class_id_all),
    ('testdata.pt', test_data_all, test_class_id_all),
):
    torch.save(
        {
            'numclusters': numclusters,
            'dim': dim,
            'data': split_data,
            'labels': split_labels,
            'truefeatures': truefeatures,
        },
        os.path.join(dataset_dir, split_filename),
    )