## Create a dataset of Gaussian clusters in 2D, with linearly and nonlinearly separable clusters

In [None]:
import torch
import os
import sys
import matplotlib.pyplot as plt
from torch.distributions.multivariate_normal import MultivariateNormal
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
sys.path.append('../..')

In [None]:
# CHOOSE WHERE TO SAVE THE DATA
dataset_dir = './data/2dgaussian_diffmag/' #change this to your location
# Create the directory (and any missing parents). exist_ok=True replaces the
# separate existence check, so re-running the cell is a no-op and there is no
# window between "check" and "create" in which the call could fail.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
# Seed the global torch RNG so the notebook is reproducible.
torch.manual_seed(0)

# --- hyperparameters ---
dimspace = 2  # dimension of the ambient data space
numclusters = 6  # number of concepts (one gaussian cluster per concept)
numpoints_perconc = 1000000*(dimspace//2)  # samples drawn per concept
# intrinsic dimension of each concept; all concepts are 2D here
intrinsic_dims = torch.full((numclusters,), 2, dtype=torch.int64)

# --- cluster centers ---
# Centers sit at equally spaced angles around the origin, with the radius
# alternating between a large and a small magnitude from one cluster to the next.
K = 1
Kc = K/dimspace  # scale applied to the center radii
rad = Kc*torch.tensor([3.0, 1.0, 3.0, 1.0, 3.0, 1.0])  # radius (magnitude) per cluster
angles = torch.arange(0, 2*torch.pi, 2*torch.pi/numclusters)
# polar -> cartesian; one row (x, y) per cluster
centers = torch.column_stack((rad*angles.cos(), rad*angles.sin()))

# --- cluster variances ---
scaler_alpha = 4.5
Qv = 1/(2**scaler_alpha)
Kv = Qv*K/intrinsic_dims.float().max()  # variance per dimension (0-d tensor)
# Nothing random is drawn between the seed at the top of this cell and this
# one, so this is the seed in effect for whatever is sampled next.
torch.manual_seed(625)
variances = [Kv*torch.ones((int(d),)) for d in intrinsic_dims]
# Isotropic (diagonal) covariance per cluster, plus a tiny 1e-6 diagonal term
# (presumably a numerical-stability jitter -- confirm with the author).
Covmats = [1e-6*torch.eye(dimspace) + torch.diag(var_k) for var_k in variances]
truefeatures = {'centers': centers, 'variances': variances}

# Draw numpoints_perconc samples from each cluster's gaussian. Samples are
# stored cluster after cluster, so rows [k*n, (k+1)*n) all belong to cluster k.
data_all = torch.empty((numclusters*numpoints_perconc, dimspace))
for k in range(numclusters):
    clusterk = MultivariateNormal(centers[k], Covmats[k])
    first_row = k*numpoints_perconc
    data_all[first_row:first_row+numpoints_perconc] = clusterk.sample((numpoints_perconc,))
# Label of every row: 0,...,0,1,...,1,... matching the storage order above.
class_id_all = torch.arange(numclusters).repeat_interleave(numpoints_perconc)
numpoints_total = len(data_all)

# Optional sanity check: how well can a linear model (logistic regression)
# separate the clusters? This cell can be skipped.
X_train, X_test, y_train, y_test = train_test_split(data_all, class_id_all, test_size=0.33, random_state=42)

# Overall accuracy of one multi-class classifier trained on all concepts.
clf = LogisticRegression(random_state=0, max_iter=1000, C=1e-1,penalty='l2').fit(X_train, y_train)
score = clf.score(X_test, y_test)

# Accuracy for each concept separately (one vs all).
# The per-concept classifier and score get their own names so they do not
# overwrite the multi-class `clf` / `score` computed above; otherwise the
# summary below would report only the last concept's score.
scoresperconcept = []
for k in range(numclusters):
    y_train_concept = (y_train == k)
    y_test_concept = (y_test == k)
    clf_concept = LogisticRegression(random_state=0, max_iter=1000, C=1e-1,penalty='l2').fit(X_train, y_train_concept)
    scoresperconcept.append(clf_concept.score(X_test, y_test_concept))
print(f"Scaler={scaler_alpha}, Score={score}")
print(f"Per-concept (one vs all) scores={scoresperconcept}")

In [None]:
# Scatter-plot a random subsample of the points, coloured by cluster label.
numpoints_viz = 1000
# Row indices are drawn uniformly at random (repeats are possible).
id_viz = torch.randint(low=0, high=numpoints_total, size=(numpoints_viz,))
# Transposing the (numpoints_viz, 2) subsample and unpacking it passes the
# first column as x and the second as y.
plt.scatter(*data_all[id_viz].T, c=class_id_all[id_viz], cmap='tab10')
plt.title("Data with different magnitudes")
plt.show()

In [None]:
# Shuffle data and labels with the same permutation.
torch.manual_seed(41)
shuffle_indices = torch.randperm(numpoints_total)
datax = data_all[shuffle_indices]
classidx = class_id_all[shuffle_indices]

# CREATE TRAIN, TEST SPLITS
# A second, independently seeded permutation decides which shuffled rows go
# to the train split and which to the test split.
torch.manual_seed(4)
frac_train = 0.7  # 70% train, 30% test
total_points = numpoints_total
train_data_size = int(frac_train*total_points)
test_data_size = total_points-train_data_size
random_ordering = torch.randperm(total_points)
# First train_data_size entries -> train, remaining test_data_size -> test.
train_indices, test_indices = random_ordering.split([train_data_size, test_data_size])

train_datax, test_datax = datax[train_indices], datax[test_indices]
train_classidx, test_classidx = classidx[train_indices], classidx[test_indices]

In [None]:
# SAVE DATA
# Each split is written to <dataset_dir>/<split>data.pt as a dict holding the
# number of clusters, the data dimension, the samples, their labels and the
# ground-truth cluster parameters.
#
# To save somewhere else, change dataset_dir where it is defined, e.g. to
# os.environ['USERDIR'] + '/data/2dgaussian_diffmag/'.
dim = dimspace #data dimension

for split_name, split_data, split_labels in (
    ('train', train_datax, train_classidx),
    ('test', test_datax, test_classidx),
):
    # os.path.join keeps the file inside dataset_dir whether or not the
    # configured path ends with a separator; plain string concatenation
    # would glue the file name onto the last directory name instead.
    torch.save({'numclusters': numclusters,
                'dim': dim,
                'data': split_data,
                'labels': split_labels,
                'truefeatures': truefeatures},
               os.path.join(dataset_dir, f'{split_name}data.pt'))