In [None]:
#   %pip install --upgrade pip

#   %pip install scikit-learn
#   %pip install seaborn


In [None]:
import os
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from pandas import DataFrame
import seaborn as sns
import pandas as pd



In [None]:
# Parameter sweep: one dataset is generated for every combination of the
# values listed in the four lists below.

# number of points per dataset (default 1000)
number_points = [1000, 10000, 50000]

# number of features per point (default 2,3)
dimensions_span = [2, 3]

# number of clusters per dataset (default 7)
number_cluster_span = [3, 5, 7]

# standard deviation of every cluster (default 0.5)
std_span = [0.75]

# How many values each parameter takes.
(
    len_number_points,
    len_dimensions_span,
    len_number_cluster_span,
    len_std_span,
) = map(len, (number_points, dimensions_span, number_cluster_span, std_span))

print("number_points :", len_number_points)
print("dimensions_span :", len_dimensions_span)
print("number_cluster_span :", len_number_cluster_span)
print("std_span :", len_std_span)

# Size of the cartesian product of the four lists.
print(
    "expected cardinality: ",
    len_number_points * len_dimensions_span * len_number_cluster_span * len_std_span,
)

In [None]:
# Validate every requested parameter value and build `total_tuples`, the list
# of (nodes, dimensions, clusters, std) combinations to generate.
#
# Every invalid value is reported with a message; after the whole sweep has
# been checked the cell aborts with SystemExit if anything was invalid.
# In each test the type check comes FIRST, so a value of the wrong type is
# reported instead of crashing the comparison with a TypeError, and the
# offending value is printed with %r so the message itself cannot fail.
valid_parameters = True
min_dimensions = 2  # the "(2D)" wording of the error message assumes this value
min_centers = 1
min_nodes = 3
min_cluster_std = 0
max_cluster_std = 3

# generate all tuples
total_tuples = []

for this_std in std_span:
    # The chained comparison also rejects NaN, which two separate </> tests let through.
    if (not isinstance(this_std, float)) or not (min_cluster_std <= this_std <= max_cluster_std):
        print("cluster_std must be in range [%9f, %9f], you called %r,  this is unaccettable" % (min_cluster_std, max_cluster_std, this_std))
        valid_parameters = False
        continue

    for this_dimensions in dimensions_span:
        if (not isinstance(this_dimensions, int)) or this_dimensions < min_dimensions:
            print("dimensions need to be at the least (2D)  %i , you called for : %r,  this is unaccettable" % (min_dimensions, this_dimensions))
            valid_parameters = False
            continue

        for this_clusters in number_cluster_span:
            if (not isinstance(this_clusters, int)) or this_clusters < min_centers:
                print("there need to be at the least %i centers (bruh), you called for : %r,  this is unaccettable" % (min_centers, this_clusters))
                valid_parameters = False
                continue

            for this_nodes in number_points:
                # A dataset needs at least `min_nodes` points and at least one point per cluster.
                if (not isinstance(this_nodes, int)) or this_nodes < min_nodes or this_nodes < this_clusters:
                    print("need to be at the least as many nodes as centers <i.e. %i>, and in general at the least <%i nodes> ; you called for : <%r nodes>,  this is unaccettable" % (max(min_centers, this_clusters), min_nodes, this_nodes))
                    valid_parameters = False
                    continue

                total_tuples.append((this_nodes, this_dimensions, this_clusters, this_std))

if not valid_parameters:
    print("some parameters were invaid")
    raise SystemExit("Stop right there!")

print("tuples_len : ", len(total_tuples))
#   print(total_tuples)

In [None]:
# Output locations, all relative to `base_path`:
#   ../data/                generated CSV files
#   ../plots/               parent folder for every kind of plot
#   ../plots/datasetPlots/  2D and 3D scatter plots of the generated datasets
base_path = ".."
data_folder = "data"
genera_plots_folder = "plots"
dataset_plot_folder = "datasetPlots"


def _ensure_folder(path):
    """Create directory `path` unless it already exists, and report which case applied.

    Missing parent directories are created as well.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    if os.path.isdir(path):
        print(f'folder {path} already exists')
        return
    print(f'folder {path} not found, I shall create it (once)')
    # exist_ok=True keeps this safe if the folder appears between the check and the call.
    os.makedirs(path, exist_ok=True)


# Parents are listed before children so the printed report follows the folder tree.
for _folder in (
    f'{base_path}/{data_folder}',
    f'{base_path}/{genera_plots_folder}',
    f'{base_path}/{genera_plots_folder}/{dataset_plot_folder}',
):
    _ensure_folder(_folder)

In [None]:
# Toggle for plot generation. When True, the generation loop saves one scatter
# plot per parameter combination (only on the first repetition, and only for
# 2D or 3D data); when False, no image is produced at all.
do_images = True


In [None]:
# For every parameter combination (and every repetition of it):
#   1. draw a blob dataset,
#   2. min-max scale each feature and save it as CSV,
#   3. pick k points of the scaled dataset as centroids and save them as CSV,
#   4. on the first repetition only, save a 2D/3D scatter plot of the raw points.
number_repetitions = 1

# Single seed shared by the dataset generator and the centroid sampling, so a
# re-run of the notebook reproduces exactly the same files.
RND_random_seed = 3

for n, d, k, c_std in total_tuples:
    for irepet in range(number_repetitions):
        # set tuple repetition name
        combination_name = f"n_{n}_d_{d}_k_{k}_cstd_{c_std}_iteration{irepet}"

        points, y = make_blobs(n_samples=n, centers=k, n_features=d, random_state=RND_random_seed, cluster_std=c_std, shuffle=True)

        # min-max scale every column independently
        df = pd.DataFrame(points)
        df = df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

        # save this dataset
        df.to_csv(f'{base_path}/{data_folder}/{combination_name}_dataset_test.csv', index=False, header=False)

        # save centroids: k distinct points of the dataset, re-indexed 0..k-1.
        # (Sampling n rows here would just write a shuffled copy of the whole dataset.)
        centroids = df.sample(n=k, random_state=RND_random_seed)
        centroids.index = range(k)
        centroids.to_csv(f'{base_path}/{data_folder}/{combination_name}_true_centroids_test.csv', index=True, header=False)

        # produce image: first repetition only, and only for data that can be drawn
        if irepet == 0 and do_images and d in (2, 3):
            palette = sns.color_palette("hsv", n_colors=k)

            if d == 2:
                # Separate frame for plotting so the scaled dataset `df` is not clobbered.
                plot_df = DataFrame(dict(x=points[:, 0], y=points[:, 1], label=y))

                # 2D scatterplot, one colour per cluster label
                fig, ax = plt.subplots()
                for key, group in plot_df.groupby('label'):
                    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key+1, color=palette[key])
            else:
                plot_df = DataFrame(dict(x=points[:, 0], y=points[:, 1], z=points[:, 2], label=y))

                # 3D scatterplot, one colour per cluster label
                fig = plt.figure()
                ax = fig.add_subplot(111, projection='3d')
                for key, group in plot_df.groupby('label'):
                    ax.scatter(group['x'], group['y'], group['z'], label=key+1, color=[palette[key]], alpha=0.25)
                ax.set_xlabel('X')
                ax.set_ylabel('Y')
                ax.set_zlabel('Z')

            # Set labels and legend
            ax.legend()
            ax.set_title(f"Results (n={n}, d={d}, k={k}, cstd={c_std})")

            # Save the plot as an image file
            fig.savefig(f'{base_path}/{genera_plots_folder}/{dataset_plot_folder}/{combination_name}_dataset_plot.png')

            # Show the figure in the notebook, then release it: without the close,
            # every figure of the sweep stays open until the cell finishes.
            plt.show()
            plt.close(fig)

