# Imports

In [0]:
from PIL import Image, ImageDraw
import numpy as np
from sklearn import datasets

# Setup

In [0]:
# Mount CU Drive
from google.colab import drive
drive.mount('/content/drive')

!mkdir drive/My\ Drive/synthetic-ds
!mkdir drive/My\ Drive/synthetic-ds/ct2
!mkdir drive/My\ Drive/synthetic-ds/ctp2
!mkdir drive/My\ Drive/synthetic-ds/squares
!mkdir drive/My\ Drive/synthetic-ds/point
!mkdir drive/My\ Drive/synthetic-ds/point/circles
!mkdir drive/My\ Drive/synthetic-ds/point/s_curves
!mkdir drive/My\ Drive/synthetic-ds/point/swiss_roll

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/ct2’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/ctp2’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/squares’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/point’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/point/circles’: File exists
mkdir: cannot create directory ‘drive/My Drive/synthetic-ds/point/s_curves’: File exists


# Image Dataset Generation

## Squares

In [0]:
def get_sample(mean, stdev):
    """Draw a single 2-D point from an isotropic Gaussian.

    Args:
        mean: Length-2 mean vector of the distribution.
        stdev: Scalar standard deviation shared by both axes.

    Returns:
        A length-2 array sampled with covariance ``stdev**2 * I``.
    """
    covariance = np.identity(2) * (stdev ** 2)
    return np.random.multivariate_normal(mean, covariance)

def gen_mixture(no_of_dist, dataset_size, scaling_factor = (100, 10)):
    """Sample a dataset from a random mixture of isotropic 2-D Gaussians.

    Component means are drawn uniformly from ``[0, scaling_factor[0])`` per
    axis and component standard deviations uniformly from
    ``[0, scaling_factor[1])``. Each sample picks a component uniformly at
    random and draws one point from it via ``get_sample``.

    Args:
        no_of_dist: Number of mixture components.
        dataset_size: Number of points to sample.
        scaling_factor: ``(mean_scale, stdev_scale)`` pair.

    Returns:
        Tuple ``(idx, data, mean, stdev)``: the component index of every
        sample, the ``(dataset_size, 2)`` samples, and the component means
        and standard deviations.
    """
    # Mixture parameters
    component_means = np.random.rand(no_of_dist, 2) * scaling_factor[0]
    component_stdevs = np.random.rand(no_of_dist) * scaling_factor[1]

    # Which component every sample comes from
    assignments = np.random.randint(0, high = no_of_dist, size = dataset_size)

    samples = np.zeros((dataset_size, 2))
    for row, component in enumerate(assignments):
        samples[row] = get_sample(component_means[component], component_stdevs[component])

    return assignments, samples, component_means, component_stdevs

# Images are square, single-channel arrays of shape (im_size, im_size).
# For simplicity keep size of squares to be exponents of 2 i.e. 2, 4, 8 ...
def draw_square(im, square_size, im_size, top_left = None):
    """Paint an axis-aligned square of ones onto ``im`` (modified in place).

    Args:
        im: 2-D array to draw on; it is mutated and also returned.
        square_size: Side length of the square in pixels.
        im_size: Side length of the image, used to bound the random corner.
        top_left: Optional ``(row, col)`` of the square's top-left pixel. When
            omitted, a corner is sampled uniformly from every position at
            which the square fits entirely inside the image.

    Returns:
        Tuple ``(im, top_left)`` with the updated image and the corner used.
    """
    if top_left is None:
        # randint's `high` is exclusive, so the last valid corner
        # (im_size - square_size) needs the +1 to be reachable. Without it the
        # square could never touch the bottom/right edge and a square as large
        # as the image raised ValueError.
        top_left = np.random.randint(0, high = im_size - square_size + 1, size = 2)

    # Set colour
    row, col = top_left[0], top_left[1]
    im[row : row + square_size, col : col + square_size] = 1

    return im, top_left

def non_overlapping_square(im, no_of_squares, square_size, im_size):
    """Return a copy of ``im`` with ``no_of_squares`` non-overlapping squares.

    All corners are sampled jointly and the whole set is rejected and
    re-sampled as soon as one square would cover a pixel that is already 1.
    Pixels that are already 1 in the input count as occupied. The input
    array itself is never modified.

    Args:
        im: 2-D base image.
        no_of_squares: Number of squares to place (0 returns a plain copy).
        square_size: Side length of every square in pixels.
        im_size: Side length of the image, used to bound the random corners.

    Returns:
        A new array with the squares painted as ones.

    Raises:
        ValueError: If the request can never be satisfied (a square larger
            than the image, or more total square area than image area).
            Previously such requests raised inside numpy or looped forever.
    """
    if square_size > im_size:
        raise ValueError("square_size must not exceed im_size")
    if no_of_squares * square_size ** 2 > im_size ** 2:
        raise ValueError("requested squares cannot fit without overlapping")

    while True:
        # `high` is exclusive: +1 makes the last fitting corner reachable.
        corners = np.random.randint(
            0, high = im_size - square_size + 1, size = (no_of_squares, 2)
        )
        new_im = np.copy(im)
        for row, col in corners:
            region = new_im[row : row + square_size, col : col + square_size]
            if np.any(region == 1):
                # Collision: discard this whole set of corners and retry.
                break
            new_im[row : row + square_size, col : col + square_size] = 1
        else:
            # Every square was placed (also reached when there are none).
            return new_im



def gen_square_image(no_of_squares, square_size, overlap, im_size):
    """Generate one ``im_size`` x ``im_size`` image containing random squares.

    Args:
        no_of_squares: Number of squares to draw.
        square_size: Side length of each square in pixels.
        overlap: If true, squares are placed independently and may overlap;
            otherwise they are placed with ``non_overlapping_square``.
        im_size: Side length of the generated image.

    Returns:
        A 2-D array of zeros with the squares painted as ones.
    """
    im = np.zeros((im_size, im_size))

    if overlap:
        # The sampled corners are not needed by any caller, so they are
        # discarded instead of being collected into an unused list.
        for _ in range(no_of_squares):
            im, _corner = draw_square(im, square_size, im_size)
        return im

    return non_overlapping_square(im, no_of_squares, square_size, im_size)
  
def gen_square_dataset(no_of_squares, square_size, dataset_size, overlap, im_size = 128):
    """Build a stack of ``dataset_size`` random square images.

    Every image is produced by ``gen_square_image`` with the same settings.

    Returns:
        Array of shape ``(dataset_size, im_size, im_size)``.
    """
    dataset = np.zeros((dataset_size, im_size, im_size))
    for sample_no in range(dataset_size):
        image = gen_square_image(no_of_squares, square_size, overlap, im_size)
        dataset[sample_no] = image
    return dataset

In [1]:
# Save dataset with 1 square of size 32 
ds = gen_square_dataset(1, 32, 5000, False)
np.save("drive/My Drive/synthetic-ds/squares/1-32.npy", ds)

# Save dataset with 4 squares of size 16 
ds = gen_square_dataset(4, 16, 5000, False)
np.save("drive/My Drive/synthetic-ds/squares/4-16.npy", ds)

# Save dataset with 10 squares of size 8 
# (own file name: this used to overwrite 4-16.npy)
ds = gen_square_dataset(10, 8, 5000, False)
np.save("drive/My Drive/synthetic-ds/squares/10-8.npy", ds)

# Save dataset with 1 square of size 16 on 28 x 28 images
# (im_size belongs to the generator; np.save does not accept it)
ds = gen_square_dataset(1, 16, 5000, False, im_size = 28)
np.save("drive/My Drive/synthetic-ds/squares/1-16-28x28.npy", ds)

NameError: ignored

## Mixture of Polygons

### Code

In [0]:
def generate_ct2(im_path, im_size):
  """Draw 2 random circles and 2 random triangles on a black RGB image and save it.

  Circle centres are sampled at least 10 px away from the image border and
  the radius is kept below the distance to the nearest border, so circles lie
  fully inside the image. Triangle vertices are sampled anywhere in the
  image. Every shape gets an independent random RGB fill.

  Args:
    im_path: Destination file; the format is inferred from its extension.
    im_size: Side length of the square image in pixels (must exceed 20,
      otherwise the centre range is empty and numpy raises ValueError).
  """
  # Draw straight onto a fresh in-memory image; writing a blank file and
  # re-opening it first only added a redundant disk round trip per image.
  image = Image.new('RGB', (im_size, im_size))
  draw = ImageDraw.Draw(image)

  # Draw 2 circles
  for _ in range(2):
    x = np.random.randint(10, high = im_size - 10)
    y = np.random.randint(10, high = im_size - 10)
    r = np.random.randint(min(x, y, im_size - x, im_size - y))
    colour = tuple(np.random.randint(256) for _ in range(3))
    draw.ellipse((x-r, y-r, x+r, y+r), fill = colour)

  # Draw 2 triangles
  for _ in range(2):
    points = tuple((np.random.randint(im_size), np.random.randint(im_size)) for _ in range(3))
    colour = tuple(np.random.randint(256) for _ in range(3))
    draw.polygon(points, fill = colour)

  image.save(im_path)
  
def generate_ctp2(im_path, im_size):
  """Draw 2 circles, 2 triangles and 2 quadrilaterals on a black RGB image and save it.

  Circle centres are sampled at least 10 px away from the image border and
  the radius is kept below the distance to the nearest border, so circles lie
  fully inside the image. Polygon vertices are sampled anywhere in the image
  (quadrilaterals may therefore self-intersect). Every shape gets an
  independent random RGB fill.

  Args:
    im_path: Destination file; the format is inferred from its extension.
    im_size: Side length of the square image in pixels (must exceed 20,
      otherwise the centre range is empty and numpy raises ValueError).
  """
  # Create new image (all black)
  image = Image.new('RGB', (im_size, im_size))
  draw = ImageDraw.Draw(image)

  # Draw 2 circles
  for _ in range(2):
    x = np.random.randint(10, high = im_size - 10)
    y = np.random.randint(10, high = im_size - 10)
    r = np.random.randint(min(x, y, im_size - x, im_size - y))
    colour = tuple(np.random.randint(256) for _ in range(3))
    draw.ellipse((x-r, y-r, x+r, y+r), fill = colour)

  # Draw 2 triangles, then 2 polygons (4 sided)
  for vertex_count in (3, 3, 4, 4):
    points = tuple((np.random.randint(im_size), np.random.randint(im_size)) for _ in range(vertex_count))
    colour = tuple(np.random.randint(256) for _ in range(3))
    draw.polygon(points, fill = colour)

  # save() needs the destination; calling it without one raised TypeError
  # and left im_path unused.
  image.save(im_path)

def gen_image_dataset(image_name, im_size, dataset_size, image_generator, path):
  """Call ``image_generator`` once per image to write a numbered PNG series.

  File ``i`` is named ``path + image_name + str(i) + '.png'`` for
  ``i`` in ``0 .. dataset_size - 1``; the generator receives that file name
  and ``im_size``.
  """
  prefix = path + image_name
  for index in range(dataset_size):
    target = "".join((prefix, str(index), '.png'))
    image_generator(target, im_size)

### Generation

In [0]:
# 5000 images of 128 x 128 for each polygon mixture
gen_image_dataset(
    image_name = 'image',
    im_size = 128,
    dataset_size = 5000,
    image_generator = generate_ct2,
    path = 'drive/My Drive/synthetic-ds/ct2/',
)
gen_image_dataset(
    image_name = 'image',
    im_size = 128,
    dataset_size = 5000,
    image_generator = generate_ctp2,
    path = 'drive/My Drive/synthetic-ds/ctp2/',
)

### Utilities for testing

In [0]:
# Check if the files have been generated 
!ls drive/My\ Drive/synthetic-ds/ct2/
!ls drive/My\ Drive/synthetic-ds/ctp2/

In [0]:
# Delete all files in relevant folders
!rm drive/My\ Drive/synthetic-ds/ct2/*
!rm drive/My\ Drive/synthetic-ds/ctp2/*

rm: cannot remove 'drive/My Drive/synthetic-ds/ctp2/*': No such file or directory


# Point (in $R^n$) Dataset Generation 

## Circles

In [0]:
# Noise-free circles with inner/outer scale factors 0.1, 0.5 and 0.9
circle_factor_1, circle_factor_5, circle_factor_9 = (
    datasets.make_circles(n_samples = 5000, factor = factor, random_state = 0)
    for factor in (0.1, 0.5, 0.9)
)

# Fixed factor 0.5 with Gaussian noise of standard deviation 0.05, 0.1 and 0.5
circle_noise_5, circle_noise_10, circle_noise_50 = (
    datasets.make_circles(n_samples = 5000, factor = 0.5, noise = noise, random_state = 0)
    for noise in (0.05, 0.1, 0.5)
)

In [0]:
# Each dataset is an (X, y) pair; stack it into one array with the label as the last column.
np.save('drive/My Drive/synthetic-ds/point/circles/circle_factor_1', np.column_stack(circle_factor_1))
np.save('drive/My Drive/synthetic-ds/point/circles/circle_factor_5', np.column_stack(circle_factor_5))
np.save('drive/My Drive/synthetic-ds/point/circles/circle_factor_9', np.column_stack(circle_factor_9))

np.save('drive/My Drive/synthetic-ds/point/circles/circle_noise_5', np.column_stack(circle_noise_5))
np.save('drive/My Drive/synthetic-ds/point/circles/circle_noise_10', np.column_stack(circle_noise_10))
np.save('drive/My Drive/synthetic-ds/point/circles/circle_noise_50', np.column_stack(circle_noise_50))

## S Curve

In [0]:
# S curves with Gaussian noise of standard deviation 0.05, 0.1 and 0.5
scurve_noise_5, scurve_noise_10, scurve_noise_50 = (
    datasets.make_s_curve(n_samples = 5000, noise = noise, random_state = 0)
    for noise in (0.05, 0.1, 0.5)
)

In [0]:
# Each dataset is a (points, t) pair; stack it into one array with t as the last column.
np.save('drive/My Drive/synthetic-ds/point/s_curves/scurve_noise_5', np.column_stack(scurve_noise_5))
np.save('drive/My Drive/synthetic-ds/point/s_curves/scurve_noise_10', np.column_stack(scurve_noise_10))
np.save('drive/My Drive/synthetic-ds/point/s_curves/scurve_noise_50', np.column_stack(scurve_noise_50))

## Swiss Roll

In [0]:
# Swiss rolls with Gaussian noise of standard deviation 0.05, 0.1 and 0.5
roll_noise_5, roll_noise_10, roll_noise_50 = (
    datasets.make_swiss_roll(n_samples = 5000, noise = noise, random_state = 0)
    for noise in (0.05, 0.1, 0.5)
)

In [0]:
# Each dataset is a (points, t) pair; stack it into one array with t as the last column.
np.save('drive/My Drive/synthetic-ds/point/swiss_roll/roll_noise_5', np.column_stack(roll_noise_5))
np.save('drive/My Drive/synthetic-ds/point/swiss_roll/roll_noise_10', np.column_stack(roll_noise_10))
np.save('drive/My Drive/synthetic-ds/point/swiss_roll/roll_noise_50', np.column_stack(roll_noise_50))

## Clusters