
## Setup

You will need to make a copy of this Colab notebook in your Google Drive before you can edit the homework files. You can do so with **File &rarr; Save a copy in Drive**.


# Data Preparation for Meta-Learning 
Courtesy of [CS-330 HW1](https://colab.research.google.com/drive/1slBqgKa20iTatoWThMWZTnFysgAVD1vh?usp=sharing)

In [1]:
import os
import numpy as np
import os
import random
import tensorflow as tf
from scipy import misc

In [2]:
def get_images(paths, labels, nb_samples=None, shuffle=True):
    """
    Pair every image file found in a set of character folders with the
    label of the folder it came from.
    Args:
        paths: A list of character folders
        labels: List or numpy array of same length as paths; labels[k] is
            attached to every image taken from paths[k]
        nb_samples: Number of images to draw (without replacement) from each
            folder; None keeps every file in the folder
        shuffle: If True, shuffle the resulting list in place before
            returning it
    Returns:
        List of (label, image_path) tuples
    """
    pairs = []
    for label, folder in zip(labels, paths):
        filenames = os.listdir(folder)
        if nb_samples is not None:
            # random.sample raises ValueError when the folder holds fewer
            # than nb_samples entries.
            filenames = random.sample(filenames, nb_samples)
        for filename in filenames:
            pairs.append((label, os.path.join(folder, filename)))

    if shuffle:
        random.shuffle(pairs)
    return pairs


def image_file_to_array(filename, dim_input):
    """
    Load an image file and return it as a flat, inverted float32 vector.
    Args:
        filename: Image filename
        dim_input: Flattened shape of image (total number of pixels)
    Returns:
        1 channel image as a float32 numpy array of length dim_input
    """
    import imageio
    pixels = imageio.imread(filename)
    flat = pixels.reshape([dim_input]).astype(np.float32)
    # Divide by 255 and invert, so a raw value of 255 becomes 0.0 and a raw
    # value of 0 becomes 1.0 (presumably 8-bit input -- TODO confirm).
    return 1.0 - flat / 255.0

In [8]:
class DataGenerator(object):
    """
    Data Generator capable of generating batches of Omniglot data.
    A "class" is considered a class of omniglot digits.

    The character folders found under the data folder are shuffled once with
    a fixed seed and split into meta-train / meta-validation / meta-test sets.
    """

    def __init__(self, num_classes, num_samples_per_class, config=None):
        """
        Args:
            num_classes: Number of classes for classification (N-way)
            num_samples_per_class: num samples to generate per class in one batch
            config: Optional dict of overrides. Recognised keys:
                'data_folder': root folder laid out as family/character/images
                    (default '../data/omniglot_resized')
                'img_size': image (height, width) (default (28, 28))
                'num_train': number of character folders used for
                    meta-training (default 1100)
                'num_val': number of character folders used for
                    meta-validation (default 100)
        """
        # Avoid a shared mutable default argument.
        config = {} if config is None else config

        self.num_samples_per_class = num_samples_per_class
        self.num_classes = num_classes

        data_folder = config.get('data_folder', os.path.join('..', 'data', 'omniglot_resized'))
        self.img_size = config.get('img_size', (28, 28))

        self.dim_input = np.prod(self.img_size)
        self.dim_output = self.num_classes

        # Every data_folder/<family>/<character> directory is one class.
        # NOTE: os.listdir order is filesystem-dependent, so the split below
        # is only reproducible on a given copy of the dataset.
        character_folders = [os.path.join(data_folder, family, character)
                             for family in os.listdir(data_folder)
                             if os.path.isdir(os.path.join(data_folder, family))
                             for character in os.listdir(os.path.join(data_folder, family))
                             if os.path.isdir(os.path.join(data_folder, family, character))]

        # Seeds the module-level RNG: this fixes the shuffle below and also
        # the state that later random.* calls start from.
        random.seed(1)
        random.shuffle(character_folders)
        num_val = config.get('num_val', 100)
        num_train = config.get('num_train', 1100)
        self.metatrain_character_folders = character_folders[: num_train]
        self.metaval_character_folders = character_folders[
            num_train:num_train + num_val]
        self.metatest_character_folders = character_folders[
            num_train + num_val:]

    def sample_batch(self, batch_type, batch_size):
        """
        Samples a batch for training, validation, or testing
        Args:
            batch_type: "train" or "val"; any other value selects the
                meta-test folders
            batch_size: number of tasks (B) in the batch
        Returns:
            A tuple of (1) Image batch and (2) Label batch where
            image batch has shape [B, K, N, dim_input] and label batch has
            shape [B, K, N, N], both float32, where B is batch size, K is
            number of samples per class, N is number of classes. Labels are
            one-hot over the N classes of each task.
        Raises:
            ValueError: if the selected split has fewer than N character
                folders, or a folder has fewer than K images.
        """
        if batch_type == "train":
            folders = self.metatrain_character_folders
        elif batch_type == "val":
            folders = self.metaval_character_folders
        else:
            folders = self.metatest_character_folders

        num_classes = self.num_classes
        num_samples = self.num_samples_per_class
        dim_input = self.dim_input

        all_image_batches = np.zeros(
            (batch_size, num_samples, num_classes, dim_input), dtype=np.float32)
        all_label_batches = np.zeros(
            (batch_size, num_samples, num_classes, num_classes), dtype=np.float32)
        # Row i of the identity matrix is the one-hot vector for class i.
        one_hot_rows = np.eye(num_classes, dtype=np.float32)

        for b in range(batch_size):
            paths = random.sample(folders, num_classes)
            # shuffle=False keeps the images grouped by class, in class order,
            # which the [N, K, ...] reshapes below rely on.
            images_labels = get_images(paths, range(num_classes),
                                       nb_samples=num_samples,
                                       shuffle=False)

            # Use the per-task class indices handed back by get_images rather
            # than parsing folder names: two characters from the same alphabet
            # must still get distinct labels.
            class_ids = np.array([label for label, _ in images_labels],
                                 dtype=np.int64)
            task_labels = one_hot_rows[class_ids]
            task_images = np.array([image_file_to_array(path, dim_input)
                                    for _, path in images_labels])

            # [N * K, ...] -> [N, K, ...] -> [K, N, ...]
            task_images = task_images.reshape(num_classes, num_samples, dim_input)
            task_labels = task_labels.reshape(num_classes, num_samples, num_classes)
            all_image_batches[b] = np.swapaxes(task_images, 0, 1)
            all_label_batches[b] = np.swapaxes(task_labels, 0, 1)

        return all_image_batches, all_label_batches

In [9]:
# Smoke test: draw one meta-training batch and show the shapes of the
# returned image and label arrays.
# num_classes: number of classes per classification task (N-way)
# num_samples_per_class: number of images drawn per class in one task
# batch_size: number of tasks in the batch
num_classes, num_samples_per_class, batch_size = 2, 3, 4
tmp = DataGenerator(num_classes=num_classes,
                    num_samples_per_class=num_samples_per_class)
images, labels = tmp.sample_batch(batch_type='train', batch_size=batch_size)
images.shape, labels.shape

((4, 3, 2, 784), (4, 3, 2, 2))