# Create a Clean HDF5 Dataset

## Create a Generator Class

In [1]:
import HelperFuncs as hfuncs

class myScanGenerator:
    '''Generator wrapper that pulls raw scans from S3 in batches, cleans every
    angle slice, and yields the cleaned scans one at a time.

    Note: the statements in the class body below run once, when the class is
    defined (i.e. when this cell executes).  The AWS connection, bucket handle
    and labels dictionary are therefore created at definition time and shared
    by all instances.
    '''
    # AWS and Directory information 
    bucketName = hfuncs.RAW_DATA_BUCKET
    dataDir = hfuncs.RAW_DATA_DIRECTORY
    temp_dir = hfuncs.TEMP_DIR
    labels_dir = hfuncs.LABELS_FILE
    
    # Connect to AWS
    key_id, secret_key = hfuncs.GetAWSCredentials()
    client = hfuncs.GetAWSClient(key_id,secret_key)
    bucket = client.Bucket(bucketName)
    extension = hfuncs.EXTENSION
    
    # Labels and keys
    labels_dict = hfuncs.GetLabelsDict(labels_dir)
    key_ary = None
    
    # Samples to load at a time (reduce if memory available is low)
    n_samples = 0
    
    # Requester (populated per instance in __init__)
    batch_requester = None
    
    def __init__(self,keys,n_samples):
        '''Store the keys to process and build the batch requester.

        keys      -- keys of the samples this generator should process
        n_samples -- number of samples requested from the bucket per batch
        '''
        # Keys of samples to process
        self.key_ary = keys
        
        # Samples to load at a time
        self.n_samples = n_samples
        
        # Initialize AWS Batch Requester
        self.batchrequester = hfuncs.BatchRequester(self.bucket,self.key_ary,self.labels_dict,self.dataDir,self.temp_dir,self.extension)

        # The class declares `batch_requester` but the instance attribute has
        # always been spelled `batchrequester`.  Expose the same object under
        # both names so the declared attribute is no longer left at None, while
        # code that relies on the original spelling keeps working.
        self.batch_requester = self.batchrequester
    
    def GenerateSamples(self):
        '''Returns a generator that retrieves n_samples scans at a time, applies the
        cleaning function to every angle slice, and yields the cleaned scans one by one.

        Each item yielded is a tuple (X, y):
          X -- numpy array of shape (n_angles, hfuncs.FINAL_WIDTH, hfuncs.FINAL_HEIGHT, hfuncs.CHANNELS)
          y -- numpy array of shape (hfuncs.ZONES,)

        Both arrays are views into buffers allocated per batch; copy them if they
        are going to be modified in place by the consumer.
        '''
        # Imported here so the class does not depend on a later notebook cell
        # having imported numpy before this method is called.
        import numpy as np

        # Only one channel in these data.  In the future, this can be changed for multichannel videos or scans.
        chan = 0

        #While there is data left, yield batch
        while self.batchrequester.DoItemsRemain():
            #Request data
            print("Retrieving samples..")
            X,y = self.batchrequester.NextBatch(self.n_samples)
            # X is indexed as X[scan, :, :, angle] below, so axis 3 counts the angles
            n_angles = X.shape[3]

            print("Samples retrieved")

            # Initialize output arrays
            n_scans = X.shape[0]
            X_train = np.zeros((n_scans,n_angles,hfuncs.FINAL_WIDTH,hfuncs.FINAL_HEIGHT,hfuncs.CHANNELS))
            y_train = np.zeros((n_scans,hfuncs.ZONES))

            # For each scan, clean each angle slice and store it in output array.
            for i in range(n_scans):
                # Clean each angle slice
                for j in range(n_angles):
                    X_train[i,j,:,:,chan] = hfuncs.CropCleanResize(X[i,:,:,j],hfuncs.FINAL_WIDTH,hfuncs.FINAL_HEIGHT)
                
                # Store label
                y_train[i,:] = y[i,:]
                
                # Yield scans, one by one
                yield X_train[i,:,:,:,:],y_train[i]

Setting demo environment variables...


## Train-Val Split

In [2]:
from sklearn.model_selection import train_test_split

def getTrainTestSplit(labels_dir=hfuncs.LABELS_FILE,extension=hfuncs.EXTENSION,
                        dataDir=hfuncs.RAW_DATA_DIRECTORY,bucketName=hfuncs.RAW_DATA_BUCKET,
                        test_size=0.20,random_state=0):
    '''Retrieves the keys of all samples in the raw data bucket that have labels
    and splits them into a train set and a val set.

    Parameters
    ----------
    labels_dir   -- location of the labels file, passed to hfuncs.GetLabelsDict
    extension    -- file extension, passed to hfuncs.CleanKeyAry
    dataDir      -- raw data directory, passed to hfuncs.CleanKeyAry
    bucketName   -- name of the S3 bucket holding the raw data
    test_size    -- share of the keys placed in the val set (forwarded to
                    sklearn's train_test_split); defaults to 0.20
    random_state -- seed forwarded to train_test_split so the split is
                    repeatable; defaults to 0

    Returns
    -------
    (K_train, K_val) -- the two key collections produced by train_test_split
    '''
    # Labels        
    labels_dict = hfuncs.GetLabelsDict(labels_dir)

    # AWS Bucket
    key_id, secret_key = hfuncs.GetAWSCredentials()
    client = hfuncs.GetAWSClient(key_id,secret_key)
    bucket = client.Bucket(bucketName)
    
    # Get shuffled keys 
    key_ary = hfuncs.GetShuffledKeys(bucket)
    
    # Make sure keys correspond to samples that have labels
    key_ary = hfuncs.CleanKeyAry(key_ary,labels_dict,dataDir,extension)
    
    # Perform train-val split
    K_train,K_val = train_test_split(key_ary,test_size=test_size,random_state=random_state)
    
    return K_train, K_val

## Clean Data and Upload to S3 Bucket

In [4]:
import h5py
import os
import setup_file as S
import numpy as np

# Scans requested from the raw bucket per batch (reduce if memory available is low)
SCANS_PER_REQUEST = 5
# Scans kept per split when running as a demo
DEMO_SAMPLES = 5

K_train, K_val = getTrainTestSplit()

# If it is a demo, only get 5 samples
if hfuncs.IS_DEMO:
    # Seeded so that re-running the notebook picks the same demo samples
    demo_rng = np.random.RandomState(0)
    # Clamp the sample size: choice(..., replace=False) raises if asked for
    # more items than the split contains
    K_train = demo_rng.choice(K_train,min(DEMO_SAMPLES,len(K_train)),replace=False)
    K_val = demo_rng.choice(K_val,min(DEMO_SAMPLES,len(K_val)),replace=False)


# Connect to AWS S3
key_id, secret_key = hfuncs.GetAWSCredentials()
client = hfuncs.GetAWSClient(key_id,secret_key)
bucket = client.Bucket(hfuncs.CLEAN_DATA_BUCKET)

# Clean and upload data!
for mode, keys in zip(["train_scan","val_scan"],[K_train,K_val]):
    print("Cleaning {} data...".format(mode))
    scan_gen = myScanGenerator(keys,SCANS_PER_REQUEST)
    
    # The generator yields one cleaned scan at a time; each scan goes into its own file
    for i, (X, y) in enumerate(scan_gen.GenerateSamples()):
        print("Completing batch {} of {}".format(i+1,len(keys)))
        batch_name = "batch_{}.hdf5".format(i)
        filename = os.path.join(hfuncs.TEMP_DIR,batch_name)
        key = "{}/{}".format(mode,batch_name)
        
        try:
            # Write data to h5py file
            with h5py.File(filename,"w") as f:
                f.create_dataset('image',data=X)
                f.create_dataset('labels',data=y)

            # Upload file to bucket
            bucket.upload_file(Filename=filename,Key=key)
        finally:
            # Delete the local copy even if the write or the upload failed,
            # so a failed run does not leave files behind in the temp directory
            if os.path.exists(filename):
                os.remove(filename)
        



Cleaning train_scan data...
Retrieving samples..
Samples retrieved
Completing batch 1 of 5
Completing batch 2 of 5
Completing batch 3 of 5
Completing batch 4 of 5
Completing batch 5 of 5
Cleaning val_scan data...
Retrieving samples..
Samples retrieved
Completing batch 1 of 5
Completing batch 2 of 5
Completing batch 3 of 5
Completing batch 4 of 5
Completing batch 5 of 5
