In [4]:
import os
import time
import tensorflow.compat.v1 as tf
import pickle

tf.enable_eager_execution() # No need for session to be created. Function instances are run immediately. 

from waymo_open_dataset import dataset_pb2 as open_dataset
from google.cloud import storage

import concurrent.futures as concurr

# CONFIG
# Project passed to the storage client and the bucket whose blobs are listed.
project = "Waymo3DObjectDetection"
bucket_name = 'waymo_open_dataset_v_1_2_0_individual_files'
# Extension appended to every locally downloaded blob file.
suffix = '.tfrecord'
# Local output root (note the trailing slash: paths are built by concatenation).
data_destination = f"{os.getcwd()}/data/"
# Not referenced by any of the visible cells.
download_batch_size = 1

def download_blob(blob, c):
    """
    Download one bucket blob into the local data directory.

    blob = blob object exposing ``download_to_filename`` (not a file name)
    c = file counter, used to build the local file name ``blob_<c><suffix>``

    Returns the path of the downloaded file.
    """
    # Create the target directory on first use so a fresh checkout works;
    # exist_ok keeps this safe when several worker threads get here at once.
    os.makedirs(data_destination, exist_ok=True)
    fname = f"{data_destination}blob_{c}{suffix}"
    blob.download_to_filename(fname)
    return fname

def strip_frame(frame, idx, blob_idx):
    """Keep only the per-camera data of a frame (drops LIDAR and everything else).

    frame = parsed frame exposing ``images``, ``camera_labels`` and ``context``
    idx = position of the frame inside its blob (stored as 'time_frame_idx')
    blob_idx = index of the blob the frame came from (stored as 'blob_idx')

    Returns a dict keyed by camera name; each value holds 'image', 'velocity',
    'labels' and 'context'. 'labels' is None when the frame carries no label
    record for that camera.

    Raises LookupError when the frame has no image for one of the cameras.
    """
    # Images and labels are matched on their own ``name`` id instead of list
    # position: nothing guarantees that ``frame.images`` and
    # ``frame.camera_labels`` are stored in the same order.
    # Ids follow the CameraName.Name enum of dataset.proto -- verify if the
    # schema changes.
    camera_ids = {
        "FRONT": 1,
        "FRONT_LEFT": 2,
        "SIDE_LEFT": 4,
        "FRONT_RIGHT": 3,
        "SIDE_RIGHT": 5,
    }
    images_by_id = {camera_image.name: camera_image for camera_image in frame.images}
    labels_by_id = {camera_labels.name: camera_labels for camera_labels in frame.camera_labels}

    cam_dict = {}
    for camera, camera_id in camera_ids.items():
        if camera_id not in images_by_id:
            raise LookupError(
                f"frame {idx} of blob {blob_idx} has no image for camera {camera}"
            )
        camera_image = images_by_id[camera_id]
        cam_dict[camera] = {
            'image': camera_image.image,
            'velocity': camera_image.velocity,
            'labels': labels_by_id.get(camera_id),
            # A separate context dict per camera: each camera is pickled on its own.
            'context': {'stats': frame.context.stats,
                        'name': frame.context.name,
                        'blob_idx': blob_idx,
                        'time_frame_idx': idx},
        }
    return cam_dict

def save_frames(frames, blob_idx, dataset='training'):
    """Pickle every camera of every stripped frame, to be preprocessed later.

    frames = iterable of dicts mapping camera name -> camera data
    blob_idx = index of the source blob, used in the file name
    dataset = sub-directory of the data directory to write under

    Each camera dict is written to
    ``<data_destination><dataset>/<camera>/blob_<blob_idx>_frame_<frame_idx>.pickle``.
    Returns None.
    """
    for frame_idx, frame in enumerate(frames):
        for camera, camera_dict in frame.items():
            camera_dir = f'{data_destination}{dataset}/{camera}'
            # Make sure the per-camera directory exists; exist_ok tolerates
            # concurrent creation by other worker threads.
            os.makedirs(camera_dir, exist_ok=True)
            with open(f'{camera_dir}/blob_{blob_idx}_frame_{frame_idx}.pickle', 'wb') as f:
                # Pickle the camera dictionary using the highest protocol available.
                pickle.dump(camera_dict, f, pickle.HIGHEST_PROTOCOL)
    return None

def load_frame(frame_idx, blob_idx, dataset='training', camera='FRONT'):
    """Load one camera dictionary previously written by ``save_frames``.

    frame_idx = index of the frame inside its blob
    blob_idx = index of the source blob
    dataset = sub-directory of the data directory that was written to
    camera = camera name sub-directory to read from

    Returns the unpickled camera dictionary.
    """
    # Same layout that save_frames writes: <dataset>/<camera>/blob_<b>_frame_<f>.pickle
    path = f'{data_destination}{dataset}/{camera}/blob_{blob_idx}_frame_{frame_idx}.pickle'
    with open(path, 'rb') as f:
        # pickle.load takes no protocol argument (the protocol is detected from
        # the file). NOTE: unpickling can execute arbitrary code, so only load
        # files produced by this notebook.
        return pickle.load(f)


# Retrieve frames from selected files to download
def get_and_strip_frames_from_one_blob(downloaded_blob, blob_idx):
    """Parse every record of a downloaded tfrecord file and strip each frame.

    downloaded_blob = path of the local tfrecord file
    blob_idx = index of the blob, forwarded to ``strip_frame``

    Returns the list of stripped frames, in record order.
    """
    def _parse_and_strip(position, record):
        # Decode the raw record into a Frame message, then drop LIDAR and
        # the other unused fields.
        parsed = open_dataset.Frame()
        parsed.ParseFromString(bytearray(record.numpy()))
        return strip_frame(parsed, position, blob_idx)

    # Read the file as an uncompressed TFRecord dataset
    records = tf.data.TFRecordDataset(downloaded_blob, compression_type='')
    return [_parse_and_strip(position, record) for position, record in enumerate(records)]

def download_process_save_1_blob(blob, blob_idx, dataset='training'):
    """Download one blob, strip its frames, pickle them and delete the tfrecord.

    blob = blob object to download
    blob_idx = index of the blob, used in file names and log messages
    dataset = sub-directory of the data directory the pickles are written under

    Returns the string ``blob_<blob_idx>``.
    """

    print(f"Downloading blob_{blob_idx}")
    blob_fname = download_blob(blob, blob_idx)

    try:
        print(f"Getting and stripping all frames from blob_{blob_idx}")
        frames = get_and_strip_frames_from_one_blob(blob_fname, blob_idx)

        print(f"Saving frames for blob {blob_idx}")
        save_frames(frames, blob_idx, dataset)
    finally:
        # Remove exactly the file that was downloaded (not a re-derived relative
        # path), and do so even when processing failed, so that large tfrecord
        # files do not pile up on disk.
        print(f'No longer need tfrecord blob_{blob_idx}. Deleting now.')
        os.remove(blob_fname)

    return f"blob_{blob_idx}"
          

In [5]:
# Storage client bound to the configured project
storage_client = storage.Client(project=project)
# Bucket handle for the configured bucket name
bucket = storage_client.get_bucket(bucket_name)
# Materialise every blob listed under the training/ prefix
blobs = list(storage_client.list_blobs(bucket_name, prefix='training/'))

# Report how many training blobs were listed
n_blobs = len(blobs)
print(f'Total number of blobs is {n_blobs}')




Total number of blobs is 798


In [None]:
# TRAINING
# Each worker thread downloads, strips and pickles one blob at a time.
start = time.time()
downloaded_blobs = []

# Lazily yields one (blob, blob_idx, dataset) argument tuple per training blob
thread_iterable = (
    (one_blob, position, 'training')
    for position, one_blob in enumerate(blobs)
)

with concurr.ThreadPoolExecutor(max_workers=4) as executor:
    # map() hands results back in submission order, not completion order
    results = executor.map(
        lambda job: download_process_save_1_blob(*job),
        thread_iterable,
    )
    for r in results:
        print(f'\n Time elapsed {time.time() - start}')
        downloaded_blobs.append(r)

end = time.time()
print(f'Total time taken {end - start}')


Downloading blob_0
Downloading blob_1
Downloading blob_2
Downloading blob_3
Getting and stripping all frames from blob_1
Saving frames for blob 1
Getting and stripping all frames from blob_2
No longer need tfrecord blob_1. Deleting now.
Downloading blob_4
Saving frames for blob 2
No longer need tfrecord blob_2. Deleting now.
Downloading blob_5
Getting and stripping all frames from blob_3
Saving frames for blob 3
Getting and stripping all frames from blob_0
Saving frames for blob 0
No longer need tfrecord blob_3. Deleting now.
Downloading blob_6
No longer need tfrecord blob_0. Deleting now.
Downloading blob_7
 Time elapsed 84.47446870803833

 Time elapsed 84.47463488578796

 Time elapsed 84.47467398643494


 Time elapsed 84.47550892829895
Getting and stripping all frames from blob_4
Saving frames for blob 4
No longer need tfrecord blob_4. Deleting now.
Downloading blob_8

 Time elapsed 137.66376519203186
Getting and stripping all frames from blob_5
Getting and stripping all frames from 

No longer need tfrecord blob_47. Deleting now.
Downloading blob_51
 Time elapsed 888.2704586982727

Getting and stripping all frames from blob_48
Saving frames for blob 48
No longer need tfrecord blob_48. Deleting now.
Downloading blob_52

 Time elapsed 900.383006811142
Getting and stripping all frames from blob_49
Saving frames for blob 49
No longer need tfrecord blob_49. Deleting now.
Downloading blob_53
 Time elapsed 922.9177386760712

Getting and stripping all frames from blob_50
Saving frames for blob 50
No longer need tfrecord blob_50. Deleting now.
Downloading blob_54
 Time elapsed 952.661746263504

Getting and stripping all frames from blob_52
Saving frames for blob 52
No longer need tfrecord blob_52. Deleting now.
Downloading blob_55
Getting and stripping all frames from blob_51
Saving frames for blob 51
No longer need tfrecord blob_51. Deleting now.
Downloading blob_56
 Time elapsed 975.0984694957733

 Time elapsed 975.0986273288727

Getting and stripping all frames from blob

Getting and stripping all frames from blob_96
Saving frames for blob 96
No longer need tfrecord blob_96. Deleting now.
Downloading blob_99
Getting and stripping all frames from blob_95
Saving frames for blob 95
No longer need tfrecord blob_95. Deleting now.
Downloading blob_100

 Time elapsed 1775.2033114433289

 Time elapsed 1775.2034094333649
Getting and stripping all frames from blob_98
Saving frames for blob 98
Getting and stripping all frames from blob_97
No longer need tfrecord blob_98. Deleting now.
Downloading blob_101
Saving frames for blob 97
No longer need tfrecord blob_97. Deleting now.
Downloading blob_102

 Time elapsed 1785.7808351516724

 Time elapsed 1785.7839736938477
Getting and stripping all frames from blob_99
Saving frames for blob 99
No longer need tfrecord blob_99. Deleting now.
Downloading blob_103

 Time elapsed 1817.3353748321533
Getting and stripping all frames from blob_101
Saving frames for blob 101
No longer need tfrecord blob_101. Deleting now.
Downloadi

No longer need tfrecord blob_143. Deleting now.
Downloading blob_145
Getting and stripping all frames from blob_145
Saving frames for blob 145
No longer need tfrecord blob_145. Deleting now.
Downloading blob_146
Getting and stripping all frames from blob_144
Saving frames for blob 144
No longer need tfrecord blob_144. Deleting now.
Downloading blob_147
Getting and stripping all frames from blob_146
Saving frames for blob 146
No longer need tfrecord blob_146. Deleting now.
Downloading blob_148
Getting and stripping all frames from blob_147
Saving frames for blob 147
No longer need tfrecord blob_147. Deleting now.
Downloading blob_149
Getting and stripping all frames from blob_148
Saving frames for blob 139
Saving frames for blob 148
No longer need tfrecord blob_139. Deleting now.
Downloading blob_150
 Time elapsed 2597.5842027664185

No longer need tfrecord blob_148. Deleting now.
Downloading blob_151
Getting and stripping all frames from blob_149
Saving frames for blob 149
No longer ne

Downloading blob_191
 Time elapsed 3316.8773443698883

Saving frames for blob 189
Saving frames for blob 188
No longer need tfrecord blob_189. Deleting now.
No longer need tfrecord blob_188. Deleting now.
Downloading blob_192
Downloading blob_193

 Time elapsed 3318.461582660675

 Time elapsed 3318.464714050293
Getting and stripping all frames from blob_190
Saving frames for blob 190
No longer need tfrecord blob_190. Deleting now.
Downloading blob_194
 Time elapsed 3325.869081020355

Getting and stripping all frames from blob_191
Saving frames for blob 191
No longer need tfrecord blob_191. Deleting now.
Downloading blob_195
 Time elapsed 3386.135044336319

Getting and stripping all frames from blob_192
Saving frames for blob 192
Getting and stripping all frames from blob_193
Getting and stripping all frames from blob_194
No longer need tfrecord blob_192. Deleting now.
Downloading blob_196

 Time elapsed 3390.7755250930786
Saving frames for blob 193
Saving frames for blob 194
No longer 

No longer need tfrecord blob_236. Deleting now.
Downloading blob_238
Getting and stripping all frames from blob_237
Saving frames for blob 237
Getting and stripping all frames from blob_238
Saving frames for blob 238
No longer need tfrecord blob_237. Deleting now.
Downloading blob_239
No longer need tfrecord blob_238. Deleting now.
Downloading blob_240
Getting and stripping all frames from blob_240
Getting and stripping all frames from blob_239
Saving frames for blob 240
Saving frames for blob 239
No longer need tfrecord blob_240. Deleting now.
Downloading blob_241
No longer need tfrecord blob_239. Deleting now.
Downloading blob_242
Getting and stripping all frames from blob_242
Saving frames for blob 242
Getting and stripping all frames from blob_241
Saving frames for blob 241
No longer need tfrecord blob_242. Deleting now.
Downloading blob_243
No longer need tfrecord blob_241. Deleting now.
Downloading blob_244
Saving frames for blob 233
No longer need tfrecord blob_233. Deleting now

In [None]:
# VALIDATION
# Point the blob list at the validation/ prefix of the same bucket
blobs = list(storage_client.list_blobs(bucket_name, prefix='validation/'))

start = time.time()
downloaded_blobs = []

# Lazily yields one (blob, blob_idx, dataset) argument tuple per validation blob
thread_iterable = (
    (one_blob, position, 'validation')
    for position, one_blob in enumerate(blobs)
)

with concurr.ThreadPoolExecutor(max_workers=2) as executor:
    # map() hands results back in submission order, not completion order
    results = executor.map(
        lambda job: download_process_save_1_blob(*job),
        thread_iterable,
    )
    for r in results:
        print(f'\n Time elapsed {time.time() - start}')
        downloaded_blobs.append(r)

end = time.time()
print(f'Total time taken {end - start}')
