In [1]:
import os
import time
import tensorflow.compat.v1 as tf
import pickle

# Eager mode: ops run immediately, so no tf.Session has to be created.
# The frame-parsing loop in this cell iterates the dataset directly and calls .numpy() on each record.
tf.enable_eager_execution()

from waymo_open_dataset import dataset_pb2 as open_dataset
from google.cloud import storage

import concurrent.futures as concurr

# CONFIG
# Project handed to storage.Client when the client is created.
project = "Waymo3DObjectDetection"
# Bucket whose blobs are listed and downloaded.
bucket_name = 'waymo_open_dataset_v_1_2_0_individual_files'
# Extension appended to every locally saved blob.
suffix = '.tfrecord'
# Local download root. The trailing slash matters: file names are appended to it directly.
data_destination = f"{os.getcwd()}/data/"
# Not referenced by any of the cells shown here.
download_batch_size = 1

def download_blob(blob, c):
    """Download one bucket blob into the local data directory.

    Args:
        blob: object exposing ``download_to_filename(path)``; the caller
            passes the items returned by ``storage_client.list_blobs``.
        c: file counter, used to build the local file name.

    Returns:
        Path of the downloaded file, ``{data_destination}blob_{c}{suffix}``.
    """
    # Create the target directory up front so a fresh checkout does not fail
    # on the very first download. exist_ok keeps this safe when several
    # worker threads reach this line at the same time.
    os.makedirs(data_destination, exist_ok=True)
    fname = f"{data_destination}blob_{c}{suffix}"
    blob.download_to_filename(fname)
    return fname

def _strip_frame(frame, idx, blob_idx):
    """Strip frame from garbage such as LIDAR data"""
    
    cam_dict = {}
    for i, camera in enumerate(["FRONT", "FRONT_LEFT", "SIDE_LEFT", "FRONT_RIGHT", "SIDE_RIGHT"]):
        cam_dict[camera] = {}
#         cam_dict[camera]['image'] = torch.tensor((tf.image.decode_jpeg(frame.images[i].image)).numpy())
#         cam_dict[camera]['image'] = tf.image.decode_jpeg(frame.images[i].image)
        cam_dict[camera]['image'] = frame.images[i].image
        cam_dict[camera]['velocity'] = frame.images[i].velocity
        cam_dict[camera]['labels'] = frame.camera_labels[i]
        
    cam_dict['context']={'stats':frame.context.stats, 
                       'name': frame.context.name, 
                       'blob_idx':blob_idx,
                       'time_frame_idx':idx}
    return cam_dict

def save_frames(frames, blob_idx):
    """Pickle the stripped frames of one blob, to be preprocessed later.

    Args:
        frames: object to pickle; the caller passes the list of stripped frames.
        blob_idx: index of the blob, used to build the output file name.

    Returns:
        None. The data is written to ``{data_destination}pickled/blob_{blob_idx}.pickle``.
    """
    pickle_dir = f'{data_destination}pickled/'
    # Without this, a missing output directory only surfaces after the blob
    # has already been downloaded and parsed. exist_ok tolerates worker
    # threads racing to create it.
    os.makedirs(pickle_dir, exist_ok=True)
    with open(f'{pickle_dir}blob_{blob_idx}.pickle', 'wb') as f:
        # Pickle with the highest protocol available.
        pickle.dump(frames, f, pickle.HIGHEST_PROTOCOL)
    return None

def load_frame(frame_idx, blob_idx):
    """Load one stripped frame from the pickle written by ``save_frames``.

    Args:
        frame_idx: position of the wanted frame inside the pickled sequence.
        blob_idx: index of the blob whose pickle should be read.

    Returns:
        The element at ``frame_idx`` of the unpickled object.

    Raises:
        FileNotFoundError: if the pickle for ``blob_idx`` does not exist.
        IndexError: if ``frame_idx`` is out of range for that blob.
    """
    with open(f'{data_destination}pickled/blob_{blob_idx}.pickle', 'rb') as f:
        # pickle.load detects the protocol from the file itself. It accepts
        # only one positional argument, so passing a protocol raised TypeError.
        # NOTE: unpickling can execute arbitrary code; only read files that
        # this notebook wrote itself.
        frames = pickle.load(f)
    # frame_idx used to be ignored; return the single frame the name promises.
    return frames[frame_idx]


# Retrieve frames from selected files to download
def get_and_strip_frames_from_one_blob(downloaded_blob, blob_idx):
    """Parse every record of a downloaded tfrecord and return the stripped frames.

    Args:
        downloaded_blob: path of the local tfrecord file.
        blob_idx: index of the blob, recorded in every stripped frame.

    Returns:
        List with one stripped frame (see ``_strip_frame``) per record, in file order.
    """
    records = tf.data.TFRecordDataset(downloaded_blob, compression_type='')

    def _parse_and_strip(position, record):
        # Deserialize the raw record, then drop LIDAR and other unused fields.
        parsed = open_dataset.Frame()
        parsed.ParseFromString(bytearray(record.numpy()))
        return _strip_frame(parsed, position, blob_idx)

    return [_parse_and_strip(position, record) for position, record in enumerate(records)]
# Now we just need to do the following in the same loop (multi-threaded):
# 1 - download a blob
# 2 - process the frames and save that blob
# 3 - discard the blob and move to the next blob (memory efficient)

def download_process_save_1_blob(blob, blob_idx):
    """Download one blob, pickle its stripped frames, then delete the tfrecord.

    Args:
        blob: bucket blob to download (passed straight to ``download_blob``).
        blob_idx: index of the blob; names the local tfrecord and the pickle.

    Returns:
        The string ``blob_{blob_idx}``, identifying the processed blob.
    """

    print(f"Downloading blob_{blob_idx}")
    blob_fname = download_blob(blob, blob_idx)
    print(f'Blob_{blob_idx} downloaded')

    # The f-prefix was missing here, so the log printed a literal "{blob_idx}".
    print(f"Getting and stripping all frames from blob_{blob_idx}")
    frames = get_and_strip_frames_from_one_blob(blob_fname, blob_idx)
    print("Frames processed")

    print(f"Saving frames to pickled/blob_{blob_idx}.pickle")
    save_frames(frames, blob_idx)

    print(f'No longer need tfrecord blob_{blob_idx}. Deleting now.')
    # Delete exactly the file download_blob reported, rather than rebuilding a
    # relative path that only matches while the working directory is unchanged.
    os.remove(blob_fname)

    return f"blob_{blob_idx}"
          

In [2]:
# Initialise a client for the configured project (no explicit credentials are passed)
storage_client = storage.Client(project=project)
# Create a bucket object for our bucket
bucket = storage_client.get_bucket(bucket_name)
# Materialise the listing of everything under training/ so it can be counted and indexed
blobs = list(storage_client.list_blobs(bucket_name, prefix='training/'))

# Number of blobs in the training dataset
n_blobs = len(blobs)
print(f'Total number of blobs is {n_blobs}')




Total number of blobs is 798


In [None]:
# Reference point for the progress reports and the final total.
start = time.time()
downloaded_blobs = []

# (blob, index) pairs; the index names the blob's local files.
thread_iterable = zip(blobs, range(len(blobs)))

with concurr.ThreadPoolExecutor(max_workers=2) as executor:
    # map() hands results back in submission order, so a blob that finishes
    # early is only reported once every earlier blob has finished too.
    results = executor.map(
        lambda pair: download_process_save_1_blob(pair[0], pair[1]),
        thread_iterable,
    )
    for finished_blob in results:
        print(f'\n Time elapsed {time.time() - start}')
        downloaded_blobs.append(finished_blob)

end = time.time()
print(f'Total time taken {end - start}')


Downloading blob_0
Downloading blob_1
Blob_1 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_1.pickle
No longer need tfrecord blob_1. Deleting now.
Downloading blob_2
Blob_0 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_0.pickle
No longer need tfrecord blob_0. Deleting now.
Downloading blob_3

 Time elapsed 21.635506629943848

 Time elapsed 21.636046171188354
Blob_2 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_2.pickle
No longer need tfrecord blob_2. Deleting now.
Downloading blob_4
 Time elapsed 34.9662230014801

Blob_3 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_3.pickle
No longer need tfrecord blob_3. Deleting now.
Downloading blob_5
 Time elapsed 42.89931106567383

Blob_4 downloaded
Getting and stripping all frames from b

No longer need tfrecord blob_35. Deleting now.
Downloading blob_37
 Time elapsed 353.2066431045532

Blob_37 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_37.pickle
No longer need tfrecord blob_37. Deleting now.
Downloading blob_38
Blob_36 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_36.pickle
No longer need tfrecord blob_36. Deleting now.
Downloading blob_39

 Time elapsed 371.8528027534485

 Time elapsed 371.85305070877075
Blob_38 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_38.pickle
No longer need tfrecord blob_38. Deleting now.
Downloading blob_40
 Time elapsed 386.27673602104187

Blob_39 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_39.pickle
No longer need tfrecord blob_39. Deleting now.
Downloading blob_41
 Time elaps

Frames processed
Saving frames to pickled/blob_71.pickle
Blob_72 downloaded
Getting and stripping all frames from blob_{blob_idx}
No longer need tfrecord blob_71. Deleting now.
Downloading blob_73
 Time elapsed 701.1238179206848

Frames processed
Saving frames to pickled/blob_72.pickle
No longer need tfrecord blob_72. Deleting now.
Downloading blob_74
 Time elapsed 702.5690317153931

Blob_73 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_73.pickle
No longer need tfrecord blob_73. Deleting now.
Blob_74 downloaded
Getting and stripping all frames from blob_{blob_idx}
Downloading blob_75
 Time elapsed 722.7521514892578

Frames processed
Saving frames to pickled/blob_74.pickle
No longer need tfrecord blob_74. Deleting now.
Downloading blob_76

 Time elapsed 725.1443548202515
Blob_75 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_75.pickle
No longer need tfrecord

Blob_106 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_106.pickle
No longer need tfrecord blob_106. Deleting now.
Downloading blob_109

 Time elapsed 1035.932873249054

 Time elapsed 1035.9331591129303
Blob_108 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_108.pickle
No longer need tfrecord blob_108. Deleting now.
Downloading blob_110

 Time elapsed 1049.6813387870789
Blob_109 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_109.pickle
No longer need tfrecord blob_109. Deleting now.
Downloading blob_111

 Time elapsed 1055.6772420406342
Blob_110 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_110.pickle
No longer need tfrecord blob_110. Deleting now.
Downloading blob_112

 Time elapsed 1068.7522649765015
Blob_111 downloaded
Getting

Downloading blob_143
 Time elapsed 1388.0317614078522

Blob_142 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_142.pickle
No longer need tfrecord blob_142. Deleting now.
Downloading blob_144

 Time elapsed 1398.103273153305
Blob_143 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_143.pickle
No longer need tfrecord blob_143. Deleting now.
Downloading blob_145
 Time elapsed 1409.0107951164246

Blob_144 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_144.pickle
No longer need tfrecord blob_144. Deleting now.
Downloading blob_146

 Time elapsed 1420.8386478424072
Blob_145 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_145.pickle
No longer need tfrecord blob_145. Deleting now.
Downloading blob_147

 Time elapsed 1424.007227897644
Blob_14

Blob_178 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_178.pickle
No longer need tfrecord blob_178. Deleting now.
Downloading blob_179
Blob_177 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_177.pickle
No longer need tfrecord blob_177. Deleting now.
Downloading blob_180
 Time elapsed 1790.524697303772

 Time elapsed 1790.5248188972473

Blob_179 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_179.pickle
No longer need tfrecord blob_179. Deleting now.
Downloading blob_181
 Time elapsed 1801.5469722747803

Blob_180 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_180.pickle
No longer need tfrecord blob_180. Deleting now.
Downloading blob_182
 Time elapsed 1814.1920311450958

Blob_181 downloaded
Getting and stripping all frames from blo

Blob_212 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_212.pickle
No longer need tfrecord blob_212. Deleting now.
Downloading blob_214
 Time elapsed 2139.977907896042

Blob_214 downloaded
Getting and stripping all frames from blob_{blob_idx}
Blob_213 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_214.pickle
Frames processed
Saving frames to pickled/blob_213.pickle
No longer need tfrecord blob_214. Deleting now.
No longer need tfrecord blob_213. Deleting now.
Downloading blob_215Downloading blob_216


 Time elapsed 2159.3850910663605

 Time elapsed 2159.3851823806763
Blob_216 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Saving frames to pickled/blob_216.pickle
No longer need tfrecord blob_216. Deleting now.
Downloading blob_217
Blob_215 downloaded
Getting and stripping all frames from blob_{blob_idx}
Frames processed
Savi