# Satellite Object Detection - Inference

* Jupyter notebook to run inference on a [ModelZoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) trained model
* Runs in Deep Learning VM on Google Compute Engine using [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) - compatible with 13 July 2018 updates ([release](https://github.com/tensorflow/models/tree/master/research/object_detection#july-13-2018))
* Runs in a Python 2 notebook

### Requirements:
* Dataset in zip archive on Google Cloud Storage:
```
[dataset archive name].zip
   |- rec [optional - will generate any files not present]
      |- label_map.pbtxt
      |- classes.txt
   |- img_test
      |- [image_class]_0.jpg
      |- [image_class]_1.jpg
      |- ...
   |- xml_test
      |- [image_class]_0.xml
      |- [image_class]_1.xml
      |- ...
```

* Tensorflow Object Detection API in zip archive on Google Cloud Storage:
```
[tf archive name].zip
   |- object_detection
      |- [Tensorflow model API files/folders]
   |- slim
      |- [Tensorflow model API files/folders]
```

### To make XML annotations from images:
* Use [LabelImg](https://github.com/tzutalin/labelImg) for images with unknown boxes
* Use `xml_make.py` for images synthesized in Unity dataset generator

## Set Up

In [17]:
## This block deletes all files and folders, and starts from scratch
## (akin to restarting the runtime)

# -------- Parameters --------

# Folder names below are joined onto WORKING_DIR further down in this cell
data_dir_name = 'data'  # Name of folder where images/annotations/models should be stored
code_dir_name = 'src/od_api'  # Name of folder where Tensorflow Object Detection API should be stored
model_dir_name = 'mdl'  # Name of folder where Tensorflow models should be stored
util_dir_name = 'util'  # Name of folder where utility scripts should be stored

# If True, clean_slate is run on the working directory and its 'src' folder before directories are (re)created
delete_all_uploaded_files = False  # Start anew


# =========================================


import sys
import os, shutil
import re

def clean_slate(directory, keep_these, keep_these_exts=('.ipynb',)):
    """ Deletes the contents of a given directory, ignoring config files and such.

    An entry is left alone when its name starts with '.', when its lower-cased name
    is listed in keep_these, or when its lower-cased extension is listed in
    keep_these_exts. Every other entry is removed: directories recursively, plain
    files (and symlinks) individually. The caller's lists are not modified and the
    current working directory is not changed.

    Args:
        directory: Directory in which to delete subdirectories/files.
        keep_these: Subdirectories/files to not delete (compared case-insensitively).
        keep_these_exts: File extensions to not delete (case-insensitive; leading '.' optional).
    """

    try:
        entries = os.listdir(directory)
    except OSError as e:
        print('Cannot access desired directory {}. Caught error: {}'.format(directory, str(e)))
        return

    # Normalize the keep-lists into fresh sets so the arguments are never mutated
    kept_names = set(name.lower() for name in keep_these)
    kept_exts = set()
    for ext in keep_these_exts:
        ext = ext.lower()
        kept_exts.add(ext if ext.startswith('.') else '.' + ext)

    print('Cleaning up ' + directory + '...')

    # Delete everything that is not protected
    # (i.e., don't delete anything that starts with '.', such as '.config')
    for entry in entries:
        if entry.startswith('.') or entry.lower() in kept_names:
            continue
        if os.path.splitext(entry)[1].lower() in kept_exts:
            continue

        target = os.path.join(directory, entry)
        try:
            # rmtree only works on real directories; files and symlinks need os.remove
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            else:
                os.remove(target)
        except OSError as e:
            print('    Failed to remove {}. Caught error: {}'.format(target, str(e)))
        else:
            # Only report success when the removal actually succeeded
            print('    Removed ' + target)
    print('...Done')

# -----

# Directory names
# The folder names given in the parameters may use '/' or '\\' as separator; normpath rewrites both to the
# separator of the current OS. In future blocks (now that os module is loaded), should only use os.path.join,
# not manual separator (e.g., 'this/is/a/path')
def normpath(path):
    """ Returns path with every '/' and '\\' replaced by the native OS path separator. """
    return path.replace('/', os.path.sep).replace('\\', os.path.sep)

WORKING_DIR = normpath('/home/jupyter/')  # Base folder in Jupyter instance (not root) - should already exist

# Everything else lives directly under the working directory
DATA_DIR = os.path.join(WORKING_DIR, normpath(data_dir_name))  # Images, annotations, and models
CODE_DIR = os.path.join(WORKING_DIR, normpath(code_dir_name))  # Tensorflow Object Detection API code
MODEL_DIR = os.path.join(WORKING_DIR, normpath(model_dir_name))  # Tensorflow models
UTIL_DIR = os.path.join(WORKING_DIR, normpath(util_dir_name))  # Utility scripts


# =========================================


os.chdir(WORKING_DIR)  # If this doesn't exist, then figure out what the actual working directory is and change the variable above

# Delete old directories
if delete_all_uploaded_files:
    # Best-effort renames into locations that the clean_slate calls below are told to keep.
    # A rename that cannot be done raises OSError and is skipped; anything else is a real error and propagates.
    try:
        os.rename(os.path.join(WORKING_DIR, 'src', 'models'), os.path.join(WORKING_DIR, 'src', 'tf_api'))
    except OSError:
        pass
    try:
        os.rename(os.path.join(WORKING_DIR, 'tutorials'), os.path.join(WORKING_DIR, 'test', 'gce_ai_demo'))
    except OSError:
        pass

    clean_slate(WORKING_DIR, keep_these=['test', 'src'], keep_these_exts=['.ipynb', '.py'])
    clean_slate(os.path.join(WORKING_DIR, 'src'), keep_these=['tf_api', 'tensorflow', 'tpu'], keep_these_exts=['.ipynb', '.py'])

# Make new directories
print('Creating new directories...')
for directory in [DATA_DIR, CODE_DIR, MODEL_DIR, UTIL_DIR]:
    try:
        os.makedirs(directory)
        print('    Made directory ' + directory)
    except OSError:
        pass  # Directory probably already exists
print('...Done creating directories')

# Add folders to system path
print('\nUpdating system paths...')
if "../" not in sys.path:
    sys.path.append("../")
if ".." not in sys.path:
    sys.path.append("..")
for directory in [WORKING_DIR, CODE_DIR]:
    if directory not in sys.path:
        sys.path.append(directory)
        print('    Added ' + directory + ' to system path')
print('...Done updating paths')

Creating new directories...
...Done creating directories

Updating system paths...
...Done updating paths


In [18]:
# Name of the Google Cloud Storage bucket that later cells pass to get_file_from_gcloud
GCLOUD_BUCKET = 'csys-ssa1b'


# =========================================


import sys
import os, shutil
import re


# Methods for interfacing with Google Cloud Storage

def get_file_from_gcloud(bucket_name, path_to_file, dir_to_save_in = None, archive_contents='archive'):
    """ Transfers a file from a Google Cloud Storage bucket to the Jupyter instance.
    Args:
        bucket_name: Cloud Storage bucket to get file from.
        path_to_file: Path to file in Cloud Storage bucket.        
        dir_to_save_in: Directory within Jupyter instance in which to save file.
        archive_contents: Name of archive (only used for console output).
    """
    
    if dir_to_save_in is None or dir_to_save_in == '':
        dir_to_save_in = os.getcwd()
    else:
        try: os.makedirs(dir_to_save_in)
        except OSError: pass
    
    print('Downloading {} (in file {} from Google Cloud Storage bucket {})...'.format(archive_contents, path_to_file, bucket_name))
    print('=======================')
    !gsutil cp -n gs://{bucket_name}/{path_to_file} {dir_to_save_in}
    print('=======================')
    print('...Done trying - check above lines for success\n')

def send_file_to_gcloud(path_to_file, bucket_name, dir_to_save_in = None):
    """ Transfers a file from the Jupyter instance to a Google Cloud Storage bucket.
    Note:
        Doesn't currently work (permission error) - run the shell command from an SSH window instead,
        after authenticating account with command  gcloud auth login [account_email].
    Args:
        path_to_file: Path to file in Jupyter instance.
        bucket_name: Cloud Storage bucket to save in.
        dir_to_save_in: Directory within Cloud Storage bucket in which to save file.
    """
    
    print('Uploading file {} to Google Cloud Storage bucket {}...'.format(path_to_file, bucketname))
    print('=======================')
    
    if dir_to_save_in is None:
        !gsutil cp {path_to_file} gs://{bucket_name}
    else:
        !gsutil cp {path_to_file} gs://{bucket_name}/{dir_to_save_in}
    
    print('=======================')
    print('...Done trying - check above lines for success\n')
    
    
# Utility methods

def split_top_path_level(path):
    """ Splits a path into everything before its rightmost entry, and that rightmost entry itself.

    Both '/' and '\\' are treated as separators, and trailing separators are ignored.

    Args:
        path: Path in which to split off the top-level object (rightmost folder or filename).
    Returns:
        Array: [0] = base path excluding top-level object (no trailing separator), [1] = top-level object
    """
    trimmed = path.rstrip('/\\')
    # Position just past the last separator of either kind (0 when there is none)
    cut = max(trimmed.rfind('/'), trimmed.rfind('\\')) + 1
    top_obj = trimmed[cut:]
    base_path = trimmed[:cut].rstrip('/\\')
    return [base_path, top_obj]

def extract_archive(path_to_archive, dir_to_extract_in = None, delete_after_extracting = True):
    """ Extracts a zip or tar archive into a folder in the current instance.

    The archive type is decided from the last two extensions of the filename
    (so both 'x.tar' and 'x.tar.gz' count as tar), compared case-insensitively.

    Note:
        extractall() writes whatever member paths the archive contains - only use this
        on archives from a trusted source.
    Args:
        path_to_archive: Path to archive stored in the current instance.
        dir_to_extract_in: Where to extract the archive (created if needed; None or '' means
            the current working directory).
        delete_after_extracting: Whether to remove the original archive file after a successful extraction.
    Raises:
        ValueError if the archive is not a .zip or .tar
    """

    # Make directory to extract to
    if dir_to_extract_in is None or dir_to_extract_in == '':
        dir_to_extract_in = os.getcwd()
    else:
        try:
            os.makedirs(dir_to_extract_in)
        except OSError:
            pass  # Directory probably already exists

    # Get archive extension (or extensions, accounting for double extensions like .tar.gz)
    stem, last_ext = os.path.splitext(path_to_archive)
    archive_file_extensions = [last_ext, os.path.splitext(stem)[1]]
    lowered_extensions = [ext.lower() for ext in archive_file_extensions]

    # Open archive
    if '.tar' in lowered_extensions:
        import tarfile
        arch = tarfile.open(path_to_archive)
        print('Extracting tar archive {}... '.format(os.path.basename(path_to_archive))),
    elif '.zip' in lowered_extensions:
        from zipfile import ZipFile
        arch = ZipFile(path_to_archive)
        print('Extracting zip archive {}... '.format(os.path.basename(path_to_archive))),
    else:
        raise ValueError('Archive extensions {} and {} not recognized in file {}'.format(archive_file_extensions[0], archive_file_extensions[1], path_to_archive))

    # Extract archive, always releasing the file handle even if extraction fails
    try:
        arch.extractall(dir_to_extract_in)
    finally:
        arch.close()
    print('Done! Extracted to ' + dir_to_extract_in)

    # Delete original archive file (only reached when extraction succeeded)
    if delete_after_extracting:
        os.remove(path_to_archive)
        
def download_archive_from_url(archive_file_url, dir_to_extract_in, archive_contents='archive'):
    """ Downloads and extracts a tar or zip archive from somewhere in the cloud.

    The archive is downloaded into /tmp (named after the last path component of the URL),
    extracted with extract_archive, and then deleted. The working directory is switched to
    /tmp for the duration and is always restored afterwards, even if the download or
    extraction fails.

    Args:
        archive_file_url: URL location of archive
        dir_to_extract_in: Where to extract archive (in local directory tree)
        archive_contents: Name of archive (only used for console output)
    """

    # Pick a download function that exists on the running interpreter
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        import urllib
        urlretrieve = urllib.URLopener().retrieve  # Python 2 (same opener as before)

    cwd = os.getcwd()
    os.chdir('/tmp')
    try:
        # Download archive
        print('Getting {}...'.format(archive_contents))
        print('    Downloading archive from {}... '.format(archive_file_url)),
        archive_filename = archive_file_url.rsplit('/', 1)[-1]
        urlretrieve(archive_file_url, archive_filename)
        print('Done')

        # Extract archive
        print('   '),
        extract_archive(archive_filename, dir_to_extract_in, delete_after_extracting = True)
    finally:
        # Never leave the notebook sitting in /tmp
        os.chdir(cwd)

    print('...Done getting {}'.format(archive_contents))
    
def archive_files_to_indexed_directory(dir_to_archive, append_str='old'):
    """ Renames a directory into a numbered slot of a sibling archive folder, then recreates it empty.

    For a directory <base>/<name>, the archive folder is <base>/<name>_old and the slot is
    <base>/<name>_old/<append_str>_<idx>, with idx the smallest integer >= 1 not yet taken.

    Args:
        dir_to_archive: Directory whose files to move to an archive
        append_str: Prefix of the numbered slot names inside the archive folder (default: 'old')
    Returns:
        archive_dir: Name of (and path to) archive directory
    """

    # Make top-level archive folder next to the directory being archived
    parent_path, folder_name = split_top_path_level(dir_to_archive)
    archive_path = os.path.join(parent_path, '{}_old'.format(folder_name))
    try:
        os.makedirs(archive_path)
    except OSError:
        pass  # Archive folder probably already exists

    # Step through '[append_str]_1', '[append_str]_2', ... until a slot name is free
    idx = 1
    archive_dir = os.path.join(archive_path, '{}_{}'.format(append_str, idx))
    while os.path.isdir(archive_dir):
        idx += 1
        archive_dir = os.path.join(archive_path, '{}_{}'.format(append_str, idx))

    # The rename carries all contents across; the original path is then remade (empty)
    os.rename(dir_to_archive, archive_dir)
    os.makedirs(dir_to_archive)

    return archive_dir  # Return archived directory name

def list_common_files(dirs, txt_output_path):
    """ Makes a list of all filenames present in all given directories (excluding extension), and outputs list to a .txt file

    Filenames are compared with their final extension removed (os.path.splitext), so names that
    contain extra dots are kept intact. Directories that cannot be listed are reported and left
    out of the comparison. Names are written one per line, in the listing order of the first
    readable directory, each at most once.

    Args:
        dirs: Directories whose common files should have their names stored in txt_output_path
        txt_output_path: Path (including filename) where .txt file should be stored
    Raises:
        OSError if file txt_output_path already exists, or if none of the directories can be listed
    Returns:
        files_included: List of all filenames present in all readable given directories
    """

    if os.path.exists(txt_output_path):
        raise OSError("File " + txt_output_path + " already exists - not creating anew")

    print('Getting list of files common to all given directories (saving to {})... '.format(txt_output_path))

    print('    Directories checked:')
    names_per_dir = []
    for directory in dirs:
        try:
            names = [os.path.splitext(fn)[0] for fn in os.listdir(directory)]  # Get filenames without extension
        except OSError:
            print('        Omitting directory ' + directory + ' - could not get file list. Was the full directory path given?')
            continue
        names_per_dir.append(names)
        print('        Including directory ' + directory)

    # Fail before creating the output file, so a later retry is not blocked by an empty list file
    if not names_per_dir:
        raise OSError('None of the given directories could be listed - not creating ' + txt_output_path)

    print('    Checking for common files... '),

    # Sets give constant-time membership tests; only readable directories take part
    other_dirs = [set(names) for names in names_per_dir[1:]]
    files_included = []
    files_excluded = []
    already_seen = set()
    for fn in names_per_dir[0]:
        if fn in already_seen:
            continue  # Same name with two different extensions - list it once
        already_seen.add(fn)
        if all(fn in names for names in other_dirs):
            files_included.append(fn)
        else:
            files_excluded.append(fn)

    with open(txt_output_path, 'w') as list_file:
        for fn in files_included:
            list_file.write(fn + '\n')

    if files_excluded:
        print(' ')
        for fn in files_excluded:
            print('        Warning: could not find file {} in all of the above directories - excluding from list'.format(fn))
        print('    ...Saved list of common files, but excluded files not present in every directory (see above)')
    else:
        print('Done')

    print('...Got and saved list of files')
    return files_included


#  ========================================


# Last statement of the cell, so it prints only once every definition above has executed without raising
print('Done loading utility methods')

Done loading utility methods


## Get Tensorflow Object Detection API 
from Google Cloud Storage

In [19]:
# OBJECT DETECTION API
# Zip archive (in bucket GCLOUD_BUCKET) that is downloaded to WORKING_DIR and extracted into CODE_DIR below,
# when no API-like folder is found in CODE_DIR and the user confirms the download
API_FN = 'code_tf-api-v1.14_training.zip'
        

# =========================================


import sys, os

# Collect anything in the code directory that looks like part of the API
# (a name containing 'object_detection' or 'slim')
api_like_entries = [name for name in os.listdir(CODE_DIR) if 'object_detection' in name or 'slim' in name]

if api_like_entries:
    # Something is already there - report the first hit and leave the directory untouched
    print('Found {} subdirectory in {} - Tensorflow Object Detection API might already be downloaded! '.format(api_like_entries[0], CODE_DIR)
              + '\n(If it needs to be replaced, remove current directory manually with shutil.rmtree.)')
else:
    # Nothing API-like found - offer to download and extract the archive
    check = raw_input('Download Object Detection API from Google Cloud Storage? (Will overwrite current API if present.) [y/n]')
    if check.lower() != 'y':
        print('Not downloading API code anew - make sure it is present in {}, or else training will not run.'.format(CODE_DIR))
    else:
        get_file_from_gcloud(GCLOUD_BUCKET, API_FN, WORKING_DIR)
        extract_archive(os.path.join(WORKING_DIR, API_FN), CODE_DIR)

# If cython is not installed, install it
try:
    if cython_installed:
        pass
except:
    print('\nInstalling Cython... '),
    !pip install -q Cython
    cython_installed = True
    print('Done')
    
# If pycocotools is not installed, install it
try:
    if pycocotools_installed:
        pass
except:
    # Find pycocotools setup file (somewhere in a subdirectory of 'src/object_detection_api/coco')
    for subdir in os.listdir(CODE_DIR):
        if 'coco' in subdir:
            for root, _, fns in os.walk(os.path.join(CODE_DIR, subdir)):
                if 'setup.py' in fns:
                    try:
                        print('Installing pycocotools...')
                        print('===============================')
                        cwd = os.getcwd()
                        os.chdir(os.path.normpath(root))
                        !python 'setup.py' install --user -q
                        pycocotools_installed = True
                    except:
                        raise
                    else:
                        print('===============================')
                        print('...Done installing pycocotools (succeeded if no errors between the "=" lines above)\n')
                        break
                    finally:
                        os.chdir(cwd)
            else:
                continue  # setup.py not found
            break  # setup.py found and run
    else:
        raise RuntimeError('Could not find pycocotools!')
    
# Add Object Detection API folders to system path
# Start from an empty PYTHONPATH when the variable is not set at all
os.environ.setdefault('PYTHONPATH', '')  # CODE_DIR + ':' + os.path.join(CODE_DIR, 'object_detection') + ':' + os.path.join(CODE_DIR, 'slim')

print(' ')
print('Updating system paths...')
os.chdir(CODE_DIR)  # So that the bare names from os.listdir can be tested with os.path.isdir
for directory in os.listdir(CODE_DIR):  # Names of 'object_detection' and 'slim' directories
    # Skip hidden entries and anything that is not a directory
    if directory.startswith('.') or not os.path.isdir(directory):
        continue

    path_to_dir = os.path.join(CODE_DIR, directory)  # Full path to directory

    # Current interpreter
    if path_to_dir not in sys.path:
        sys.path.append(path_to_dir)
        print('    Added ' + path_to_dir + ' to system path')

    # Environment variable
    if path_to_dir not in os.environ['PYTHONPATH']:
        os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':' + path_to_dir
        print('    Added ' + path_to_dir + ' to Python path')
os.chdir(WORKING_DIR)
print('...Done updating paths')

Found object_detection subdirectory in /home/jupyter/src/object_detection_api - Tensorflow Object Detection API might already be downloaded! 
(If it needs to be replaced, remove current directory manually with shutil.rmtree.)
 
Updating system paths...
...Done updating paths


In [None]:
# UTILITY METHODS
# Name of the util script archive in bucket GCLOUD_BUCKET; None or '' means nothing is downloaded
UTIL_FN = None


# =========================================


import sys, os
import shutil  # Used below to clear UTIL_DIR before overwriting it

# Figure out whether to download util script archive
download_utils = False

if UTIL_FN is not None and UTIL_FN != '':
    # If util files are not already loaded, download them
    if not os.path.exists(UTIL_DIR) or len(os.listdir(UTIL_DIR)) == 0:
        download_utils = True
    # If util files should be overwritten, download them
    else:
        # The prompt names the directory being overwritten (the placeholder was previously left unfilled)
        check = raw_input('Utility methods found in {} - overwrite with archive from Google Cloud Storage? [y/n]'.format(UTIL_DIR))
        if check.lower() == 'y':
            download_utils = True

            try:
                shutil.rmtree(UTIL_DIR)
            except OSError:
                pass  # Nothing to clear (or not removable) - extraction below recreates the directory if needed

if download_utils:
    get_file_from_gcloud(GCLOUD_BUCKET, UTIL_FN, WORKING_DIR)
    extract_archive(os.path.join(WORKING_DIR, UTIL_FN), UTIL_DIR)
else:
    print('Not downloading util scripts')

## Get Trained Neural Net Model
from Google Cloud Storage

In [1]:
# -------- Parameters --------

# ML model location
# Index: 0 = Frozen model in current Jupyter file structure (use iteration MODEL_ITER)
#        1 = Google Cloud Storage (download model in bucket GCLOUD_BUCKET [earlier cell] at location MODEL_FN)
model_location = 0

# For model_location == 0:
# Where to find inference graph <frozen_graph>.pb
#     <MODEL_FRZN_DIR>/[<MODEL_ITER_PREFIX>_]<MODEL_ITER> (e.g., mdl/frzn/100000 or mdl/frzn/ssdv1_100000)
# 0 = max available iteration
MODEL_ITER = 0
# None or '' = subdirectory names are just the iteration number
MODEL_ITER_PREFIX = None

# For model_location == 1:
# - Cloud Storage zip archive containing model with frozen inference graph (i.e., [saved_model].pb)
# - It is extracted into <MODEL_FRZN_DIR>/inf by the code below
MODEL_FN = 'model_ssdv1_20190920_frzn__iter57949.zip'


# -----


# Directory names
MODEL_FRZN_DIR = os.path.join(MODEL_DIR, 'frzn')  # Exported model with frozen inference graph (after training)

# =========================================


import os, shutil, glob

# Find directory containing inference graph

import re  # Needed for parsing iteration numbers out of subdirectory names

# Subdirectories are named '[<MODEL_ITER_PREFIX>_]<iteration>'; build the (possibly empty) literal prefix once
if MODEL_ITER_PREFIX is None or MODEL_ITER_PREFIX == '':
    model_subdir_prefix = ''
else:
    model_subdir_prefix = MODEL_ITER_PREFIX + '_'

# If using model in current instance, get it
if model_location == 0:
    # If using frozen model at max iteration, find it
    if MODEL_ITER == 0:
        # The prefix is matched literally (escaped) and the whole name must match, so that the
        # directory name rebuilt below is exactly the one the iteration number was read from
        iteration_pattern = re.compile(re.escape(model_subdir_prefix) + r'(\d+)$')
        for subdir in os.listdir(MODEL_FRZN_DIR):
            found = iteration_pattern.match(subdir)
            if found is None:
                continue  # This subdirectory is not of the required form
            MODEL_ITER = max(MODEL_ITER, int(found.group(1)))

    print('Getting directory for inference at iteration {}... '.format(MODEL_ITER)),

    # Get directory
    model_graph_dir = os.path.join(MODEL_FRZN_DIR, model_subdir_prefix + str(MODEL_ITER))

    if os.path.exists(model_graph_dir):
        print('Done! Using directory ' + model_graph_dir)
    else:
        raise OSError('Directory {} does not exist - cannot get graph to run inference'.format(model_graph_dir))

# If using model from Google Cloud Storage, download it
elif model_location == 1:
    model_graph_dir = os.path.join(MODEL_FRZN_DIR, 'inf')
    get_file_from_gcloud(GCLOUD_BUCKET, MODEL_FN, WORKING_DIR, 'frozen Tensorflow model')
    extract_archive(os.path.join(WORKING_DIR, MODEL_FN), model_graph_dir)

# Any other value would leave model_graph_dir undefined for the search below
else:
    raise ValueError('model_location must be 0 or 1, not {}'.format(model_location))

# Find graph: first '.pb' file (in sorted name order) directly inside the chosen directory
print('Getting inference graph... ')
for fn in sorted(os.listdir(model_graph_dir)):
    if fn.endswith('.pb'):
        model_graph_file = os.path.join(model_graph_dir, fn)
        print('Done! Using graph ' + model_graph_file)
        break
else:
    raise RuntimeError('Graph (.pb) file not found in {} - cannot run inference'.format(model_graph_dir))

SyntaxError: invalid syntax (<ipython-input-1-e296888bf851>, line 58)

## Get Test Images and Annotations
from Google Cloud Storage

In [10]:
# -------- Parameters --------

# ID of zip archive containing image/annotation data (in Google Cloud Storage)
# Exactly one name should be left uncommented inside the parentheses
DATASET_FN = (
    'data_20190830_RSO.zip'
    # 'data_20190830_RSO__sample357train.zip'
)

# Whether to delete current data and start again
# (True removes and recreates DATA_DIR before the dataset is downloaded)
replace_data = False


                                 
# -----

# Directory names
REC_DIR = os.path.join(DATA_DIR, 'rec')  # Where label map is stored
IMG_TEST_DIR = os.path.join(DATA_DIR, 'img_test')  # Where test images will be stored
XML_TEST_DIR = os.path.join(DATA_DIR, 'xml_test')  # Where XML annotations for test images will be stored


# =========================================


import os, shutil, glob

def find_and_move_folder(from_within_dir, to_dir, file_exts, keyword="not_a_keyword", dir_contents='\b'):
    """ Finds a folder containing files with a given extension, and moves it to a new location

    A subfolder of from_within_dir qualifies when at least one non-hidden filename directly inside it
    contains one of the extensions (case-insensitive; matched anywhere in the name, so sharded names
    such as 'x.record-00000-of-00010' also count). If exactly one subfolder qualifies it is moved;
    if several qualify, the single one whose name contains keyword is moved. Otherwise nothing is
    moved and a 'Failed!' line is printed (no exception is raised for that case).

    An existing empty to_dir is removed first; an existing non-empty to_dir is archived with
    archive_files_to_indexed_directory. The current working directory is always restored, and the
    caller's file_exts list is not modified.

    Args:
        from_within_dir: Directory in which to search for desired folder
        to_dir: Where found folder should be moved
        file_exts: Type of file that should be present in found folder (or list of potential file types - e.g., several image extensions)
        keyword: If several folders with the desired file type are found, keyword denotes which directory should be taken here (e.g., "train" or "test" for image files)
        dir_contents: What folder is being moved (only for console output)
    Raises:
        OSError if from_within_dir cannot be entered, or if moving the chosen folder fails
    """

    print('Moving {} folder... '.format(dir_contents)),

    # Parse file extensions into a new list (lower-case, starting with '.')
    # If single string, make it a list
    if file_exts == str(file_exts):
        file_exts = [file_exts]
    wanted_exts = []
    for ext in file_exts:
        ext = ext.lower()
        wanted_exts.append(ext if ext.startswith('.') else '.' + ext)

    cwd = os.getcwd()
    os.chdir(from_within_dir)
    try:
        # Make sure destination directory doesn't exist yet (archive it if it exists and is nonempty)
        if os.path.isdir(to_dir):
            if len(os.listdir(to_dir)) == 0:
                shutil.rmtree(to_dir)
            else:
                archived_dir = archive_files_to_indexed_directory(to_dir)
                # Archiving recreates to_dir empty; remove it again, since shutil.move places the
                # source INSIDE an existing destination directory instead of renaming onto it
                if os.path.isdir(to_dir):
                    os.rmdir(to_dir)
                print(' ')
                print('    WARNING: destination directory {} is not empty - archived to {}'.format(to_dir, archived_dir))
                print('...'),

        # Get all directories in from_within_dir containing desired file type
        # ('.' is from_within_dir here, so this also works when it was given as a relative path)
        valid_dirs = []
        for subdir in os.listdir('.'):
            if not os.path.isdir(subdir):
                continue
            try:
                names = [name.lower() for name in os.listdir(subdir) if not name.startswith('.')]
            except OSError:
                continue  # Unreadable folder - cannot qualify
            if any(ext in name for name in names for ext in wanted_exts):
                valid_dirs.append(subdir)

        # Choose among found directories
        chosen_dir = None
        # If only one directory contains desired file type, it's the right one
        if len(valid_dirs) == 1:
            chosen_dir = valid_dirs[0]
        # If multiple directories contain desired file type, look for one labelled with the given keyword
        elif len(valid_dirs) > 1:
            valid_dirs_with_keyword = [d for d in valid_dirs if keyword in d]
            # If only one directory containing the desired file type has the keyword in its name, it's the right one
            if len(valid_dirs_with_keyword) == 1:
                chosen_dir = valid_dirs_with_keyword[0]
            # If multiple directories containing the desired file type have the keyword in their names, can't choose the right one
            elif len(valid_dirs_with_keyword) > 1:
                print('Failed! Cannot choose among {} folders with keyword "{}" - use shutil.move manually'.format(len(valid_dirs_with_keyword), keyword))
            # If no directory containing the desired file type has the keyword in its name, can't choose the right one among the several initially-found directories
            else:
                print('Failed! Could not find folder with keyword "{}" within {} - use shutil.move manually'.format(keyword, from_within_dir))
        # If no directories contain the desired file type, can't move anything
        else:
            print('Failed! No folder in {} contains {} files'.format(from_within_dir, wanted_exts))

        # Move the chosen directory (if any)
        if chosen_dir is not None:
            try:
                shutil.move(chosen_dir, to_dir)
            except OSError:
                print('Failed! Could not move {} to {}'.format(os.path.join(from_within_dir, chosen_dir), to_dir))
                raise
            print('Done! Moved {} to {}'.format(os.path.join(from_within_dir, chosen_dir), to_dir))
    finally:
        # Restore the caller's working directory even when the move raised
        os.chdir(cwd)
    
# ====================================================


# Decide whether the dataset archive must be downloaded, then move its folders into place.
# got_dirs records, per dataset folder, whether usable data is already present (True) or still needed (False).
ready_to_get_new_dataset = False
got_dirs = {
    'IMG_TEST': False,
    'XML_TEST': False
}

# If replacing any current data, delete and remake data directory
if replace_data:
    try: shutil.rmtree(DATA_DIR)
    except OSError: pass

    try: os.makedirs(DATA_DIR)
    except OSError: pass
    
    ready_to_get_new_dataset = True

# If not replacing the current data, check if it seems to all be there
# For each directory:
#     - If directory name is None or '', then directory is not important for current run, so don't need it
#     - If directory exists and is nonempty, then there is already data there so leave it as is
else:
    if IMG_TEST_DIR is None or IMG_TEST_DIR == '' or (os.path.exists(IMG_TEST_DIR) and len(os.listdir(IMG_TEST_DIR)) > 0):
        got_dirs['IMG_TEST'] = True
    if XML_TEST_DIR is None or XML_TEST_DIR == '' or (os.path.exists(XML_TEST_DIR) and len(os.listdir(XML_TEST_DIR)) > 0):
        got_dirs['XML_TEST'] = True
    # 'REC' has no initial entry in got_dirs, so a missing/empty REC_DIR does not by itself trigger a download
    if REC_DIR is None or REC_DIR == '' or (os.path.exists(REC_DIR) and len(os.listdir(REC_DIR)) > 0):
        got_dirs['REC'] = True
    
    if all(dir_nonempty for dir_nonempty in got_dirs.values()):
        print('Test data already loaded - delete manually with shutil.rmtree to replace')
    else:
        ready_to_get_new_dataset = True  # At least some required data is not present, so get the dataset and look for it
        
# Get new dataset
if ready_to_get_new_dataset:
    # Prep temp data directory for holding dataset files - if it already exists, delete it
    DATA_TEMP_DIR = os.path.join(WORKING_DIR, 'data_temp')
    try: shutil.rmtree(DATA_TEMP_DIR)
    except OSError: pass
    
    # Load dataset into temp directory
    get_file_from_gcloud(GCLOUD_BUCKET, DATASET_FN, WORKING_DIR, 'dataset')
    extract_archive(os.path.join(WORKING_DIR, DATASET_FN), DATA_TEMP_DIR)
    
    # Move dataset parts to the "default" locations
    # (only folders still marked False in got_dirs are moved; an OSError from a move is reported, not re-raised)
    print('\nLooking for dataset folders...')
    
    if 'IMG_TEST' in got_dirs.keys() and not got_dirs['IMG_TEST']:
        print('    '),
        try: find_and_move_folder(DATA_TEMP_DIR, IMG_TEST_DIR, ['.jpg', '.jpeg', '.png', '.gif', '.tiff', '.bmp'], keyword='test', dir_contents='test image')
        except OSError as e: print("Caught error: " + repr(e))

    if 'XML_TEST' in got_dirs.keys() and not got_dirs['XML_TEST']:
        print('    '),
        try: find_and_move_folder(DATA_TEMP_DIR, XML_TEST_DIR, '.xml', keyword='test', dir_contents='test annotation')
        except OSError as e: print("Caught error: " + repr(e))

    # NOTE(review): within this cell got_dirs['REC'] is only ever assigned True (above), so this branch
    # does not run as written - confirm whether the 'rec' folder of the archive is meant to be moved
    if 'REC' in got_dirs.keys() and not got_dirs['REC']:
        print('    '),
        try: find_and_move_folder(DATA_TEMP_DIR, REC_DIR, '.record', keyword='rec', dir_contents='record')
        except OSError as e: print("Caught error: " + repr(e))

    print('...Done looking for dataset folders')

    # Delete temp directory
    os.chdir(WORKING_DIR)
    try: shutil.rmtree(DATA_TEMP_DIR)
    except OSError: pass

Full dataset already loaded - delete manually with shutil.rmtree to replace


In [3]:
import sys
# Run pip only when no module named 'contextlib' has been imported yet in this kernel.
# NOTE(review): the check uses the module name 'contextlib' while the package installed is contextlib2,
# and the imports below load 'contextlib' - confirm which one is intended
if 'contextlib' not in sys.modules.keys():
    print('Installing contextlib... ')
    !pip install -q contextlib2
    print('Done\n')

import os, shutil, glob
import re
import random
import contextlib
import tensorflow as tf
import hashlib
import io
import PIL.Image
from lxml import etree
import importlib

# Get dataset/label map utility modules from Tensorflow Object Detection API
# (searches the first-level subdirectories of CODE_DIR for one that has a "utils" folder)
os.chdir(CODE_DIR)  # Presumably needed so the API package can be imported from the current directory
try:
    for directory in os.listdir(CODE_DIR):
        # isdir() also skips plain files in CODE_DIR, which would make os.listdir(directory) raise OSError
        if os.path.isdir(os.path.join(CODE_DIR, directory, 'utils')):
            dataset_util = importlib.import_module('.dataset_util', package=(directory + '.utils'))
            label_map_util = importlib.import_module('.label_map_util', package=(directory + '.utils'))
            break
    else:
        print('WARNING: cannot find "label_map_util.py" in code directory - might cause failure later')
finally:
    # Always return to the working directory, even if an import fails
    os.chdir(WORKING_DIR)


def find_label_map(directory):
    """Return the path of the label map (.pbtxt) file found in a directory.

    Args:
        directory: Path of the directory to look in (not searched recursively).

    Returns:
        If the directory holds exactly one .pbtxt file, the path to that file.
        If it holds several, the path to the first one (in sorted order) whose
        name contains 'label_map' or 'labelmap'. None if no such file exists.
        Returned paths always include `directory`.
    """
    # Sorted so that the choice between several candidates is deterministic
    pbtxts = sorted(fn for fn in os.listdir(directory) if fn.endswith('.pbtxt'))
    if len(pbtxts) == 1:
        return os.path.join(directory, pbtxts[0])
    for fn in pbtxts:
        if 'label_map' in fn or 'labelmap' in fn:
            return os.path.join(directory, fn)
    return None
    
    

# ====================================================


print('Getting necessary files in data directory...')

# Check data directory for file list and label map
test_file_list = None
label_map_in_data_dir = None
test_file_list_path = os.path.join(REC_DIR, 'test_files.txt')
try:
    os.makedirs(REC_DIR)
except OSError:
    # Creation failed (typically because the directory already exists) - look for existing files in it
    if os.path.isdir(REC_DIR):
        if 'test_files.txt' in os.listdir(REC_DIR):
            test_file_list = dataset_util.read_examples_list(test_file_list_path)
            print('    Test file list found at {}'.format(test_file_list_path))

        label_map_in_data_dir = find_label_map(REC_DIR)
        if label_map_in_data_dir is not None:
            print('    Label map found at {}'.format(label_map_in_data_dir))

# Create txt file of test data filenames
if test_file_list is None:
    print('    Creating test file list... '),
    try:
        test_file_list = list_common_files([IMG_TEST_DIR, XML_TEST_DIR], test_file_list_path)
        print('Done')
    except OSError as e:
        print('Failed!')
        print('      Error creating file {}.'.format(test_file_list_path))
        print('      Error caught: ' + str(e))

# If the list could be neither read nor created, continue with an empty list so that
# the loop below does not fail with "TypeError: 'NoneType' object is not iterable"
if test_file_list is None:
    print('    WARNING: no test file list available - no annotations will be checked')
    test_file_list = []


# Get list of annotated objects contained in test annotations
# (maps object name -> class id; id stays -1 while it is unknown)
print('    Building dict of annotated objects... '),
annotated_objects = dict()
for fn in test_file_list:
    # Add classes of all objects in image to annotated_objects dict (objects should have xml tag <name>)
    xml_tree = etree.parse(os.path.join(XML_TEST_DIR, fn + '.xml'))
    for elem in xml_tree.iter():
        if elem.tag == 'name':
            annotated_objects.setdefault(elem.text, -1)
# If label map is available, get ids of each annotated object name
if label_map_in_data_dir is not None:
    data_classes = label_map_util.get_label_map_dict(label_map_in_data_dir)
    for label in annotated_objects:
        if label in data_classes:
            annotated_objects[label] = data_classes[label]  # Set id corresponding to label name
# Every label (including the first one) goes on its own indented line
print('Done! Found objects:' + ''.join('\n      ' + label for label in annotated_objects))

print('...Done getting files in data directory')


# Check classes in label map from frozen model
# (compares the object names/ids collected in annotated_objects against the model's label map)
print(' ')
print('Checking label map...')

if label_map_in_data_dir is None:
    print('    WARNING: label map not found in data directory - data labels might not line up with model (check manually after inference)')

label_map_in_model_dir = find_label_map(model_graph_dir)
if label_map_in_model_dir is None:
    print('    WARNING: label map not found in model directory - class ids might not correspond to data (check manually after inference)')
else:
    model_classes = label_map_util.get_label_map_dict(label_map_in_model_dir)
    for label in annotated_objects.keys():
        if label not in model_classes:
            print('    WARNING: inference graph is not looking for object {}, which is present in some images'.format(label))
        elif annotated_objects[label] == -1:
            # -1 is the placeholder used when the data-side id is unknown, so this is
            # not a real mismatch - report it separately instead of as conflicting ids
            print('    Label found: {} (id {} in model; id in data unknown)'.format(label, model_classes[label]))
        elif annotated_objects[label] != model_classes[label]:
            print('    WARNING: label {} has mismatched ids: {} in data and {} in model'.format(label, annotated_objects[label], model_classes[label]))
        else:
            print('    Label found: {} (id {})'.format(label, annotated_objects[label]))

print('...Done checking label map. Look for any discrepancies above and fix manually (in model_classes dict)')

SyntaxError: invalid syntax (<ipython-input-3-db03b5c447ea>, line 118)

## Run Inference

In [None]:
# User-adjustable settings for the inference run below
output_image_width  = 25  # inches
output_image_height = 25  # inches

min_confidence = 1  # Percent (0-100): only draw box if at least <min_confidence> percent sure that it's the right one
max_boxes = 10  # Draw the <max_boxes> most likely boxes (as long as they're above <min_confidence>)

max_num_img_to_infer = 0  # Will infer on [x] randomly-chosen images from full set of test images (0 = all available images)

default_class_list = ['RSO']  # Overridden if model zip archive has a 'classes.txt' file


# -------

# Directory names
import os
OUT_BOX_DIR = os.path.join(DATA_DIR, 'out_test_box')  # One .txt file of detection results per image is written here


# =========================================


import sys, os, shutil, glob
import random
import importlib
from timeit import default_timer as timer




import numpy as np
import tensorflow as tf
from PIL import Image, ImageFile

# Import object detection utilities from some folder in Object Detection API (search for it since folder structure could change)
# If this code is implemented somewhere with a persistent directory structure (where the API location is known), can replace the following block with this line:
#    from [object_detection_folder].utils import ops as ops_util
os.chdir(CODE_DIR)  # Presumably needed so the API package can be imported from the current directory
try:
    for directory in os.listdir(CODE_DIR):
        # isdir() also skips plain files in CODE_DIR, which would make os.listdir(directory) raise OSError
        if os.path.isdir(os.path.join(CODE_DIR, directory, 'utils')):
            ops_util = importlib.import_module('.ops', package=(directory + '.utils'))
            break
    else:
        print('WARNING: ops module not found - inference will probably fail')
finally:
    # Return to the working directory, as the other module-loading blocks in this notebook do
    os.chdir(WORKING_DIR)

# Method to prep and detect objects in one image
def run_inference_for_single_image(image, graph):
    """Run the detection graph once and time the call.

    Args:
        image: Value fed to the graph's 'image_tensor:0' input (the caller in
            this notebook passes a batch holding a single image).
        graph: Graph containing the imported detection model.

    Returns:
        Tuple (output_dict, seconds). output_dict holds the fetched outputs
        with the leading batch dimension removed: 'num_detections' as int,
        'detection_classes' as uint8 array, 'detection_boxes' and
        'detection_scores' as returned by the graph. 'detection_masks' is
        included untouched when the graph provides it. seconds is the
        wall-clock duration of the whole call, including session set-up.
    """
    wanted_outputs = ['num_detections', 'detection_boxes', 'detection_scores', 'detection_classes', 'detection_masks']
    t_begin = timer()

    with graph.as_default(), tf.Session() as sess:
        default_graph = tf.get_default_graph()

        # Names of every tensor produced by any operation in the graph
        available_names = set()
        for op in default_graph.get_operations():
            for output in op.outputs:
                available_names.add(output.name)

        # Fetch only those outputs that this particular model actually has
        fetches = {}
        for key in wanted_outputs:
            name = key + ':0'
            if name in available_names:
                fetches[key] = default_graph.get_tensor_by_name(name)

        input_tensor = default_graph.get_tensor_by_name('image_tensor:0')

        # Run inference
        output_dict = sess.run(fetches, feed_dict={input_tensor: image})

        # All outputs are float32 numpy arrays, so convert types as appropriate
        # and keep only the first (single) batch entry
        output_dict['num_detections'] = int(output_dict['num_detections'][0])
        output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
        for key in ('detection_boxes', 'detection_scores'):
            output_dict[key] = output_dict[key][0]

    return output_dict, (timer() - t_begin)

# Method to get data from an image into a NumPy array
def load_image_into_numpy_array(image):
    """Return the pixels of a PIL image as a uint8 array of shape (height, width, 3).

    Args:
        image: PIL image object (anything exposing `mode`, `size`, `getdata()`
            and `convert()`).

    Returns:
        NumPy uint8 array with one RGB triple per pixel.
    """
    # Grayscale, palette and RGBA images do not have exactly 3 values per pixel,
    # which would make the reshape below fail - bring them to RGB first
    if image.mode != 'RGB':
        image = image.convert('RGB')
    (im_width, im_height) = image.size
    return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)


# =========================================


## SET UP

run_inference = True  # Set to False further down if the user declines to reuse the output directory

# Get inference graph from model
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    with tf.gfile.GFile(model_graph_file, 'rb') as fid:
        serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        tf.import_graph_def(od_graph_def, name='')

# Compile paths to test images
# NOTE(review): `test_files` is not assigned anywhere in this cell - presumably it is the same
# list of extension-less file names as `test_file_list` from the data-checking cell; confirm
print('Getting paths to test images... '),
test_img_paths = []
all_imgs_found = True
for fn in test_files:
    # Image extension is unknown, so take the first file with a matching base name
    matches = glob.glob(os.path.join(IMG_TEST_DIR, fn + '.*'))
    if matches:
        test_img_paths.append(matches[0])
        continue
    # No match: report it explicitly (a bare "except:" would also hide unrelated errors)
    if all_imgs_found:
        all_imgs_found = False
        print('')
    print('    Skipping {} - could not find image file in {}'.format(fn, IMG_TEST_DIR))
num_test_imgs = len(test_img_paths)

print('Done!' if all_imgs_found else '...Done getting image paths.'),
print('Found {} valid test images'.format(num_test_imgs))

# Optionally restrict inference to a random subset of the images
if (max_num_img_to_infer > 0 and max_num_img_to_infer < len(test_img_paths)):
    test_img_paths = random.sample(test_img_paths, max_num_img_to_infer)
num_test_imgs = len(test_img_paths)

# Sidestep error with large image files
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Size of output images (inches)
img_size = (output_image_width, output_image_height)

# Array to hold inference times
inference_times = []

# Directory to output bounding box data
try:
    os.makedirs(OUT_BOX_DIR)
except OSError:
    if not os.path.isdir(OUT_BOX_DIR):
        raise  # Directory does not exist and could not be created - nothing can be written there
    if os.listdir(OUT_BOX_DIR):
        # Only ask when there really are old results that could be overwritten
        check = raw_input('Output box directory {} is not empty. Continue? [y/n]'.format(OUT_BOX_DIR))
        if check.strip().lower() != 'y':
            run_inference = False
            print('Stopping run - handle the old output boxes, then rerun this cell')

        
## RUN INFERENCE

# Runs the detector on every test image and writes one results file per image to OUT_BOX_DIR
if run_inference:
    print(' ')
    print('Running inference on ' + str(num_test_imgs) + ' images')
    print('      - Min. score threshold to draw box: ' + str(0.01 * min_confidence))
    print('      - Max. number of boxes drawn: ' + str(max_boxes))

    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')

    for img_idx, img_path in enumerate(test_img_paths):
        # If file is not an image, skip it (compared case-insensitively so that e.g. ".JPG" is accepted)
        fn_ext = os.path.splitext(img_path)[1]
        if fn_ext.lower() not in image_extensions:
            print("  ({}) Skipping {} - file extension {} not recognized as image".format(img_idx + 1, os.path.basename(img_path), fn_ext))
            continue

        # Get image
        image = Image.open(img_path)
        image_np = load_image_into_numpy_array(image)  # Get array representation of image
        image_np_expanded = np.expand_dims(image_np, axis=0)  # Expand dimensions (model expects images to have shape [1, None, None, 3])

        # Detect objects
        output_dict, inference_time = run_inference_for_single_image(image_np_expanded, detection_graph)
        inference_times.append(inference_time)

        # Save detection boxes to file
        fn_base = os.path.basename(img_path).rsplit('.', 1)[0]  # Without file extension
        out_path = os.path.join(OUT_BOX_DIR, fn_base + '.txt')

        try:
            os.remove(out_path)  # If file exists, delete it (to be overwritten)
        except OSError:
            pass

        box_str   = '[' + ','.join('[' + ','.join(str(sub_e) for sub_e in e) + ']' for e in output_dict['detection_boxes']) + ']'
        class_str = '[' + ','.join(str(e) for e in output_dict['detection_classes']) + ']'
        score_str = '[' + ','.join(str(e) for e in output_dict['detection_scores'])  + ']'

        if (len(class_list) > 1):
            # Labels by name (dummy 0 as first entry)
            label_str = '[' + ','.join(("'" + str(e) + "'") for e in class_list) + ']'
        else:
            # Labels by numerical index: 0 .. highest detected class id.
            # int() keeps the "+ 1" out of uint8 arithmetic; no detections at all yields just [0]
            detected_ids = output_dict['detection_classes']
            max_id = int(max(detected_ids)) if len(detected_ids) else 0
            label_str = '[' + ','.join(str(e) for e in range(max_id + 1)) + ']'

        with open(out_path, 'w+') as outfile:
            outfile.write(os.path.basename(img_path) + '\n')
            outfile.write('detection_boxes:\n')
            outfile.write(box_str)
            outfile.write('\ndetection_class_ids:\n')
            outfile.write(class_str)
            outfile.write('\ndetection_scores:\n')
            outfile.write(score_str)
            outfile.write('\ndetection_labels_indexed_to_id:\n')
            outfile.write(label_str)
            outfile.write('\ninference_time_seconds:\n')
            outfile.write(str(inference_time))

        print("  ({}) Done {}".format(img_idx + 1, os.path.basename(img_path)).ljust(70, ' ') + " Time: {} s".format(inference_time))
        sys.stdout.flush()

## Freeze Trained Model
and get zip archives ready to upload to Google Cloud Storage

In [7]:
# -------- Parameters --------
# User-adjustable settings for exporting (freezing) the trained model and archiving the results

export_iteration = 0  # Iteration at which to export graph (0 = export latest iteration)

get_pbtxt = False  # Whether to get graph as a .pbtxt file in addition to .pb (ASCII-readable, but much larger)

archive_model = True  # Whether to save frozen model into a zip archive
archive_ckpts = True  # Whether to also save all available intermediate checkpoints into a zip archive


# -----

export_code_file = 'export_inference_graph.py'  # Name of the export script; searched for in the first-level subdirectories of CODE_DIR


# =========================================


def pb_from_pbtxt(path_to_pbtxt):
    """Convert a text-format graph definition (.pbtxt) to binary format (.pb).

    The output file gets the same base name as the input, with extension '.pb',
    and is written to the directory part returned by split_top_path_level()
    (presumably the folder that contains the input file - confirm against that helper).

    Args:
        path_to_pbtxt: Path to the text-format graph definition file.

    Returns:
        Path to the binary graph file that was written.
    """
    import tensorflow as tf
    from google.protobuf import text_format

    saved_graph_dir, pbtxt_fn = split_top_path_level(path_to_pbtxt)
    # Derive the output name from the INPUT file name
    pb_fn = os.path.splitext(pbtxt_fn)[0] + '.pb'

    with open(path_to_pbtxt, 'r') as f:
        graph_def = tf.GraphDef()
        text_format.Merge(f.read(), graph_def)
        tf.graph_util.import_graph_def(graph_def, name='')
        tf.train.write_graph(graph_def, saved_graph_dir, pb_fn, as_text=False)
    return os.path.join(saved_graph_dir, pb_fn)

def pbtxt_from_pb(path_to_pb):
    """Convert a binary graph definition (.pb) to text format (.pbtxt).

    The output file gets the same base name as the input, with extension
    '.pbtxt', and is written to the directory part returned by
    split_top_path_level().

    Args:
        path_to_pb: Path to the binary graph definition file.

    Returns:
        Path to the text-format graph file that was written.
    """
    import tensorflow as tf
    from tensorflow.python.platform import gfile

    out_dir, binary_name = split_top_path_level(path_to_pb)
    text_name = os.path.splitext(binary_name)[0] + '.pbtxt'

    # Read the serialized graph, then parse and re-write it as text
    with gfile.FastGFile(path_to_pb, 'rb') as graph_file:
        serialized = graph_file.read()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized)
    tf.graph_util.import_graph_def(graph_def, name='')
    tf.train.write_graph(graph_def, out_dir, text_name, as_text=True)

    return os.path.join(out_dir, text_name)


# ==========================================


import os, glob
from datetime import datetime
import traceback


# PREP

print('Getting ready to export model...')

# Get iteration to export
# (collect the iteration number of every checkpoint that has a .meta file)
ckpt_iterations = []
for fn in glob.glob(os.path.join(MODEL_CKPT_DIR, 'model.ckpt*.meta')):
    # Match against the file name only; files without an iteration number are skipped
    match = re.search(r'model\.ckpt-(\d+)', os.path.basename(fn))
    if match is not None:
        ckpt_iterations.append(int(match.group(1)))
if not ckpt_iterations:
    # Without any checkpoint there is nothing to export - stop before creating directories
    raise RuntimeError('Cannot find any checkpoint files (model.ckpt-[iteration].meta) in {}!'.format(MODEL_CKPT_DIR))
min_iteration = min(ckpt_iterations)
max_iteration = max(ckpt_iterations)
# If export_iteration == 0, use latest iteration
if export_iteration == 0:
    export_iteration = max_iteration
elif export_iteration not in ckpt_iterations:
    print("    WARNING: no checkpoint found for iteration {} (available: {} to {}) - export will probably fail".format(export_iteration, min_iteration, max_iteration))

print("    Exporting model at iteration {}".format(export_iteration))

# Try to create directory to export model to
export_dir = os.path.join(MODEL_FRZN_DIR, str(export_iteration))
try:
    os.makedirs(export_dir)
    print("    Exporting model to directory {}".format(export_dir))
except OSError:
    if not os.path.isdir(export_dir):
        raise  # Something else happened (bare raise keeps the original traceback)
    if os.listdir(export_dir):
        # Old export present: move it out of the way, then start with a fresh directory
        # (assumes archive_files_to_indexed_directory removes export_dir itself - TODO confirm)
        print("    Desired export directory {} is nonempty - archiving... ".format(export_dir))
        export_archive_dir = archive_files_to_indexed_directory(export_dir)
        os.makedirs(export_dir)
        print("...Done archiving old files to {}".format(export_archive_dir))
    # Otherwise the directory exists but is empty, so it's fine to export there

# Locate the pipeline config in the initial model directory - exactly one *.config file must be there
config_files = glob.glob(os.path.join(MODEL_INIT_DIR, '*.config'))
if not config_files:
    raise RuntimeError('Cannot find a .config file in {}!'.format(MODEL_INIT_DIR))
if len(config_files) > 1:
    raise RuntimeError('There are multiple .config files in {} - cannot decide which to use! (Files to choose from: {})'.format(MODEL_INIT_DIR, config_files))
config_file = config_files[0]
print('    Using config file {}'.format(config_file))

# Find export file in code directory
# (looks for export_code_file directly inside each first-level subdirectory of CODE_DIR)
for directory in os.listdir(CODE_DIR):
    # isfile() on the full path also copes with plain files in CODE_DIR,
    # for which os.listdir(directory) would raise OSError
    candidate = os.path.join(CODE_DIR, directory, export_code_file)
    if os.path.isfile(candidate):
        export_model_fxn = candidate
        print('    Exporting model using code file {}'.format(export_model_fxn))
        break
else:
    raise RuntimeError('Could not find file {} in any first-level subdirectory of {} - cannot export model'.format(export_code_file, CODE_DIR))
os.chdir(WORKING_DIR)

print('...Done getting ready')


# EXPORT INFERENCE GRAPH

print(' ')
print('Exporting model...')
print('===================================================================================\n')
tf.logging.set_verbosity(tf.logging.WARN)

trained_ckpt_prefix = os.path.join(MODEL_CKPT_DIR, 'model.ckpt-{}'.format(export_iteration))

!python {export_model_fxn} \
      --input_type=image_tensor \
      --pipeline_config_path={config_file} \
      --output_directory={export_dir} \
      --trained_checkpoint_prefix={trained_ckpt_prefix}

print('\n===================================================================================')
print('...Export complete (if no errors between the "=" lines above)!')


# CONVERT GRAPH TO PBTXT

# Optionally write a text-format (.pbtxt) copy of the frozen graph next to the binary one
if get_pbtxt:
    print(' ')
    print("Converting graph at iteration {} to pbtxt...".format(export_iteration)),
    try:
        frozen_graph_path = os.path.join(MODEL_FRZN_DIR, str(export_iteration), 'frozen_inference_graph.pb')
        pbtxt_from_pb(frozen_graph_path)
    except:
        # Report the failure, then let the original error propagate
        print("Failed")
        raise
    print("Done")


# MAKE ARCHIVE WITH EXPORTED MODEL

# Date stamp and target folder are shared by both archives, so they are set up front -
# otherwise archiving only the checkpoints (archive_model = False) fails with a NameError on `now`
now = datetime.now()
zip_dir = os.path.join(MODEL_FRZN_DIR, 'zips')

if archive_model or archive_ckpts:
    try: os.makedirs(zip_dir)
    except OSError: pass  # Directory probably already made

if archive_model:
    # Make zip archive
    print(' ')
    print('Archiving exported model at iteration {} to {}... '.format(export_iteration, zip_dir)),
    export_archive_path = os.path.join(zip_dir, 'model_{}_frzn__iter{}'.format(now.strftime('%Y%m%d'), export_iteration))
    shutil.make_archive(export_archive_path, 'zip', export_dir)  # Make archive with all files in exported data folder
    print('Done')
    print('  -- Upload to Google Cloud Storage by running command in SSH window:')
    print('     "gsutil cp {}.zip gs://{}"'.format(export_archive_path, GCLOUD_BUCKET))

if archive_ckpts:
    print(' ')
    print('Archiving checkpoints to {}... '.format(zip_dir)),
    ckpt_archive_path = os.path.join(zip_dir, 'model_{}_ckpt__iter{}-{}'.format(now.strftime('%Y%m%d'), min_iteration, max_iteration))
    shutil.make_archive(ckpt_archive_path, 'zip', MODEL_CKPT_DIR)  # Make archive with all files in checkpoint folder
    print('Done')
    print('  -- Upload to Google Cloud Storage by running command in SSH window:')
    print('     "gsutil cp {}.zip gs://{}"'.format(ckpt_archive_path, GCLOUD_BUCKET))

Getting ready to export model...
    Exporting model at iteration 57949
    Exporting model to directory /home/jupyter/models/frzn/57949
    Using config file /home/jupyter/models/init/pipeline.config
    Exporting model using code file /home/jupyter/src/object_detection_api/object_detection/export_inference_graph.py
...Done getting ready
 
Exporting model...

2019-09-23 15:05:54.128063: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
Traceback (most recent call last):
  File "/home/jupyter/src/object_detection_api/object_detection/export_inference_graph.py", line 108, in <module>
    from object_detection import exporter
ImportError: No module named object_detection

...Export complete (if no errors between the "=" lines above)!
 
Archiving exported model at iteration 57949 to /home/jupyter/models/frzn/zips...  Done
  -- Upload to Google Cloud Storage by running command in SSH window:
     "gsutil cp /home/jupyter/m

## Continue here...