In [None]:
# Version History
# Earlier entries are kept commented out for reference; only the most recent version line is printed when this cell runs.
#print('Version 1.0.0: 09/08/2022 07:56pm - Nate Calvanese - Initial Version')
#print('Version 1.0.1: 09/26/2022 01:41pm - Nate Calvanese - Fixed bug when MD5 was not available')
#print('Version 1.0.1: 09/26/2022 09:26pm - Nate Calvanese - Updated to be able to point to multiple buckets at once')
#print('Version 1.0.2: 09/29/2022 02:59pm - Nate Calvanese - Added progress logging and retry logic for cloud storage calls')
#print('Version 1.0.3: 10/04/2022 4:26pm - Nate Calvanese - Flattened target file path to help with ingest performance')
print('Version 1.0.4: 10/12/2022 10:26am - Nate Calvanese - Added support for specifying additional buckets/dirs')



In [None]:
## Imports and environment variables

# Imports
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import uuid
import logging

# Workspace environment variables.
# These are read with os.environ[...], so a missing variable raises KeyError when this cell runs.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name with any leading "gs://" scheme stripped (build_inventory uses it as the default source bucket key)
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Uncomment to inspect the resolved workspace values:
# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

# Configure logging format: timestamp (12-hour clock), level, then message; INFO and above are emitted
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)


In [None]:
## Functions

# Function to return objects in specified bucket
def get_objects_list(bucket_name, user_proj, dirs_to_include=None, dirs_to_exclude=None):
    """Return the names of the objects in a bucket, filtered by directory.

    Args:
        bucket_name: Name of the bucket to list.
        user_proj: Value passed as ``user_project`` when the bucket handle is created.
        dirs_to_include: Directory strings. When non-empty, an object is kept only if
            its name contains ``<dir>/`` for at least one entry. Empty or None keeps
            every object.
        dirs_to_exclude: Directory strings. An object whose name contains ``<dir>/``
            for any entry is dropped, even when it also matched an inclusion.

    Returns:
        List of object names in listing order. Each name appears at most once, even
        when it matches several inclusion entries.
    """
    # Normalise every directory to a trailing-slash marker ("dir" and "dir/" both become "dir/")
    include_markers = [(entry + "/").replace("//", "/") for entry in (dirs_to_include or [])]
    exclude_markers = [(entry + "/").replace("//", "/") for entry in (dirs_to_exclude or [])]

    # Collect list of objects/blobs from bucket
    storage_client = storage.Client()
    storage_bucket = storage_client.bucket(bucket_name, user_project=user_proj)

    obj_list = []
    for obj in storage_client.list_blobs(storage_bucket):
        name = obj.name
        # Process inclusion: one decision per object, so a name matching several
        # entries is not appended more than once
        if include_markers and not any(marker in name for marker in include_markers):
            continue
        # Process exclusions: exclusion wins over inclusion
        if any(marker in name for marker in exclude_markers):
            continue
        obj_list.append(name)
    return obj_list

# Function to return object metadata
def get_object(bucket_name, user_proj, object_name):
    """Look up a single object in a bucket.

    A new storage client is created on each call. The bucket handle is built with
    ``user_project=user_proj`` and the result of its ``get_blob(object_name)`` call
    is returned unchanged.
    """
    client = storage.Client()
    return client.bucket(bucket_name, user_project=user_proj).get_blob(object_name)

# Function to pull full file extension (including compression extensions)
def get_full_file_ext(filepath):
    """Return the full extension of a file path, including a compression suffix.

    If the path ends with one of the known compression suffixes, that suffix is
    stripped, the extension of the remainder is taken with ``os.path.splitext``,
    and the two are joined (e.g. ``reads.fastq.gz`` -> ``.fastq.gz``). Otherwise
    the plain ``os.path.splitext`` extension is returned.

    Args:
        filepath: File name or path string.

    Returns:
        The extension string, or ``''`` when the path has no extension.
    """
    # Suffixes are matched literally, in this order, and the first match wins.
    # '.gz' is tested before '.tar.gz', so 'x.tar.gz' resolves as '.tar' + '.gz'.
    compression_extensions = ('.7z', '.zip', '.gz', '.tar.gz', '.tgz')
    remainder = filepath
    compression_extension = ''
    for suffix in compression_extensions:
        # Literal suffix test: a regex built from the suffix would let '.' match any
        # character (e.g. '.gz$' matching the 'tgz' in 'data.tgz')
        if remainder.endswith(suffix):
            remainder = remainder[:-len(suffix)]
            compression_extension = suffix
            break
    return os.path.splitext(remainder)[1] + compression_extension

# Function to build file inventory
def build_inventory(params):

    # Collect parameters
    data_files_src_buckets = params["data_files_src_buckets"]
    user_project = params["google_project"]
    file_inventory_dir = params["file_inventory_dir"]
    
    # Initialize variables
    record_list = []
    retry_count = 0
    prior_record_list_len = 0

    # Loop through object list to construct inventory entry for each non-directory object 
    if data_files_src_buckets == None:
        data_files_src_buckets[ws_bucket_name] = {
            "include_dirs": [],
            "exclude_dirs": []
        }
    for bucket, criteria in data_files_src_buckets.items():
        # Get object list for bucket in question
        attempt_counter = 0
        while True:
            try:
                object_list = get_objects_list(bucket, user_project, criteria["include_dirs"], criteria["exclude_dirs"])
                break
            except Exception as e:
                attempt_counter += 1
                if attempt_counter <= 2:
                    continue
                else:
                    raise Exception(str(e))
        # Loop through object list for bucket in question
        object_count = len(object_list)
        if object_count >= 100:
            object_count_decile = object_count//10
            log_deciles = True
        else:
            object_count_decile = object_count
            log_deciles = False
        logging.info("Recording inventory entries from {} ({} objects total)".format(bucket, str(object_count)))
        for entry in object_list:
            if not re.search('/$', entry):
                # Get object to build inventory entry record
                attempt_counter = 0
                while True:
                    try:
                        # Collect information from object
                        entry_obj = get_object(bucket, user_project, entry)
                        entry_obj_uri = "gs://" + bucket + "/" + entry_obj.name
                        entry_obj_id_str = "".join(filter(None, [entry_obj_uri, entry_obj.md5_hash]))
                        entry_obj_id = str(uuid.uuid5(uuid.NAMESPACE_OID, str(entry_obj_id_str)))
                        entry_obj_file_name = os.path.split(entry_obj.name)[1]
                        entry_obj_path = entry_obj.name
                        entry_obj_full_ext = get_full_file_ext(entry_obj_file_name)
                        # Construct fileref object
                        fileref_obj = {}
                        fileref_obj['sourcePath'] = entry_obj_uri
                        fileref_obj['targetPath'] = "/" + entry_obj_path.replace("/", "_")
                        fileref_obj['description'] = f"Ingest of {entry_obj_uri}"
                        fileref_obj['mimeType'] = entry_obj.content_type if entry_obj.content_type else "application/octet-stream"
                        # Construct inventory entry record and append to record list
                        entry_obj_record = []
                        entry_obj_record = [entry_obj_id, entry_obj_file_name, entry_obj_path, entry_obj_uri, entry_obj.content_type, entry_obj_full_ext, entry_obj.size, entry_obj.crc32c, entry_obj.md5_hash, fileref_obj]  
                        record_list.append(entry_obj_record)
                        record_list_len = len(record_list) - prior_record_list_len
                        if log_deciles:
                            if record_list_len%object_count_decile == 0:
                                completed_perc = (record_list_len//object_count_decile) * 10
                                logging.info("{} files recorded (~{}%)".format(str(record_list_len), str(completed_perc)))
                        else:
                            if record_list_len == object_count_decile:
                                logging.info("{} files recorded (~100%)".format(str(record_list_len)))
                        break
                    except Exception as e:
                        attempt_counter += 1
                        retry_count += 1
                        if attempt_counter <= 2:
                            continue
                        else:
                            raise Exception(str(e))
        prior_record_list_len = len(record_list)
    
    # Build inventory dataframe, drop duplicates, and build JSON object
    logging.info("All inventory entries recorded ({} objects total).".format(str(len(record_list))))
    column_list = ['file_id', 'name', 'path', 'uri', 'content_type', 'full_extension', 'size_in_bytes', 'crc32c', 'md5_hash', 'file_ref']
    df_file_inventory = pd.DataFrame(record_list, columns = column_list)
    df_file_inventory.drop_duplicates(['name', 'md5_hash'], keep='first', inplace=True, ignore_index=True)
    file_inventory = df_file_inventory.to_dict(orient='records')
    
    # Write out inventory as file
    destination_file = "file_inventory.tsv"
    df_file_inventory.to_csv(destination_file, index=False, sep='\t')
    !gsutil cp $destination_file $ws_bucket/$file_inventory_dir/ 2> stdout
    
    return file_inventory, retry_count


In [None]:
# Test
# params = {}
# params["data_files_src_buckets"] = {
#     "fc-secure-0aedc988-3736-496c-b7ac-20cca5b3ceb9": {
#         "include_dirs": [], # Leave empty to include all
#         "exclude_dirs": []
#     },
#     "fc-secure-4859bab0-bf7e-4eb0-8ded-c6caeb89feba": {
#         "include_dirs": ["146623", "146629"], # Leave empty to include all
#         "exclude_dirs": []
#     },
#     "fc-secure-34f13712-8698-47cb-9b1e-a1b87fae14fa": {
#         "include_dirs": [], # Leave empty to include all
#         "exclude_dirs": ["199168"]
#     } 
# }
# params["google_project"] = "terra-349c8d95"
# params["file_inventory_dir"] = "ingest_pipeline/input/test/data_files/file_inventory"
# inventory, retry_count = build_inventory(params)


In [None]:
# print(retry_count)
# print(inventory)