In [None]:
# Version History
# Earlier entries are kept as commented-out print statements for reference; only the most recent
# entry (the final line of this cell) is actually printed when the cell runs.
# NOTE(review): version numbers 1.0.1 and 2.0.4 each appear on two entries below - confirm whether
# the later entry of each pair was meant to carry a new version number.
#print('Version 1.0.0: 09/08/2022 07:56pm - Nate Calvanese - Initial Version')
#print('Version 1.0.1: 09/26/2022 01:41pm - Nate Calvanese - Fixed bug when MD5 was not available')
#print('Version 1.0.1: 09/26/2022 09:26pm - Nate Calvanese - Updated to be able to point to multiple buckets at once')
#print('Version 1.0.2: 09/29/2022 02:59pm - Nate Calvanese - Added progress logging and retry logic for cloud storage calls')
#print('Version 1.0.3: 10/04/2022 4:26pm - Nate Calvanese - Flattened target file path to help with ingest performance')
#print('Version 1.0.4: 10/12/2022 10:26am - Nate Calvanese - Added support for specifying additional buckets/dirs')
#print('Version 1.0.5: 3/6/2023 3:39pm - Nate Calvanese - Added support for remote file references')
#print('Version 2.0.0: 3/7/2023 9:32pm - Nate Calvanese - Massive performance improvement with use of gsutil parsing')
#print('Version 2.0.1: 3/23/2023 8:29pm - Nate Calvanese - Added support for a global file exclusion')
#print('Version 2.0.2: 5/25/2023 9:29am - Nate Calvanese - Updated target path logic to better support remote file references')
#print('Version 2.0.3: 10/6/2023 9:29am - Nate Calvanese - Tweaked file extension parsing logic')
#print('Version 2.0.4: 4/12/2024 2:30pm - Nate Calvanese - Fixed target path logic to remove unsupported characters')
print('Version 2.0.4: 10/18/2024 2:19pm - Nate Calvanese - Updated get_objects_list function to not use fuzzy matching for full file paths')


In [None]:
## Imports and environment variables

# Imports
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import uuid
import logging
import subprocess

# workspace environment variables
# Each value is read with os.environ[...], so a KeyError is raised at cell execution time if the
# corresponding variable is not set in the environment.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading "gs://" scheme (only a prefix at the very start is stripped)
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Uncomment to echo the resolved workspace values when debugging
# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

# Configure logging format
# Root logger is set to INFO with a "<timestamp> - <level>: <message>" layout and 12-hour timestamps
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)


In [None]:
## Functions

# Function to parse gsutil ls -L contents into dict structure
def parse_ls_output(subprocess_output):
    """Parse the text emitted by `gsutil ls -L` into one dict per listed object.

    A line starting with "gs://" opens a new record. Headers ending in "/:"
    (names that end with a slash, i.e. directory-style entries) are skipped
    together with the detail lines that follow them. Detail lines are matched
    against a fixed set of markers; unrecognised lines are ignored.

    Args:
        subprocess_output: Full listing text as a single string.

    Returns:
        A list of dicts. Each dict always has "GlobalPath" (the header line
        without its trailing colon) and, when the matching detail line was
        present and well formed, any of "Size", "Type", "crc32c", "md5" and
        "Modified". All values are strings exactly as captured from the text.
    """
    # (marker substring, output key, anchored pattern) - checked in this order, first marker wins
    field_patterns = (
        ("Content-Length:", "Size", re.compile(r"\s*Content-Length:\s*([0-9]+)")),
        ("Content-Type:", "Type", re.compile(r"\s*Content-Type:\s*(.*)")),
        ("Hash (crc32c):", "crc32c", re.compile(r"\s*Hash \(crc32c\):\s*(.*)")),
        ("Hash (md5):", "md5", re.compile(r"\s*Hash \(md5\):\s*(.*)")),
        ("Update time:", "Modified", re.compile(r"\s*Update time:\s*(.*)")),
    )
    records = []
    file_dict = {}
    for line in subprocess_output.split("\n"):
        if line.startswith("gs://"):
            # A new header closes the record that was being built
            if file_dict:
                records.append(file_dict)
            file_dict = {}
            header = line.rstrip()
            # Only skip names that END with a slash; a "/:" elsewhere in the name is a real object
            if not header.endswith("/:"):
                file_dict["GlobalPath"] = re.sub(r":$", "", header)
        elif file_dict:
            for marker, key, pattern in field_patterns:
                if marker in line:
                    found = pattern.match(line)
                    # A marker line the pattern cannot parse is skipped instead of raising
                    if found:
                        file_dict[key] = found.group(1)
                    break
    # Flush the last record, which has no following header to close it
    if file_dict:
        records.append(file_dict)
    return records

# Function to return objects in specified bucket
def get_objects_list(bucket_name, user_proj, dirs_to_include=None, dirs_to_exclude=None):
    """Return the names of the objects in a bucket, filtered by include/exclude entries.

    Entry matching rules (same for includes and excludes):
      * an entry containing "." is treated as a full object path and must
        equal the object name exactly;
      * any other entry is treated as a directory: a trailing "/" is added
        (with "//" collapsed to "/") and the result must appear anywhere
        inside the object name.

    Args:
        bucket_name: Name of the bucket to list.
        user_proj: Project passed as `user_project` when opening the bucket.
        dirs_to_include: Entries to keep. Empty or None keeps every object.
        dirs_to_exclude: Entries to drop. Empty or None drops nothing.

    Returns:
        A list of object names, each listed at most once, in listing order.
        Exclusions take precedence over inclusions.
    """
    dirs_to_include = dirs_to_include or []
    dirs_to_exclude = dirs_to_exclude or []

    def _matches(entry, object_name):
        # Entries with a "." are full file paths: exact match only
        if "." in entry:
            return entry == object_name
        # Everything else is a directory fragment matched as a substring
        return (entry + "/").replace("//", "/") in object_name

    # Collect list of objects/blobs from bucket
    storage_client = storage.Client()
    storage_bucket = storage_client.bucket(bucket_name, user_project=user_proj)

    obj_list = []
    for obj in storage_client.list_blobs(storage_bucket):
        name = obj.name
        # Evaluate each object once so several matching include entries cannot add duplicates
        if dirs_to_include and not any(_matches(entry, name) for entry in dirs_to_include):
            continue
        if any(_matches(entry, name) for entry in dirs_to_exclude):
            continue
        obj_list.append(name)
    return obj_list

# Function to return object metadata
def get_object(bucket_name, user_proj, object_name):
    """Look up a single object in a bucket and return whatever `get_blob` yields for it.

    Args:
        bucket_name: Name of the bucket holding the object.
        user_proj: Project passed as `user_project` when opening the bucket.
        object_name: Name of the object within the bucket.

    Returns:
        The result of `Bucket.get_blob(object_name)`.
    """
    client = storage.Client()
    return client.bucket(bucket_name, user_project=user_proj).get_blob(object_name)

# Function to pull full file extension (including compression extensions)
def get_full_file_ext(filepath):
    """Return the file extension of `filepath`, keeping a trailing compression suffix.

    The first compression suffix (in list order) that the path ends with is
    removed, the ordinary extension of what remains is taken with
    `os.path.splitext`, and the compression suffix is appended again, e.g.
    "reads.fastq.gz" -> ".fastq.gz" and "bundle.tgz" -> ".tgz".

    Args:
        filepath: File name or path.

    Returns:
        The extension string, or "" when the path has no extension.
    """
    # NOTE(review): ".gz" is tested before ".tar.gz", so ".tar.gz" itself is never selected here;
    # "x.tar.gz" still yields ".tar.gz" (".tar" via splitext + ".gz"). Order kept to preserve output.
    compression_extensions = ['.7z', '.zip', '.gz', '.tar.gz', '.tgz', '.bgz', '.tar']
    remainder = filepath
    compression_extension = ''
    for item in compression_extensions:
        # Literal suffix test: every character of the suffix, including each ".", must match
        if remainder.endswith(item):
            remainder = remainder[:-len(item)]
            compression_extension = item
            break
    return os.path.splitext(remainder)[1] + compression_extension

# Function to build file inventory
def build_inventory(params):

    # Collect parameters
    data_files_src_buckets = params["data_files_src_buckets"]
    user_project = params["google_project"]
    file_inventory_dir = params["file_inventory_dir"]
    global_file_exclusions = params["global_file_exclusions"]
    
    # Initialize variables
    record_list = []
    retry_count = 0
    prior_record_list_len = 0

    # Loop through object list to construct inventory entry for each non-directory object 
    if data_files_src_buckets == None:
        data_files_src_buckets[ws_bucket_name] = {
            "include_dirs": [],
            "exclude_dirs": []
        }
    for bucket, criteria in data_files_src_buckets.items():
        
        # Get object list for bucket in question
        attempt_counter = 0
        while True:
            try:
                object_list = get_objects_list(bucket, user_project, criteria["include_dirs"], criteria["exclude_dirs"])
                break
            except Exception as e:
                attempt_counter += 1
                if attempt_counter <= 2:
                    continue
                else:
                    raise Exception(str(e))
        
        # Loop through object list for bucket in question
        object_count = len(object_list)
        if object_count >= 100:
            object_count_decile = object_count//10
            log_deciles = True
        else:
            object_count_decile = object_count
            log_deciles = False
        logging.info("Recording inventory entries from {} ({} objects total)".format(bucket, str(object_count)))
        
        # For buckets without criteria specified, pull file metadata information using gsutil parsing
        if len(criteria["include_dirs"]) == 0 and len(criteria["exclude_dirs"]) == 0:
            gcs_path = "gs://" + bucket
            cmd = f"gsutil -u anvil-datastorage ls -L {gcs_path}/**"
            output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
            records = parse_ls_output(output)
            df = pd.DataFrame(records)
            for index, row in df.iterrows():
                # Collect information from object
                row.mask(row.isna(), other=None, inplace=True)
                entry_obj_uri = row["GlobalPath"]
                entry_obj_id_str = "".join(filter(None, [entry_obj_uri, row["md5"]]))
                entry_obj_id = str(uuid.uuid5(uuid.NAMESPACE_OID, str(entry_obj_id_str)))
                entry_obj_file_name = os.path.basename(row["GlobalPath"])
                entry_obj_path = row["GlobalPath"].replace(gcs_path + "/", "")
                entry_obj_full_ext = get_full_file_ext(entry_obj_file_name)
                if row["Type"] == "None":
                    entry_obj_type = None
                else:
                    entry_obj_type = row["Type"]
                # Construct fileref object
                fileref_obj = {}
                fileref_obj['sourcePath'] = entry_obj_uri
                fileref_obj['targetPath'] = "/" + entry_obj_path.replace("/", "_").replace("#", "").replace("?", "")
                fileref_obj['description'] = f"Ingest of {entry_obj_uri}"
                fileref_obj['mimeType'] = entry_obj_type if entry_obj_type else "application/octet-stream"
                # Construct inventory entry record and append to record list
                entry_obj_record = []
                entry_obj_record = [entry_obj_id, entry_obj_file_name, entry_obj_path, entry_obj_uri, entry_obj_type, entry_obj_full_ext, row["Size"], row["crc32c"], row["md5"], fileref_obj, fileref_obj['targetPath']]  
                record_list.append(entry_obj_record)
                record_list_len = len(record_list) - prior_record_list_len
                if log_deciles:
                    if record_list_len%object_count_decile == 0:
                        completed_perc = (record_list_len//object_count_decile) * 10
                        logging.info("{} files recorded (~{}%)".format(str(record_list_len), str(completed_perc)))
                else:
                    if record_list_len == object_count_decile:
                        logging.info("{} files recorded (~100%)".format(str(record_list_len)))
            prior_record_list_len = len(record_list)
        
        # For buckets with criteria specified, pull file metadata information using GCS API calls
        else:
            for entry in object_list:
                if not re.search('/$', entry):
                    # Get object to build inventory entry record
                    attempt_counter = 0
                    while True:
                        try:
                            # Collect information from object
                            entry_obj = get_object(bucket, user_project, entry)
                            entry_obj_uri = "gs://" + bucket + "/" + entry_obj.name
                            entry_obj_id_str = "".join(filter(None, [entry_obj_uri, entry_obj.md5_hash]))
                            entry_obj_id = str(uuid.uuid5(uuid.NAMESPACE_OID, str(entry_obj_id_str)))
                            entry_obj_file_name = os.path.split(entry_obj.name)[1]
                            entry_obj_path = entry_obj.name
                            entry_obj_full_ext = get_full_file_ext(entry_obj_file_name)
                            # Construct fileref object
                            fileref_obj = {}
                            fileref_obj['sourcePath'] = entry_obj_uri
                            fileref_obj['targetPath'] = "/" + entry_obj_path.replace("/", "_").replace("#", "").replace("?", "")
                            fileref_obj['description'] = f"Ingest of {entry_obj_uri}"
                            fileref_obj['mimeType'] = entry_obj.content_type if entry_obj.content_type else "application/octet-stream"
                            # Construct inventory entry record and append to record list
                            entry_obj_record = []
                            entry_obj_record = [entry_obj_id, entry_obj_file_name, entry_obj_path, entry_obj_uri, entry_obj.content_type, entry_obj_full_ext, entry_obj.size, entry_obj.crc32c, entry_obj.md5_hash, fileref_obj, fileref_obj['targetPath']]  
                            record_list.append(entry_obj_record)
                            record_list_len = len(record_list) - prior_record_list_len
                            if log_deciles:
                                if record_list_len%object_count_decile == 0:
                                    completed_perc = (record_list_len//object_count_decile) * 10
                                    logging.info("{} files recorded (~{}%)".format(str(record_list_len), str(completed_perc)))
                            else:
                                if record_list_len == object_count_decile:
                                    logging.info("{} files recorded (~100%)".format(str(record_list_len)))
                            break
                        except Exception as e:
                            attempt_counter += 1
                            retry_count += 1
                            if attempt_counter <= 2:
                                continue
                            else:
                                raise Exception(str(e))
            prior_record_list_len = len(record_list)
    
    # Remove file inventory records that match a global exclusion term
    logging.info("All inventory entries recorded ({} objects total).".format(str(len(record_list))))
    global_exclusion_text = "; ".join(global_file_exclusions)
    logging.info("Removing file objects in global file exclusion list: " + global_exclusion_text)
    for term in global_file_exclusions:
        temp_record_list = []
        for record in record_list:
            if term not in record[3]:
                temp_record_list.append(record)
        record_list = []
        record_list = temp_record_list.copy()
    logging.info(f"{str(len(record_list))} objects remain after file removal.")
    
    # Build inventory dataframe, drop duplicates, and build JSON object
    column_list = ['file_id', 'name', 'path', 'uri', 'content_type', 'full_extension', 'size_in_bytes', 'crc32c', 'md5_hash', 'file_ref', 'target_path']
    df_file_inventory = pd.DataFrame(record_list, columns = column_list) 
    df_file_inventory.drop_duplicates(['target_path', 'md5_hash', 'size_in_bytes'], keep='first', inplace=True, ignore_index=True)
    df_file_inventory.drop(columns=['target_path'], inplace=True)
    file_inventory = df_file_inventory.to_dict(orient='records')
    
    # Write out inventory as file
    destination_file = "file_inventory.tsv"
    df_file_inventory.to_csv(destination_file, index=False, sep='\t')
    !gsutil cp $destination_file $ws_bucket/$file_inventory_dir/ 2> stdout
    
    return file_inventory, retry_count


In [None]:
# Test
# params = {}
# params["data_files_src_buckets"] = {
# #     "fc-a9e7890c-3902-4647-8b82-273490a7ce54": {
# #         "include_dirs": [], # Leave empty to include all
# #         "exclude_dirs": []
# #     },
# #     "fc-secure-4859bab0-bf7e-4eb0-8ded-c6caeb89feba": {
# #         "include_dirs": ["146623", "146629"], # Leave empty to include all
# #         "exclude_dirs": []
# #     },
# #     "fc-secure-34f13712-8698-47cb-9b1e-a1b87fae14fa": {
# #         "include_dirs": [], # Leave empty to include all
# #         "exclude_dirs": ["199168"]
# #     },
# #     "fc-secure-4984306f-6ceb-48a0-87d5-a6c4ef499867": {
# #         "include_dirs": [], # Leave empty to include all
# #         "exclude_dirs": []
# #     },
#     "fc-4310e737-a388-4a10-8c9e-babe06aaf0cf": {
#         "include_dirs": [], # Leave empty to include all
#         "exclude_dirs": []
#     },
# }
# params["google_project"] = "terra-349c8d95"
# params["file_inventory_dir"] = "ingest_pipeline/input/test/data_files/file_inventory"
# params["global_file_exclusions"] = ["SubsetHailJointCall", ".vds/"]
# inventory, retry_count = build_inventory(params)


In [None]:
# print(retry_count)
# df_final = pd.DataFrame(inventory)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option("display.max_colwidth", None)
# pd.set_option('display.width', 1000)
# pd.set_option('display.colheader_justify', 'center')
# pd.set_option('display.precision', 3)
# # display(df_final)
# output_file_path = "file_inventory_test_1.tsv"
# df_final.to_csv(output_file_path, index=False, sep="\t")
# !gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/ 2> stdout
# !rm $output_file_path