In [1]:
## imports and environment variables
# imports
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import hashlib

# Workspace settings, read from environment variables.
# A missing variable raises KeyError here rather than later in the notebook.
ws_name, ws_project, ws_bucket = (
    os.environ["WORKSPACE_NAME"],
    os.environ["WORKSPACE_NAMESPACE"],
    os.environ["WORKSPACE_BUCKET"],
)
# Bucket name without the leading gs:// scheme
ws_bucket_name = re.sub(pattern='^gs://', repl='', string=ws_bucket)

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

In [2]:
## Functions

# Function to return objects in specified bucket
def get_objects_list(bucket_name, dirs_to_exclude=[], dirs_to_include=[]):
    
    # Collect list of objects/blobs from bucket 
    obj_list = []
    storage_client = storage.Client()
    storage_bucket = storage_client.bucket(bucket_name, user_project='dsp-data-ingest')
    objects = list(storage_client.list_blobs(storage_bucket))
    
    # Loop through list of objects and append names to final list based on the roots_to_include and roots_to_exclude variables
    for obj in objects:
        obj_root = obj.name.split('/')[0]
        if len(dirs_to_include) > 0:
            for entry in dirs_to_include:
                if entry in obj.name:
                    obj_list.append(obj.name)
        elif len(dirs_to_exclude) > 0:
            for entry in dirs_to_exclude:
                if entry not in obj.name:
                    obj_list.append(obj.name)
        else:
            obj_list.append(obj.name)
    return obj_list

# Function to return object metadata
def get_object(bucket_name, object_name, user_project='dsp-data-ingest'):
    """Look up a single object in a bucket and return its blob.

    Args:
        bucket_name: Name of the bucket holding the object.
        object_name: Full name (path) of the object inside the bucket.
        user_project: Value passed as ``user_project`` when creating the
            bucket handle.

    Returns:
        The result of ``bucket.get_blob(object_name)``. Per the
        google-cloud-storage documentation this is a ``Blob``, or ``None``
        when the object does not exist, so callers should handle ``None``.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name, user_project=user_project)
    return bucket.get_blob(object_name)

# Function to build file manifest
def build_manifest(params):
    """Build a file manifest (list of record dicts) for the objects in a bucket.

    Args:
        params: Mapping of settings. Keys read here:
            ``data_files_src_bucket``: bucket to scan; when missing or
                ``None`` the module-level ``ws_bucket_name`` is used.
            ``data_files_src_dirs``: include filters passed to
                ``get_objects_list`` (missing/``None`` means no filter).
            ``data_files_src_dirs_exclude``: exclude filters passed to
                ``get_objects_list`` (missing/``None`` means no filter).

    Returns:
        list[dict]: One record per file with the keys ``file_id``, ``name``,
        ``path``, ``uri``, ``content_type``, ``size_in_bytes``, ``crc32c``,
        ``md5_hash`` and ``file_ref``. Records sharing the same
        (``name``, ``md5_hash``) pair are collapsed to the first occurrence.

    Notes:
        - ``get_object`` is called once per listed object name.
        - Object names ending in ``/`` are skipped.
        - Objects for which ``get_object`` returns ``None`` are skipped.
        - ``file_id`` is the MD5 hex digest of the object URI concatenated
          with the object's ``md5_hash``. When ``md5_hash`` is missing,
          ``crc32c`` (or an empty string) is used so the ID can still be built.
    """
    # Collect parameters
    data_files_src_bucket = params.get("data_files_src_bucket")
    data_files_src_dirs = params.get("data_files_src_dirs") or []
    data_files_src_dirs_exclude = params.get("data_files_src_dirs_exclude") or []

    if data_files_src_bucket is None:
        data_files_src_bucket = ws_bucket_name

    # Construct a manifest record for each non-directory object
    record_list = []
    object_list = get_objects_list(data_files_src_bucket, data_files_src_dirs_exclude, data_files_src_dirs)
    for entry in object_list:
        if entry.endswith('/'):
            continue
        entry_obj = get_object(data_files_src_bucket, entry)
        if entry_obj is None:
            # Object no longer retrievable; nothing to record
            continue

        entry_obj_uri = 'gs://' + data_files_src_bucket + '/' + entry_obj.name
        # md5_hash can be None; fall back so the concatenation never hits None
        entry_obj_checksum = entry_obj.md5_hash or entry_obj.crc32c or ''
        entry_obj_id = hashlib.md5((entry_obj_uri + entry_obj_checksum).encode()).hexdigest()
        entry_obj_file_name = os.path.split(entry_obj.name)[1]

        # Construct fileref object
        # NOTE(review): mimeType is always 'text/plain' even though the object's
        # content_type is available - confirm this is what the consumer expects.
        fileref_obj = {
            'sourcePath': entry_obj_uri,
            'targetPath': ('/' + entry_obj.name).replace('//', '/'),
            'description': f'Ingest of {entry_obj_uri}',
            'mimeType': 'text/plain',
        }

        record_list.append([
            entry_obj_id,
            entry_obj_file_name,
            entry_obj.name,
            entry_obj_uri,
            entry_obj.content_type,
            entry_obj.size,
            entry_obj.crc32c,
            entry_obj.md5_hash,
            fileref_obj,
        ])

    # Build manifest dataframe, drop duplicates, and convert to a list of dicts
    column_list = ['file_id', 'name', 'path', 'uri', 'content_type', 'size_in_bytes', 'crc32c', 'md5_hash', 'file_ref']
    df_file_manifest = pd.DataFrame(record_list, columns=column_list).drop_duplicates(
        ['name', 'md5_hash'], keep='first', ignore_index=True
    )
    return df_file_manifest.to_dict(orient='records')


In [3]:
# Test: build a manifest for a fixed source bucket
params = {
    "tf_input_dir": "ingest_pipeline/input/metadata",
    "tf_output_dir": "ingest_pipeline/output/tim_core/metadata",
    "data_files_src_bucket": "fc-secure-723963fa-faec-4de1-bf37-12d3c22a417a",
    "data_files_src_dirs": [],  # Leave empty to include all
    "data_files_src_dirs_exclude": ["uw_phs000693_gru"],
}
manifest = build_manifest(params)

In [None]:
# Dump the whole manifest (the list of per-file record dicts returned by build_manifest) to the cell output
print(manifest)