In [3]:
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import hashlib
import logging

# Configure logging format: timestamp, level name, and message; records at INFO level and above are emitted
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)

# workspace environment variables
# Each lookup raises KeyError if the variable is not set.
# NOTE(review): presumably these are provided by the hosting notebook environment - confirm.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name with a leading "gs://" scheme removed (the bare name is what the storage client calls below take)
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")


In [4]:
## Transform functions

# Function to convert list represented as string to a list data type
def str_list_to_list(in_str, list_delim):
    """Split a delimited string into a list of its parts.

    Args:
        in_str: The string holding the delimited values.
        list_delim: The delimiter separating the values in ``in_str``.

    Returns:
        The list produced by ``in_str.split`` using ``list_delim`` as separator.
    """
    parts = in_str.split(list_delim)
    return parts

# Function to concatenate a string value to each entry in a list (either 'prefix' or 'suffix')
def concat_str_to_list(in_str, in_list, delim='_', mode='prefix'):
    """Attach a string to every entry of a list, as a prefix or a suffix.

    Args:
        in_str: The string to attach to each entry.
        in_list: Iterable of strings to decorate.
        delim: Separator placed between ``in_str`` and each entry.
        mode: ``'prefix'`` puts ``in_str`` first, ``'suffix'`` puts it last.
            Any other value leaves the entries unchanged.

    Returns:
        A new list; ``in_list`` itself is never modified.
    """
    if mode == 'prefix':
        return [in_str + delim + item for item in in_list]
    if mode == 'suffix':
        # The original referenced an undefined name here and raised NameError.
        return [item + delim + in_str for item in in_list]
    return list(in_list)

# Function to convert non-null values from a list of columns into a list
def df_cols_to_list(in_list):
    """Collect the non-null entries of a sequence into a new list.

    Args:
        in_list: Iterable of values; each is tested with ``pd.notnull``.

    Returns:
        A list of the entries for which ``pd.notnull`` is true, in their
        original order.
    """
    return [value for value in in_list if pd.notnull(value)]

# Function to add value to existing list (or create new list)
def add_to_list(curr_value, new_value):
    """Merge a new value (or list of values) into an existing value or list.

    Both arguments may be ``None``, a single value, or a list. The result is
    always a freshly built list: the current value(s) first, followed by every
    new value that is not already present.

    Differences from the previous implementation:
      * A list passed as ``curr_value`` or ``new_value`` is never mutated or
        returned by reference; a new list is always built.
      * ``add_to_list(None, None)`` returns ``[]`` rather than ``[None]``.
      * Duplicates inside ``new_value`` are dropped in every case, not only
        when ``curr_value`` is a list.

    Args:
        curr_value: Existing value, list of values, or ``None``.
        new_value: Value, list of values, or ``None`` to add.

    Returns:
        A new list containing the merged values.
    """
    # Normalise the current value into a new list (copying avoids aliasing).
    if curr_value is None:
        merged = []
    elif isinstance(curr_value, list):
        merged = list(curr_value)
    else:
        merged = [curr_value]

    if new_value is None:
        return merged

    candidates = new_value if isinstance(new_value, list) else [new_value]
    for item in candidates:
        if item not in merged:
            merged.append(item)
    return merged

# Function to build file reference for file based on a given file name or path
def find_file_in_manifest(search_string, file_manifest):
    """Look up manifest entries whose name contains a given string.

    Args:
        search_string: Text to look for inside each entry's ``'name'`` value.
        file_manifest: Iterable of mappings, each with ``'name'`` and
            ``'file_id'`` keys.

    Returns:
        A list with the ``'file_id'`` of every matching entry, in manifest
        order, or ``None`` when nothing matches.
    """
    matched_ids = [
        record['file_id']
        for record in file_manifest
        if search_string in record['name']
    ]
    # Callers distinguish "no match" (None) from a non-empty list of ids.
    return matched_ids if matched_ids else None

#print(find_file_in_manifest('112610'))
#print(add_to_list(None, find_file_in_manifest('112610')))

# Function to return objects in specified bucket
def get_objects_list(bucket_name, dirs_to_exclude=None, dirs_to_include=None):
    """List the object names in a bucket, optionally filtered by substring.

    Filtering rules (each entry is matched as a substring of the object name):
      * If ``dirs_to_include`` is non-empty, only names containing at least one
        include entry are returned and ``dirs_to_exclude`` is ignored.
      * Otherwise, if ``dirs_to_exclude`` is non-empty, names containing any
        exclude entry are dropped.
      * Otherwise every name is returned.

    Each object name appears at most once per listed object. The previous
    implementation appended a name once per matching include entry and once
    per non-matching exclude entry, so with several exclude entries an object
    matching only one of them was still returned.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        dirs_to_exclude: Substrings that disqualify an object name. ``None``
            or empty means no exclusion.
        dirs_to_include: Substrings of which at least one must be present.
            ``None`` or empty means no inclusion filter.

    Returns:
        List of object names that pass the filter, in listing order.
    """
    # None defaults avoid sharing a mutable default list between calls.
    dirs_to_exclude = dirs_to_exclude or []
    dirs_to_include = dirs_to_include or []

    # Collect list of objects/blobs from bucket
    storage_client = storage.Client()
    storage_bucket = storage_client.bucket(bucket_name, user_project='dsp-data-ingest')

    obj_list = []
    for obj in storage_client.list_blobs(storage_bucket):
        if dirs_to_include:
            keep = any(entry in obj.name for entry in dirs_to_include)
        elif dirs_to_exclude:
            keep = not any(entry in obj.name for entry in dirs_to_exclude)
        else:
            keep = True
        if keep:
            obj_list.append(obj.name)
    return obj_list

# Function to return object metadata
def get_object(bucket_name, object_name):
    """Fetch a single object from a bucket via the storage client.

    Args:
        bucket_name: Name of the bucket holding the object.
        object_name: Name (path within the bucket) of the object.

    Returns:
        Whatever ``bucket.get_blob(object_name)`` returns for that name.
    """
    client = storage.Client()
    source_bucket = client.bucket(bucket_name, user_project='dsp-data-ingest')
    return source_bucket.get_blob(object_name)

# Function to pull full file extension (including compression extensions)
def get_full_file_ext(filepath):
    """Return the full extension of a file name, including a compression suffix.

    A trailing compression extension (first match in the list below) is
    stripped, the ordinary extension of what remains is taken with
    ``os.path.splitext``, and the two are concatenated. For example
    ``reads.fastq.gz`` gives ``.fastq.gz`` and ``archive.tgz`` gives ``.tgz``.

    The suffix is compared literally. The previous version used each extension
    as an unescaped regular expression, so the leading ``.`` matched any
    character: ``archive.tgz`` matched ``.gz`` and produced ``..gz``, and a
    name such as ``bagz`` was reported as ``.gz``.

    Args:
        filepath: File name or path to inspect.

    Returns:
        The extension string, or ``''`` when the name has no extension.
    """
    # Order is kept from the original: '.gz' is tested before '.tar.gz', so
    # 'x.tar.gz' is handled as '.gz' plus the remaining '.tar' extension.
    compression_extensions = ['.7z', '.zip', '.gz', '.tar.gz', '.tgz']
    remainder = filepath
    compression_extension = ''
    for candidate in compression_extensions:
        if remainder.endswith(candidate):
            remainder = remainder[:-len(candidate)]
            compression_extension = candidate
            break
    return os.path.splitext(remainder)[1] + compression_extension

# Function to build file manifest
def build_manifest(params):
    """Build a file manifest for the objects in the source bucket.

    Reads ``data_files_src_bucket``, ``data_files_src_dirs`` and
    ``data_files_src_dirs_exclude`` from ``params``. When the bucket is
    ``None`` the workspace bucket (``ws_bucket_name``) is used. Every listed
    object whose name does not end in ``/`` becomes one manifest record.

    Args:
        params: Mapping holding the three keys named above.

    Returns:
        A list of dicts (one per file) with the keys ``file_id``, ``name``,
        ``path``, ``uri``, ``content_type``, ``full_extension``,
        ``size_in_bytes``, ``crc32c``, ``md5_hash`` and ``file_ref``.
        Records sharing the same ``name`` and ``md5_hash`` are collapsed to
        the first occurrence.

    Raises:
        KeyError: If one of the required keys is missing from ``params``.
    """
    # Collect parameters
    data_files_src_bucket = params["data_files_src_bucket"]
    data_files_src_dirs = params["data_files_src_dirs"]
    data_files_src_dirs_exclude = params["data_files_src_dirs_exclude"]

    # Fall back to the workspace bucket when no source bucket is given
    if data_files_src_bucket is None:
        data_files_src_bucket = ws_bucket_name

    # Loop through object list to construct manifest entry for each non-directory object
    record_list = []
    object_list = get_objects_list(data_files_src_bucket, data_files_src_dirs_exclude, data_files_src_dirs)
    for entry in object_list:
        # Names ending in '/' are directory placeholders, not files
        if entry.endswith('/'):
            continue

        entry_obj = get_object(data_files_src_bucket, entry)
        if entry_obj is None:
            # The object was listed but could not be fetched (e.g. removed in
            # between); skip it instead of failing on attribute access.
            logging.warning('Object {obj} could not be retrieved. It will be skipped.'.format(obj=entry))
            continue

        # Collect information for manifest entry record
        entry_obj_uri = 'gs://' + data_files_src_bucket + '/' + entry_obj.name
        # md5_hash may be missing on an object; fall back to crc32c (or an
        # empty string) so the id can still be derived instead of raising
        # TypeError on str + None.
        entry_obj_checksum = entry_obj.md5_hash or entry_obj.crc32c or ''
        entry_obj_id = hashlib.md5((entry_obj_uri + entry_obj_checksum).encode())
        entry_obj_file_name = os.path.split(entry_obj.name)[1]
        entry_obj_full_ext = get_full_file_ext(entry_obj_file_name)

        # Construct fileref object
        fileref_obj = {
            'sourcePath': entry_obj_uri,
            'targetPath': ('/' + entry_obj.name).replace('//', '/'),
            'description': f'Ingest of {entry_obj_uri}',
            'mimeType': 'text/plain',
        }

        # Construct manifest entry record and append to record list
        record_list.append([
            entry_obj_id.hexdigest(),
            entry_obj_file_name,
            entry_obj.name,
            entry_obj_uri,
            entry_obj.content_type,
            entry_obj_full_ext,
            entry_obj.size,
            entry_obj.crc32c,
            entry_obj.md5_hash,
            fileref_obj,
        ])

    # Build manifest dataframe, drop duplicates, and build JSON object
    column_list = ['file_id', 'name', 'path', 'uri', 'content_type', 'full_extension', 'size_in_bytes', 'crc32c', 'md5_hash', 'file_ref']
    df_file_manifest = pd.DataFrame(record_list, columns=column_list)
    df_file_manifest = df_file_manifest.drop_duplicates(['name', 'md5_hash'], keep='first', ignore_index=True)
    return df_file_manifest.to_dict(orient='records')


In [7]:
def transform(params):
    
    # Retrieve parameters of interest
    tf_input_dir = params["tf_input_dir"]
    tf_output_dir = params["tf_output_dir"]
    data_files_src_bucket = params["data_files_src_bucket"]
    data_files_src_dirs = params["data_files_src_dirs"]
    data_files_src_dirs_exclude = params["data_files_src_dirs_exclude"]
    fileref_columns = params["fileref_columns"]
    file_manifest = params["file_manifest"]
    
    # Build file manifest
    df_file = pd.DataFrame.from_dict(file_manifest)
    if df_file.empty:
        logging.error('Source file manifest is empty.')
        return
    
    # Attempt to read in additional optional files
    try:
        src_file = 'sequencing.tsv'
        src_file_path = ws_bucket + '/' + tf_input_dir + '/' + src_file
        df_seq = pd.read_csv(src_file_path, delimiter = '\t').rename(columns = {'entity:sequencing_id':'sequencing_id'})
        # Add generated_file_id column
        df_seq['generated_file_id'] = [[] for _ in range(len(df_seq))]
        for item in fileref_columns:
            if {item}.issubset(df_seq.columns):
                df_seq['generated_file_id'] = df_seq.apply(lambda x: add_to_list(x['generated_file_id'], find_file_in_manifest(x[item], file_manifest)) if(pd.notnull(x[item])) else x['generated_file_id'], axis=1)
        # Explode sequencing dataframe to have one row per file_id
        df_seq_explode = df_seq.explode('generated_file_id')
        # Determine sequencing columns to pull into file dataframe, then merge in file dataframe
        file_transform_columns = ['generated_file_id', 'file_name', 'md5sum', 'data_type']
        final_file_transform_col_list = []
        for item in file_transform_columns:
            if item in df_seq.columns:
                final_file_transform_col_list.append(item)
        df_file = df_file.merge(df_seq_explode[final_file_transform_col_list], how='left', left_on='file_id', right_on='generated_file_id')
    except:
        logging.warning('Optional source file {src} not found. File will not be used.'.format(src = src_file))

    # Transform mapped fields (appending new fields to end of existing DF for now)
    if {'file_id'}.issubset(df_file.columns):
        df_file['file_id'] = df_file['file_id']
    if {'file_ref'}.issubset(df_file.columns):
        df_file['file_ref'] = df_file['file_ref']
    if {'size_in_bytes'}.issubset(df_file.columns):
        df_file['byte_size'] = df_file['size_in_bytes']
    if {'md5_hash'}.issubset(df_file.columns):
        df_file['checksum'] = df_file['md5_hash']
    if {'full_extension'}.issubset(df_file.columns):
        df_file['file_format'] = df_file['full_extension']
    if {'name'}.issubset(df_file.columns):
        df_file['xref'] = df_file.apply(lambda x: [x['path']] if(pd.notnull(x['path'])) else [], axis=1)
        
    # Limit DF to transformed and passthrough fields
    mapped_columns = ['file_id', 'file_ref', 'byte_size', 'checksum', 'file_format', 'xref']
    passthrough_columns = ['file_name', 'md5sum', 'data_type']
    final_col_list = []
    for item in mapped_columns:
        if item in df_file.columns:
            final_col_list.append(item)
    for item in passthrough_columns:
        if item in df_file.columns:
            final_col_list.append(item)
    df_file2 = df_file[final_col_list] # Creating to avoid any cardinality issues when rejoining the passthrough data in the subsequent steps

    # Build passthrough string 
    passthrough_col_list = []
    for item in passthrough_columns:
        if item in df_file2.columns:
            passthrough_col_list.append(item)
    passthrough_col_list.sort()
    passthrough_df_file = df_file2[passthrough_col_list]
    add_data_df_file = passthrough_df_file.apply(lambda x: x.to_json(), axis=1).to_frame()
    add_data_df_file.columns = ['additional_data']
    
    # Merge mapped columns with additional data column to build final df
    final_col_list = []
    for item in mapped_columns:
        if item in df_file.columns:
            final_col_list.append(item)
    df_file_final = df_file[final_col_list].join(add_data_df_file)
    
    # Convert dataframe to new-line delimited JSON and write out to file
    destination_dir = tf_output_dir
    destination_file = 'file.json'
    records_json = df_file_final.to_json(orient='records') # Converting to JSON string first to replace NaN with nulls
    records_list = json.loads(records_json)
    records_cnt = len(records_list)

    with open(destination_file, 'w') as outfile:
        for idx, val in enumerate(records_list):
            json.dump(val, outfile) # Adds escape characters to additional_data field --> Not sure it's a problem
            if idx < (records_cnt - 1):
                outfile.write('\n')

    # Copy file to workspace bucket
    !gsutil cp $destination_file $ws_bucket/$destination_dir/ 2> stdout
    
    # Delete tsv files from notebook env - they will persist in designated workspace bucket directory
    !rm $destination_file
    

In [8]:
# Test
# params = {}
# params["tf_input_dir"] = "ingest_pipeline/input/metadata"
# params["tf_output_dir"] = "ingest_pipeline/output/tim_core/metadata"
# params["data_files_src_bucket"] = "fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46"
# params["data_files_src_dirs"] = []  # Leave empty to include all
# params["data_files_src_dirs_exclude"] = [] 
# params["fileref_columns"] = ["sequencing_id", "seq_filename", "capture_region_bed_file"]
# #params["file_manifest"] = []
# params["file_manifest"] = build_manifest(params)
# transform(params)