In [None]:
# Version History
# Earlier releases are kept below as commented-out print statements; only the
# most recent version line is actually printed when the cell runs.
# print('Version: 1.0.0: 9/8/2022 8:43pm - Nate Calvanese - First version')
print(
    'Version: 1.0.1: 9/16/2022 10:57am - Nate Calvanese - '
    'Fixed bug in file_inventory table creation'
)


In [4]:
## Imports and Environment Variables

# Imports
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import hashlib
import logging
import import_ipynb
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi

# Configure logging format (timestamp - level: message, INFO and above)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)

# Workspace environment variables (a missing variable raises KeyError)
ws_name, ws_project, ws_bucket = (
    os.environ[env_key]
    for env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket URL with any leading "gs://" scheme removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

In [5]:
## Main table data processing function
def process_table_data(params):
    
    # Collect parameters
    log_status = "Success"
    log_dict = {}
    google_project = params["google_project"]
    input_dir = params["input_dir"]
    el_output_dir = params["el_output_dir"]
    el_schema_file = params["el_schema_file"]
    data_file_ref_mode = params["data_file_ref_mode"]
    data_file_ref_table_name = params["data_file_ref_table_name"]
    data_file_refs = params["data_file_refs"]
    file_inventory = params["file_inventory"]
    file_inventory_dir = params["file_inventory_dir"]
    
    # Attempt to read in file_inventory file if it's empty in the params dictionary
    if len(file_inventory) == 0:
        logging.info("File inventory not populated. Attempting to populate from latest file (if one exists).")
        try: 
            inventory_file_path = "gs://" + ws_bucket_name + "/" + file_inventory_dir + "/file_inventory.tsv"
            df_inv = pd.read_csv(inventory_file_path, delimiter = "\t")
            file_manifest = df_inv.to_dict(orient='records')
            logging.info("File inventory populated successfully.")
            log_dict["file_inventory_population"] = "File inventory populated"
        except Exception as e:
            error_message = "File inventory not populated. Error attempting to populate from file: {}".format(e)
            logging.error("File inventory not populated. Error attempting to populate from file: {}".format(e))
            log_status = "Error"
            log_dict["file_inventory_population"] = "File inventory not populated. Error attempting to populate from file: {}".format(e)
    else:
        log_dict["file_inventory_population"] = "File inventory populated"
    
    # Empty destination directory
    !gsutil -m rm -r $ws_bucket/$el_output_dir/* 2> /dev/null || true
    
    # Get list of table data files to process
    obj_list = bfi.get_objects_list(ws_bucket_name, google_project, dirs_to_include=[input_dir])
    target_table_dict = {}
    for item in obj_list:
        path_split = item.split("/")
        tar_table_idx = len(path_split) - 2
        if re.match(".+(\.(tsv|csv))$", item):
            source_file = item
            target_table = path_split[tar_table_idx]
            if target_table_dict.get(target_table) == None:
                target_table_dict[target_table] = [source_file]
            else:
                source_file_list = target_table_dict[target_table]
                source_file_list.append(source_file)
                target_table_dict[target_table] = source_file_list
    if data_file_ref_mode == "fileref_table":
        target_table_dict[data_file_ref_table_name] = [file_inventory_dir + "/file_inventory.tsv"]
    logging.info("Target tables and files to be processed: " + json.dumps(target_table_dict))

    # Loop through target tables and process files associated with them (limited to tsv/csv for the moment)
    table_list = []
    relationship_list = [] 
    for key in target_table_dict:
        logging.info("Processing files for target table: {}.".format(key))
        try:
            tablename = key
            table_dict = {}
            table_dict["name"] = tablename
            column_list = []

            # Build data frame from source files
            file_iterator = 0
            filename_list = []
            for file_entry in target_table_dict[key]:
                filename = os.path.split(file_entry)[1]
                filename_list.append(filename)
                file_iterator += 1
                full_file_path = "gs://" + ws_bucket_name + "/" + file_entry
                if re.match(".+(\.tsv)$", file_entry):
                    delim = "\t"
                else:
                    delim = ","
                if file_iterator == 1:
                    df = pd.read_csv(full_file_path, delimiter = delim)
                else:
                    df_int = pd.read_csv(full_file_path, delimiter = delim) # We"ll want the code to be able to handle the case where a target table has multiple source files. Haven"t tested/developed yet.
                    df = pd.concat([df, df_int])
            
            # Perform some initial dataframe clean up (clean up names, attempt to convert data types, replace NaN, etc.)
            df.rename(columns=lambda x: utils.encode_name(x), inplace=True)
            df = df.where((pd.notnull(df)), None)
            df = df.convert_dtypes()
            
            # Set file reference variables
            fileref_tablename = data_file_ref_table_name
            fileref_columnname = "file_ref"
            fileref_id_columnname = "file_id"
            
            # Loop through columns and build TDR schema table entry from dataframe
            for column in df.columns:

                # Build column dictionary
                column_dict = {}
                column_dict["name"] = column
                column_dict["array_of"] = False   #CSV/TSV can"t support arrays, so this is safe for now.
                base_type = str(df[column].dtype)
                mapped_type = utils.map_datatype(base_type)

                # Force convert objects to strings (or fileref for file inventory file_ref field)
                if data_file_ref_mode == "fileref_table" and tablename == fileref_tablename and column == fileref_columnname:
                    mapped_type = "fileref"
                    df[column] = df.apply(lambda x: json.loads(x[column].replace("\'", "\"")), axis=1)
                elif mapped_type == "other":
                    mapped_type = "string"
                    df[column] = df[column].astype("string")

                # Set column datatype and append to column list
                column_dict["datatype"] = mapped_type
                column_list.append(column_dict)

            # Check for and build file references as necessary
            for filename in filename_list:
                if filename in data_file_refs:
                    for column_entry in data_file_refs[filename]:
                        encoded_col_name = utils.encode_name(column_entry["column"])

                        # If column exists in the data frame, determine whether or not a new field will be created, and the method that will be used for building the file reference
                        # Once determined, execute the appropriate file reference building function and then update the final schema to match the inputted parameters
                        if encoded_col_name in df.columns and column_entry["method"] in ["file_path_match", "tdr_file_id"]:

                            # Derive common variables
                            if data_file_ref_mode == "fileref_table":
                                return_field = "file_id"
                            else:
                                return_field = "file_ref"
                            if column_entry["match_multiple_files"] == True:
                                array_of = True
                            else:
                                array_of = False

                            # If not creating a new field, update existing field
                            if column_entry["create_new_field"] == False:
                                # If method is file_path_match, replace column values with file IDs or references, otherwise leave columns alone
                                if column_entry["method"] in ["file_path_match"]:
                                    df[encoded_col_name] = df.apply(lambda x: utils.find_file_in_inventory(x[encoded_col_name], file_inventory, return_field, column_entry["match_multiple_files"], column_entry["match_regex"]), axis=1)
                                # Update the column_list entry for the schema as appropriate
                                for idx, val in enumerate(column_list):
                                    if val["name"] == encoded_col_name:
                                        col_list_idx = idx
                                column_list[col_list_idx]["array_of"] = array_of
                                if data_file_ref_mode == "fileref_in_line" or column_entry["method"] == "tdr_file_id":
                                    column_list[col_list_idx]["datatype"] = "fileref"
                                else:
                                    column_list[col_list_idx]["datatype"] = "string"
                                    # If file reference mode is "fileref_table" and method is not "tdr_file_id", add the appropriate relationship to link the file reference to the new file table
                                    rel_dict = utils.construct_relationship(tablename, encoded_col_name, fileref_tablename, fileref_id_columnname)
                                    relationship_list.append(rel_dict)

                            # Otherwise, create new field
                            else:
                                # Record or derive new field name
                                if column_entry["new_field_name"] != None:
                                    new_col_name = utils.encode_name(column_entry["new_field_name"])
                                else:
                                    new_col_name = encoded_col_name + "_fileref"
                                # If method is file_path_match, replace column values with file IDs or references, otherwise leave columns alone
                                if column_entry["method"] in ["file_path_match"]:
                                    df[new_col_name] = df.apply(lambda x: utils.find_file_in_inventory(x[encoded_col_name], file_inventory, return_field, column_entry["match_multiple_files"], column_entry["match_regex"]), axis=1)
                                # Create new column list entry for schema
                                column_dict = {}
                                column_dict["name"] = new_col_name
                                column_dict["array_of"] = array_of 
                                if data_file_ref_mode == "fileref_in_line" or column_entry["method"] == "tdr_file_id":
                                    column_dict["datatype"] = "fileref"
                                else:
                                    column_dict["datatype"] = "string"
                                    # If file reference mode is "fileref_table" and method is not "tdr_file_id", add the appropriate relationship to link the file reference to the new file table
                                    rel_dict = utils.construct_relationship(tablename, new_col_name, fileref_tablename, fileref_id_columnname)
                                    relationship_list.append(rel_dict)
                                column_list.append(column_dict)
            
            # Add column list to table dict and table_dict to table_list
            table_dict["columns"] = column_list
            table_dict["primaryKey"] = []
            table_list.append(table_dict)

            # Write out file
            records_json = df.to_json(orient="records")
            records_list = json.loads(records_json)
            records_cnt = len(records_list)
            destination_file = tablename + ".json"
            with open(destination_file, "w") as outfile:
                for idx, val in enumerate(records_list):
                    json.dump(val, outfile)
                    if idx < (records_cnt - 1):
                        outfile.write("\n")

            # Copy file to workspace bucket and delete from notebook environment
            !gsutil cp $destination_file $ws_bucket/$el_output_dir/ 2> stdout 
            log_dict[key] = "No errors raised"
        except Exception as e:
            error_message = "Error processing files for target table: {}. Error message: {}".format(key, e)
            logging.info(error_message)
            log_dict[key] = error_message
            log_status = "Error"

    # Finish building TDR schema object
    logging.info("Creating schema object and copying to cloud storage.")
    schema_dict = {}
    schema_dict["tables"] = table_list
    schema_dict["relationships"] = relationship_list

    # Write out schema file, copy to workspace bucket, and delete from notebook environment
    destination_file = "tdr_schema_object.json"
    with open(destination_file, "w") as outfile:
        json.dump(schema_dict, outfile)
    !gsutil cp $destination_file $ws_bucket/$el_schema_file 2> stdout
    
    # Remove .json files
    !rm *.json
    
    # Build and return table list for use in ingest
    table_list = list(target_table_dict.keys())
    log_string = json.dumps(log_dict)
    logging.info("File processing complete. Status: {}. Details: {}. Tables to ingest: {}".format(log_status, log_string, ", ".join(table_list)))
    return table_list, log_status, log_string

    

In [6]:
# Tests
# params = {}
# ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
# params["google_project"] = ws_attributes["googleProject"]
# params["input_dir"] = "ingest_pipeline/input/table_data"
# params["file_inventory_dir"] = "ingest_pipeline/input/data_files/file_inventory"
# params["el_output_dir"] = "ingest_pipeline/output/source/table_data"
# params["el_schema_file"] = "ingest_pipeline/output/source/schema/tdr_schema_object.json"
# params["data_file_ref_mode"] = "fileref_table"
# params["data_file_ref_table_name"] = "ws_file_inventory"
# params["data_file_refs"] = {
#     "ws_sequencing.tsv": [{
#         "column": "sequencing_id",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "sequencing_id_file_id"
#     }, {
#         "column": "seq_filename",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "seq_filename_file_id"
#     }, {
#         "column": "capture_region_bed_file",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "capture_region_bed_file_file_id"
#     }, {
#         "column": "file_id",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "file_id_file_id"
#     }, {
#         "column": "cram",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "cram_file_id"
#     }, {
#         "column": "cram",
#         "method": "file_path_match",
#         "match_multiple_files": True, 
#         "match_regex": None,
#         "create_new_field": True,
#         "new_field_name": "cram_file_id"
#     }]
# }
# params["data_files_src_bucket"] = ws_attributes["attributes"]["data_files_src_bucket"]
# params["data_files_src_dirs"] = []  # Leave empty to include all
# params["data_files_src_dirs_exclude"] = [] 
# file_inventory = bfi.build_inventory(params)
# params["file_inventory"] = file_inventory
# #params["file_inventory"] = []
# target_tables, log_status, log_string = process_table_data(params)


09/16/2022 02:50:07 PM - INFO: Target tables and files to be processed: {"ws_family": ["ingest_pipeline/input/table_data/ws_family/ws_family.tsv"], "ws_sample": ["ingest_pipeline/input/table_data/ws_sample/ws_sample.tsv"], "ws_sequencing": ["ingest_pipeline/input/table_data/ws_sequencing/ws_sequencing.tsv"], "ws_subject": ["ingest_pipeline/input/table_data/ws_subject/ws_subject.tsv"], "ws_workspace_attributes": ["ingest_pipeline/input/table_data/ws_workspace_attributes/ws_workspace_attributes.tsv"], "ws_file_inventory": ["ingest_pipeline/input/data_files/file_inventory/file_inventory.tsv"]}
09/16/2022 02:50:07 PM - INFO: Processing files for target table: ws_family.
09/16/2022 02:50:10 PM - INFO: Processing files for target table: ws_sample.
09/16/2022 02:50:12 PM - INFO: Processing files for target table: ws_sequencing.
09/16/2022 02:50:15 PM - INFO: Processing files for target table: ws_subject.
09/16/2022 02:50:18 PM - INFO: Processing files for target table: ws_workspace_attributes