In [None]:
# Version History
# Only the latest entry (the last line of this cell) is printed when the cell runs;
# earlier entries are retained as commented-out print statements.
#print('Version: 1.0.0: 9/8/2022 8:43pm - Nate Calvanese - First version')
#print('Version: 1.0.1: 9/16/2022 10:57am - Nate Calvanese - Fixed bug in file_inventory table creation')
#print('Version: 1.0.2: 10/4/2022 4:00pm - Nate Calvanese - Added support for arrays in TSVs')
#print('Version: 1.0.3: 10/5/2022 11:34am - Nate Calvanese - Chunked pre-processed table data into multiple files')
#print('Version: 1.0.4: 10/12/2022 12:31pm - Nate Calvanese - Added support for column-level file reference configuration')
#print('Version: 1.0.5: 10/13/2022 12:40pm - Nate Calvanese - Fixed bug building file refs when reading in file inventory')
#print('Version: 1.0.6: 10/26/2022 3:05pm - Nate Calvanese - Added logic to deal with duplicate column names in source files')
#print('Version: 1.0.7: 1/30/2023 3:00pm - Nate Calvanese - Added logic to deal with workspaces that do not contain files')
#print('Version: 1.0.8: 1/31/2023 10:52am - Nate Calvanese - Added logic to deal with workspaces that are completely empty')
#print('Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups')
print('Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable')


In [None]:
# Install additional modules (one-time step per cloud environment)
#!pip install --upgrade pip import_ipynb data_repo_client urllib3 xmltodict

In [None]:
## Imports and Environment Variables

# Imports
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
import re
import hashlib
import logging
import import_ipynb
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi
import math

# Set up log output: timestamp, level and message, at INFO verbosity
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)

# Workspace settings are read from the environment; a missing variable raises KeyError
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name is the bucket path with a leading "gs://" (if present) removed
ws_bucket_name = re.sub(r"^gs://", "", ws_bucket)

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

In [None]:
## Main table data processing function
def process_table_data(params):
    
    # Collect parameters
    log_status = "Success"
    log_dict = {}
    google_project = params["google_project"]
    input_dir = params["input_dir"]
    el_output_dir = params["el_output_dir"]
    el_schema_file = params["el_schema_file"]
    create_file_table = params["create_file_table"]
    file_table_name = params["file_table_name"]
    data_file_refs = params["data_file_refs"]
    file_inventory = params["file_inventory"]
    file_inventory_dir = params["file_inventory_dir"]
    max_combined_rec_ref_size = params["max_combined_rec_ref_size"]
    
    # Attempt to read in file_inventory file if it's empty in the params dictionary
    if len(file_inventory) == 0:
        logging.info("File inventory not populated. Attempting to populate from latest file (if one exists).")
        try: 
            inventory_file_path = "gs://" + ws_bucket_name + "/" + file_inventory_dir + "/file_inventory.tsv"
            df_inv = pd.read_csv(inventory_file_path, delimiter = "\t")
            df_inv["file_ref"] = df_inv.apply(lambda x: json.loads(x["file_ref"].replace("\'", "\"")), axis=1)
            file_inventory = df_inv.to_dict(orient='records')
            logging.info("File inventory populated successfully.")
            log_dict["file_inventory_population"] = "File inventory populated"
        except Exception as e:
            error_message = "File inventory not populated. Unable to populate from file: {}".format(e)
            logging.warning("File inventory not populated. Unable to populate from file: {}".format(e))
            log_status = "Warning"
            log_dict["file_inventory_population"] = "File inventory not populated. Unable to populate from file: {}".format(e)
    else:
        log_dict["file_inventory_population"] = "File inventory populated"
    
    # Format file_inventory to support file ref lookup
    file_lookup = {}
    for file in file_inventory:
        file_lookup[file["uri"]] = {
            "file_id": file["file_id"],
            "file_ref": file["file_ref"] 
        } 
    
    # Empty destination directory
    !gsutil -m rm -r $ws_bucket/$el_output_dir/* 2> /dev/null || true
    
    # Get list of table data files to process
    obj_list = bfi.get_objects_list(ws_bucket_name, google_project, dirs_to_include=[input_dir])
    target_table_dict = {}
    for item in obj_list:
        path_split = item.split("/")
        tar_table_idx = len(path_split) - 2
        if re.match(".+(\.(tsv|csv))$", item):
            source_file = item
            target_table = path_split[tar_table_idx]
            if target_table_dict.get(target_table) == None:
                target_table_dict[target_table] = [source_file]
            else:
                source_file_list = target_table_dict[target_table]
                source_file_list.append(source_file)
                target_table_dict[target_table] = source_file_list
    if create_file_table == True and len(file_inventory) > 0:
        target_table_dict[file_table_name] = [file_inventory_dir + "/file_inventory.tsv"]
    logging.info("Target tables and files to be processed: " + json.dumps(target_table_dict))

    # If no table data or data files, error out pipeline
    if (len(target_table_dict) == 0) or (len(target_table_dict) == 1 and target_table_dict.get("workspace_attributes") != None):
        if len(target_table_dict) == 0:
            error_message = "No data files or tabular data found. Unable to process table data."
        else:
            error_message = "No data file or tabular data found (workspace attributes only). Unable to process table data."
        logging.error(error_message)
        log_dict["table_data_processing"] = error_message
        log_status = "Error"
        table_list = []
        log_string = json.dumps(log_dict)
        return table_list, log_status, log_string
    
    # Loop through target tables and process files associated with them (limited to tsv/csv for the moment)
    table_list = []
    relationship_list = [] 
    for key in target_table_dict:
        logging.info("Processing files for target table: {}.".format(key))
        try:
            tablename = key
            table_dict = {}
            table_dict["name"] = tablename
            column_list = []

            # Build data frame from source files
            file_iterator = 0
            for file_entry in target_table_dict[key]:
                filename = os.path.split(file_entry)[1]
                file_iterator += 1
                full_file_path = "gs://" + ws_bucket_name + "/" + file_entry
                if re.match(".+(\.tsv)$", file_entry):
                    delim = "\t"
                else:
                    delim = ","
                if file_iterator == 1:
                    df = pd.read_csv(full_file_path, delimiter = delim)
                    df["ingest_provenance"] = filename
                else:
                    df_int = pd.read_csv(full_file_path, delimiter = delim)
                    df_int["ingest_provenance"] = filename
                    df = pd.concat([df, df_int], ignore_index=True)
            
            # Perform initial dataframe clean up (encode names, deal with duplicate names, attempt to convert data types, replace NaN, etc.)
            original_cols = list(df.columns)
            df.rename(columns=lambda x: utils.encode_name(x), inplace=True)
            processed_cols = []
            duplicate_cols = []
            for idx, column in enumerate(list(df.columns)):
                if column not in processed_cols:
                    processed_cols.append(column)
                else:
                    duplicate_cols.append(column)
                    col_suffix = "_" + str(duplicate_cols.count(column) + 1)
                    new_column_name = column + col_suffix
                    processed_cols.append(new_column_name)
            df.columns = processed_cols
            column_lookup = {}
            for idx, orig_col in enumerate(original_cols):
                column_lookup[orig_col] = processed_cols[idx]
            df = df.where((pd.notnull(df)), None)
            df = df.convert_dtypes()
            
            # Set file reference variables
            fileref_tablename = file_table_name
            fileref_columnname = "file_ref"
            fileref_id_columnname = "file_id"
            
            # Loop through columns and build TDR schema table entry from dataframe
            array_cols = []
            for column in df.columns:
                
                # Scan column to determine if it contains arrays
                scan_rows = min(df[column].size, 100)
                contains_lists = False
                for i in range(0, scan_rows):
                    col_val = df.at[i, column]
                    try:
                        col_val = json.loads(col_val)
                        if isinstance(col_val, list):
                            contains_lists = True
                            array_cols.append(column)
                            break
                    except:
                        pass

                # If list column, convert values to lists and build column dictionary
                if contains_lists:
                    df[column] = df.apply(lambda x: utils.convert_to_list(x[column]) if(pd.notnull(x[column])) else [], axis=1)
                    column_dict = {}
                    column_dict["name"] = column
                    column_dict["array_of"] = True 
                    column_dict["datatype"] = "string"
                    column_list.append(column_dict)
                else:
                    # Build column dictionary
                    column_dict = {}
                    column_dict["name"] = column
                    column_dict["array_of"] = False 
                    base_type = str(df[column].dtype)
                    mapped_type = utils.map_datatype(base_type)
                    
                    # Force convert unknown type objects to strings (or fileref for file inventory file_ref fields)
                    if create_file_table == True and tablename == fileref_tablename and column == fileref_columnname:
                        mapped_type = "fileref"
                        df[column] = df.apply(lambda x: json.loads(x[column].replace("\'", "\"")), axis=1)
                    elif mapped_type == "other":
                        mapped_type = "string"
                        df[column] = df[column].astype("string")
                    
                    # Set column datatype and append to column list
                    column_dict["datatype"] = mapped_type
                    column_list.append(column_dict)

            # Check for and build file references as necessary
            if tablename in data_file_refs:
                for column_entry in data_file_refs[tablename]:
                    if column_lookup.get(column_entry["column"]):
                        encoded_col_name = column_lookup[column_entry["column"]]
                    else:
                        encoded_col_name = utils.encode_name(column_entry["column"])
                    data_file_ref_mode = column_entry["mode"]

                    # If column exists in the data frame, determine whether or not a new field will be created, and the method that will be used for building the file reference
                    # Once determined, execute the appropriate file reference building function and then update the final schema to match the inputted parameters
                    if encoded_col_name in df.columns and column_entry["method"] in ["file_path_match", "tdr_file_id"]:
                        
                        # Derive common variables
                        if data_file_ref_mode == "fileref_table_ref" and create_file_table == True:
                            return_field = "file_id"
                        else:
                            return_field = "file_ref"
                        if encoded_col_name in array_cols or column_entry["match_multiple_files"] == True:
                            array_of = True
                        else:
                            array_of = False
                        
                        # If not creating a new field, update existing field
                        if column_entry["create_new_field"] == False:
                            # If method is file_path_match, replace column values with file IDs or references, otherwise leave columns alone
                            if column_entry["method"] in ["file_path_match"]:
                                df[encoded_col_name] = df.apply(lambda x: utils.find_file_in_inventory(x[encoded_col_name], file_lookup, return_field, column_entry["match_multiple_files"], column_entry["match_regex"], column_entry["match_type"]), axis=1)
                            # Update the column_list entry for the schema as appropriate
                            for idx, val in enumerate(column_list):
                                if val["name"] == encoded_col_name:
                                    col_list_idx = idx
                            column_list[col_list_idx]["array_of"] = array_of
                            if data_file_ref_mode == "fileref_in_line" or column_entry["method"] == "tdr_file_id":
                                column_list[col_list_idx]["datatype"] = "fileref"
                            else:
                                column_list[col_list_idx]["datatype"] = "string"
                                # If file reference mode is "fileref_table" and method is not "tdr_file_id", add the appropriate relationship to link the file reference to the new file table
                                rel_dict = utils.construct_relationship(tablename, encoded_col_name, fileref_tablename, fileref_id_columnname)
                                relationship_list.append(rel_dict)
                        
                        # Otherwise, create new field
                        else:
                            # Record or derive new field name
                            if column_entry["new_field_name"] != None:
                                new_col_name = utils.encode_name(column_entry["new_field_name"])
                            else:
                                new_col_name = encoded_col_name + "_fileref"
                            # If method is file_path_match, replace column values with file IDs or references, otherwise leave columns alone
                            if column_entry["method"] in ["file_path_match"]:
                                df[new_col_name] = df.apply(lambda x: utils.find_file_in_inventory(x[encoded_col_name], file_lookup, return_field, column_entry["match_multiple_files"], column_entry["match_regex"], column_entry["match_type"]), axis=1)
                            # Create new column list entry for schema
                            column_dict = {}
                            column_dict["name"] = new_col_name
                            column_dict["array_of"] = array_of 
                            if data_file_ref_mode == "fileref_in_line" or column_entry["method"] == "tdr_file_id":
                                column_dict["datatype"] = "fileref"
                            else:
                                column_dict["datatype"] = "string"
                                # If file reference mode is "fileref_table" and method is not "tdr_file_id", add the appropriate relationship to link the file reference to the new file table
                                rel_dict = utils.construct_relationship(tablename, new_col_name, fileref_tablename, fileref_id_columnname)
                                relationship_list.append(rel_dict)
                            column_list.append(column_dict)

            # Add column list to table dict and table_dict to table_list
            table_dict["columns"] = column_list
            table_dict["primaryKey"] = []
            table_list.append(table_dict)

            # Write out file in chunks (based on rows*fileref columns)
            records_json = df.to_json(orient="records")
            records_list = json.loads(records_json)
            records_cnt = len(records_list)
            fileref_cnt = 0
            for col_entry in column_list:
                if col_entry["datatype"] == "fileref":
                    fileref_cnt += 1
            if fileref_cnt > 0:
                combined_rec_ref_size = records_cnt*fileref_cnt
                chunk_cnt = math.ceil(combined_rec_ref_size/max_combined_rec_ref_size)
                chunk_size = math.ceil(records_cnt/chunk_cnt)
            else:
                chunk_cnt = 1
                chunk_size = records_cnt
            for i in range(0, chunk_cnt):
                if i == 0:
                    start_row = 0
                    end_row = chunk_size
                else:
                    start_row = (i*chunk_size) + 1
                    end_row = min((i+1)*chunk_size, records_cnt)
                destination_file = tablename + "_" + str(i) + ".json"
                with open(destination_file, "w") as outfile:
                    for idx, val in enumerate(records_list):
                        if idx >= start_row and idx <= end_row:
                            json.dump(val, outfile)
                            if idx < end_row:
                                outfile.write("\n")
                !gsutil cp $destination_file $ws_bucket/$el_output_dir/$tablename/ 2> stdout 
            log_dict[key] = "No errors raised"
        except Exception as e:
            error_message = "Error processing files for target table: {}. Error message: {}".format(key, e)
            logging.info(error_message)
            log_dict[key] = error_message
            log_status = "Error"

    # Finish building TDR schema object
    logging.info("Creating schema object and copying to cloud storage.")
    schema_dict = {}
    schema_dict["tables"] = table_list
    schema_dict["relationships"] = relationship_list

    # Write out schema file, copy to workspace bucket, and delete from notebook environment
    destination_file = "tdr_schema_object.json"
    with open(destination_file, "w") as outfile:
        json.dump(schema_dict, outfile)
    !gsutil cp $destination_file $ws_bucket/$el_schema_file 2> stdout
    
    # Remove .json files
    !rm *.json
    
    # Build and return table list for use in ingest
    table_list = list(target_table_dict.keys())
    log_string = json.dumps(log_dict)
    logging.info("File processing complete. Status: {}. Details: {}. Tables to ingest: {}".format(log_status, log_string, ", ".join(table_list)))
    return table_list, log_status, log_string
    

In [None]:
# # Test - General
# params = {}
# ws_project = "anvil-datastorage"
# ws_name = "AnVIL_NIA_CARD_LR_WGS_HBCC_Staging"
# ws_attributes = utils.get_workspace_attributes(ws_project, ws_name, "https://api.firecloud.org")
# params["workspace_bucket"] = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else "" 
# params["data_files_src_buckets"] = {}
# params["data_files_src_buckets"][params["workspace_bucket"]] = {
#             "include_dirs": [],
#             "exclude_dirs": []
# }
# params["google_project"] = "terra-92e58ed4"
# params["max_combined_rec_ref_size"] = 40000
# #params["input_dir"] = "ingest_pipeline/input/{}/table_data".format("test")
# #params["file_inventory_dir"] = "ingest_pipeline/input/{}/data_files/file_inventory".format("test")
# params["input_dir"] = "ingest_pipeline/input/{}/table_data".format(ws_name)
# params["file_inventory_dir"] = "ingest_pipeline/input/{}/data_files/file_inventory".format(ws_name)
# params["el_output_dir"] = "ingest_pipeline/output/source/test/table_data"
# params["el_schema_file"] = "ingest_pipeline/output/source/test/schema/tdr_schema_object.json"
# params["create_file_table"] = True
# params["file_table_name"] = "file_inventory"
# data_file_refs_dict = {   
# }
# file_ref_list, params["data_file_refs"], params["data_files_src_buckets"], remote_list = utils.find_and_add_fileref_fields(ws_project, ws_name, params["workspace_bucket"], data_file_refs_dict, params["data_files_src_buckets"], base_url="https://api.firecloud.org")
# params["run_env"] = "prod"
# params["global_file_exclusions"] = ["SubsetHailJointCall", ".vds/", "ingest_ignore"]
# file_inventory, retry_count = bfi.build_inventory(params)
# params["file_inventory"] = file_inventory
# #params["file_inventory"] = []
# target_tables, log_status, log_string = process_table_data(params)
# # file_lookup = {}
# # for file in file_inventory:
# #     file_lookup[file["uri"]] = {
# #         "file_id": file["file_id"],
# #         "file_ref": file["file_ref"] 
# #     } 


In [None]:
# # Test - File reference building
# ws_name = "test"
# file_inventory_dir = "ingest_pipeline/input/{}/data_files/file_inventory".format(ws_name)
# inventory_file_path = "gs://" + ws_bucket_name + "/" + file_inventory_dir + "/file_inventory.tsv"
# df_inv = pd.read_csv(inventory_file_path, delimiter = "\t")
# file_inventory = df_inv.to_dict(orient='records')

# # Format file_inventory to support file ref lookup
# file_lookup = {}
# for file in file_inventory:
#     file_lookup[file["uri"]] = {
#         "file_id": file["file_id"],
#         "file_ref": file["file_ref"] 
#     } 

# value = "gs://fc-secure-6513d7e1-2dbb-41a2-baea-3f7fdbcbb620/AnVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES/RP-2098/Exome/AF100017/v1/AF100017.cram"
# return_field = "file_ref"
# match_multiple_files = True
# match_regex = None
# match_type = "exact"
# return_val = utils.find_file_in_inventory(value, file_lookup, return_field, match_multiple_files, match_regex, match_type)
# print(return_val)
