In [1]:
from firecloud import api as fapi
import json
import os
import pandas as pd
import csv
from io import StringIO
from google.cloud import storage
from typing import List, Set
from enum import Enum
from dataclasses import asdict, dataclass, field
from typing import Optional, List, Set

# Workspace environment variables.
# Each lookup uses os.environ[...] and therefore raises KeyError when the
# variable is not set in the notebook environment.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
# ws_bucket is interpolated into the gsutil copy destination inside transform()
ws_bucket = os.environ["WORKSPACE_BUCKET"]

# Echo the resolved values so the cell output records which workspace was used
print(f"workspace name = {ws_name}")
print(f"workspace project = {ws_project}")
print(f"workspace bucket = {ws_bucket}")

workspace name = tdr-anvil-ingest-bjt
workspace project = dsp-data-ingest
workspace bucket = gs://fc-secure-e7856519-5bea-4fec-88ec-dad61673d22f


In [2]:
## Transform Classes
class TransformType(Enum):
    """Kinds of column transform that ``transform_cols`` knows how to apply.

    This is deliberately a plain ``Enum`` and NOT a ``@dataclass``: the
    dataclass decorator would generate a field-less ``__eq__`` (making every
    member compare equal to every other member), set ``__hash__`` to ``None``
    (making members unusable as dict keys / set items) and replace the
    informative enum ``__repr__``.
    """

    # Adds prefix to every item in list in a new column if both columns exist
    CONCAT_STR_TO_LIST_PREFIX = "concat_str_to_list_prefix"

    # Adds suffix to every item in list in a new column if both columns exist
    CONCAT_STR_TO_LIST_SUFFIX = "concat_str_to_list_suffix"

    # Combines multiple columns values into a list in a new column
    COLS_TO_LIST = "cols_to_list"

    # Sets any row with null value in a list column to empty list in a new column
    #NA_TO_LIST = "na_to_list"

    # Runs caller-supplied code (see TransformerTransform.custom_code / custom_defs)
    CUSTOM = "custom"

@dataclass
class TransformerMap:
    """One column mapping consumed by ``map_cols``.

    ``map_cols`` copies the values of ``source_column`` into ``target_column``
    when ``source_column`` exists in the data frame; otherwise the mapping is
    skipped.
    """
    source_column: str  # name of the existing column to copy from
    target_column: str  # name of the column that receives the copied values

@dataclass
class TransformerTransform:
    """One derived-column request consumed by ``transform_cols``."""
    # Input columns; for the concat_* transform types the first entry supplies the
    # string and the second the '|'-delimited list, per transform_cols
    source_columns: List[str]
    # Name of the column that transform_cols writes the result into
    target_column: str
    # Selects which branch of transform_cols is applied
    transform_type: TransformType
    # For the "custom" type: expression text that transform_cols interpolates into
    # ``df.apply(<custom_code>, axis=1)`` and evaluates
    custom_code: Optional[str] = None
    # For the "custom" type: code text that transform_cols exec()s before
    # evaluating custom_code
    custom_defs: Optional[str] = None
        
@dataclass
class TransformerSource:
    """Describes one tab-delimited source file read by ``transform``."""
    # File name, appended to TransformerRequest.input_directory with a '/' separator
    file_name: str
    # Column name the key column carries AFTER loading (rename target in transform)
    primary_key: str
    # Column name as it appears in the file; transform renames it to primary_key
    rename_primary_key: str
    # Not read by any code visible in this notebook section
    # NOTE(review): presumably flags the file to join others onto — confirm
    join_main: Optional[bool] = None
    
@dataclass
class TransformerRequest:
    """Full description of one transform run, consumed by ``transform``."""
    # Directory (or path prefix) that source file names are appended to
    input_directory: str
    # Sub-path under the workspace bucket that the output file is copied to
    output_directory: str
    # Source file descriptions; transform currently reads only the first entry
    source_files: List[TransformerSource]
    # Base name of the output file (transform writes "<destination_table>2.json")
    destination_table: str
    # Columns serialized together into the JSON string column "additional_data"
    passthrough_cols: List[str]
    maps: List[TransformerMap] # Column renames
    # Derived-column requests applied by transform_cols
    transforms: List[TransformerTransform]

def as_dict(obj):
    """Build a dict from an iterable of ``(name, value)`` pairs.

    Pairs whose value is ``None`` are dropped, and ``Enum`` values are replaced
    by their underlying ``.value``.

    NOTE(review): the pair-iterable signature matches what
    ``dataclasses.asdict(..., dict_factory=...)`` passes in — presumably that is
    the intended use; confirm against the caller.
    """
    result = {}
    for name, raw in obj:
        if raw is None:
            continue
        result[name] = raw.value if isinstance(raw, Enum) else raw
    return result

In [14]:
## Transform functions

# Function to convert list represented as string to a list data type
def str_list_to_list(in_str, list_delim):
    """Split ``in_str`` on ``list_delim`` and return the pieces as a list."""
    return in_str.split(list_delim)

# Function to concatenate a string value to each entry in a list (either 'prefix' or 'suffix')
def concat_str_to_list(in_str, in_list, delim='_', mode='prefix'):
    """Attach ``in_str`` to every entry of ``in_list``.

    Args:
        in_str: String to attach.
        in_list: Iterable of strings to attach it to.
        delim: Separator placed between ``in_str`` and each entry.
        mode: ``'prefix'`` yields ``in_str + delim + item``; ``'suffix'`` yields
            ``item + delim + in_str``; any other value returns the entries
            unchanged.

    Returns:
        A new list; ``in_list`` itself is not modified.
    """
    if mode == 'prefix':
        return [in_str + delim + item for item in in_list]
    if mode == 'suffix':
        # Previously referenced the undefined name `instr`, so suffix mode
        # raised NameError on the first item.
        return [item + delim + in_str for item in in_list]
    return list(in_list)

# Function to convert non-null values from a list of columns into a list
def df_cols_to_list(in_list):
    """Return the entries of ``in_list`` for which ``pd.notnull`` is true, in order."""
    return [value for value in in_list if pd.notnull(value)]

def execute_custom_definitions(custom_definitions: str):
    """Execute user-supplied definitions so they remain available afterwards.

    The code is executed against this module's global namespace. Running it
    with a bare ``exec(code)`` inside a function would place every name it
    defines in a temporary local scope that is discarded on return, which
    makes the definitions unusable by later code.

    Args:
        custom_definitions: Python source text (typically ``def`` statements).
            ``None`` or an empty string is a no-op.
    """
    if not custom_definitions:
        return
    # SECURITY: exec() runs arbitrary code with full notebook privileges. Only
    # pass text from a trusted source. TODO: code validation.
    exec(custom_definitions, globals())
    
def execute_custom_code(df: pd.DataFrame, target_column: str, custom_code: str):
    """Evaluate a user-supplied row function and store its results in a column.

    ``custom_code`` is evaluated as a Python expression that must yield
    something ``DataFrame.apply`` accepts (typically a ``lambda row: ...``).
    It is then applied row-wise (``axis=1``) and the result is assigned to
    ``df[target_column]`` in place. The previous implementation passed the
    return value of ``exec()`` (always ``None``) to ``df.apply`` and could
    never succeed.

    Args:
        df: Data frame to read from and to add ``target_column`` to.
        target_column: Name of the column to create or overwrite.
        custom_code: Expression text producing the row function. The expression
            can reference module-level names and ``df``.

    Raises:
        ValueError: If ``custom_code`` is empty or ``None``.
    """
    if not custom_code:
        raise ValueError("custom_code must be a non-empty expression")
    # Use a single namespace so that names are visible inside lambdas too; a
    # copy keeps `df` from leaking into the module globals.
    namespace = dict(globals())
    namespace["df"] = df
    # SECURITY: eval() runs arbitrary code with full notebook privileges. Only
    # pass text from a trusted source. TODO: code validation.
    row_func = eval(custom_code, namespace)
    df[target_column] = df.apply(row_func, axis=1)
    
#TODO Verify Cols Exist Before running transform_cols
def columns_in_df(df: pd.DataFrame, cols: List[str]) -> List[str]:
    """Return the names from ``cols`` that are columns of ``df``, in ``cols`` order."""
    present = []
    for name in cols:
        if name in df.columns:
            present.append(name)
    return present
    
# Applies the requested transforms to the data frame in place and returns the names of the target columns
def _affix_key_to_items(row, key_col, list_col, mode, list_delim='|', delim='_'):
    """Join one row's key_col value to each item of its delimited list_col value."""
    if pd.isnull(row[list_col]):
        return []
    key = str(row[key_col])
    items = str(row[list_col]).split(list_delim)
    if mode == 'suffix':
        return [item + delim + key for item in items]
    return [key + delim + item for item in items]


def transform_cols(df: pd.DataFrame, tfs: List["TransformerTransform"]) -> List[str]:
    """Apply each requested transform to ``df`` in place.

    Supported ``transform_type`` values:

    * ``concat_str_to_list_prefix`` / ``concat_str_to_list_suffix`` -- the
      first source column supplies a string and the second a ``'|'``-delimited
      list; each list item gets the string attached with ``'_'`` as a prefix
      or suffix. Rows whose list value is null produce ``[]``. The transform
      is skipped (with a message) unless both source columns exist.
    * ``cols_to_list`` -- collects the non-null values of whichever source
      columns exist into a list.
    * ``custom`` -- runs ``custom_defs`` (if any) and then evaluates
      ``df.apply(<custom_code>, axis=1)``. Both run in one shared namespace
      so lambdas in ``custom_code`` can call helpers from ``custom_defs``.

    Args:
        df: Data frame to read from and add target columns to.
        tfs: Transform requests (objects with ``source_columns``,
            ``target_column``, ``transform_type``, ``custom_code`` and
            ``custom_defs`` attributes).

    Returns:
        Target column names that were actually created, in request order.

    Raises:
        ValueError: If a ``custom`` transform has no ``custom_code``.
    """
    transformed_cols = []
    for tf in tfs:
        kind = tf.transform_type.value
        print(f"Applying transform_type {kind}")
        new_col = tf.target_column
        present = [col for col in tf.source_columns if col in df.columns]

        if kind in ("concat_str_to_list_prefix", "concat_str_to_list_suffix"):
            required = list(tf.source_columns[:2])
            if len(required) < 2 or any(col not in df.columns for col in required):
                # Positional indexing into only-the-present columns would either
                # raise IndexError or silently pick the wrong columns.
                print(f"Skipping transform for {new_col}: source columns {tf.source_columns} are not all present")
                continue
            key_col, list_col = required
            # The suffix type previously passed 'prefix' (copy-paste slip).
            mode = 'suffix' if kind == "concat_str_to_list_suffix" else 'prefix'
            df[new_col] = df.apply(
                lambda row: _affix_key_to_items(row, key_col, list_col, mode), axis=1
            )
        elif kind == "cols_to_list":
            df[new_col] = df.apply(
                lambda row: [value for value in row[present] if pd.notnull(value)], axis=1
            )
        elif kind == "custom":
            if not tf.custom_code:
                raise ValueError(f"custom transform for {new_col} has no custom_code")
            # One shared namespace: with the default split globals/locals, names
            # created by exec() are not visible inside lambdas evaluated later.
            namespace = dict(globals())
            namespace.update(df=df, tf=tf, cols=present, new_col=new_col)
            # SECURITY: exec()/eval() run arbitrary code with full notebook
            # privileges. Only use requests from a trusted source.
            if tf.custom_defs:
                exec(tf.custom_defs, namespace)
            df[new_col] = eval(f"df.apply({tf.custom_code}, axis=1)", namespace)
        else:
            # Unknown type: nothing was produced, so do not report the column.
            continue
        # Record only columns that now exist so callers can safely select them.
        transformed_cols.append(new_col)
    return transformed_cols

# Apply simple transformations and return list of cols transformed
def map_cols(df: pd.DataFrame, maps: List[TransformerMap]) -> List[str]:
    """Copy each mapped source column of ``df`` into its target column, in place.

    Mappings whose source column is not in ``df`` are skipped. The membership
    check runs per mapping, so a target created by an earlier mapping can be the
    source of a later one.

    Returns:
        The target column names that were written, in mapping order.
    """
    produced = []
    for mapping in maps:
        src = mapping.source_column
        if src not in df.columns:
            continue
        dst = mapping.target_column
        print(f"Mapping field {src} to {dst}")
        df[dst] = df[src]
        produced.append(dst)
    return produced

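# Read the first source file into a data frame, apply the maps and transforms, write the result as newline-delimited JSON and copy it to the workspace bucket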
def transform(req: TransformerRequest) -> pd.DataFrame:
    """Run one transform request end to end and return the resulting data frame.

    Steps, in order:
      1. Read the FIRST entry of ``req.source_files`` as a tab-delimited file
         from ``req.input_directory`` and rename its ``rename_primary_key``
         column to ``primary_key``.
      2. Apply ``req.maps`` (``map_cols``) and ``req.transforms``
         (``transform_cols``), both of which add columns to the frame in place.
      3. Serialize the ``req.passthrough_cols`` that exist in the frame into
         one JSON string per row, stored in an ``additional_data`` column.
      4. Write mapped + transformed + ``additional_data`` columns as
         newline-delimited JSON to ``<destination_table>2.json`` in the
         current working directory.
      5. Copy that file to ``<ws_bucket>/<req.output_directory>/`` with gsutil.

    Args:
        req: The transform request.

    Returns:
        The final data frame (mapped and transformed columns plus
        ``additional_data``).

    Notes:
        Relies on the module-level ``ws_bucket`` and on IPython ``!`` shell
        magic, so it only runs inside a notebook/IPython session. The shell
        line interpolates the local names ``destination_file`` and
        ``destination_dir``; renaming those locals breaks the copy.
    """

    # Only the first source file is processed for now (see TODO below)
    source_file = req.source_files[0]
    
    ## TODO:  Loop through source_files and build dataframes
    src_file_path = req.input_directory + '/' + source_file.file_name
    
    print(f"transform src_file_path: {src_file_path}")
    
    # Tab-delimited read; the column named rename_primary_key becomes primary_key
    df = pd.read_csv(src_file_path, delimiter = '\t').rename(columns = {source_file.rename_primary_key:source_file.primary_key})
    
    # Apply simple col transforms
    
    # TODO: drop any maps or transforms with missing cols
    mapped_cols = map_cols(df, req.maps)
    tf_cols = transform_cols(df, req.transforms)
    # Passthrough columns that are absent from the frame are silently ignored
    passthrough_cols = columns_in_df(df, req.passthrough_cols)
        
    print(f"mapped_cols = {mapped_cols}")
    print(f"tf_cols = {tf_cols}")
    print(f"passthrough_cols = {passthrough_cols}")

    final_col_list = mapped_cols + tf_cols
    df2 = df[final_col_list] # Creating to avoid any cardinality issues when rejoining the passthrough data in the subsequent steps
    
    # Build passthrough string 
    # Sorting makes the key order inside each additional_data JSON string stable
    passthrough_cols.sort()
    passthrough_df = df[passthrough_cols]
    # One JSON object string per row, holding that row's passthrough values
    add_data_df = passthrough_df.apply(lambda x: x.to_json(), axis=1).to_frame()
    add_data_df.columns = ['additional_data']

    # Merge mapped columns with additional data column to build final df
    # (DataFrame.join aligns the two frames on their shared row index)
    df_final = df2[final_col_list].join(add_data_df)

    # Convert dataframe to new-line delimited JSON and write out to file
    destination_dir = req.output_directory
    destination_file = f"{req.destination_table}2.json" #TODO: Remove 2 to write final file
    records_json = df_final.to_json(orient='records') # Converting to JSON string first to replace NaN with nulls
    records_list = json.loads(records_json)
    records_cnt = len(records_list)

    # One JSON object per line; no newline is written after the last record
    with open(destination_file, 'w') as outfile:
        for idx, val in enumerate(records_list):
            json.dump(val, outfile) # Adds escape characters to additional_data field --> Not sure it's a problem
            if idx < (records_cnt - 1):
               outfile.write('\n')
            
    print(f"Writing file {destination_file} {ws_bucket}/{destination_dir}")
    
    # Copy file to workspace bucket
    # IPython shell magic: $name values are interpolated from Python variables.
    # "2> stdout" redirects gsutil's stderr into a FILE named "stdout" in the
    # working directory. NOTE(review): possibly meant "2>&1" — confirm.
    !gsutil cp $destination_file $ws_bucket/$destination_dir/ 2> stdout    


    # Delete tsv files from notebook env - they will persist in designated workspace bucket directory
    #!rm $dest_file
    
    return df_final
