In [1]:
## imports and environment variables
# imports
import csv
import json
import logging
import os
import subprocess
from io import StringIO

import pandas as pd
from firecloud import api as fapi
from google.cloud import storage

# Logging setup: INFO level, "<timestamp> - <LEVEL>: <message>" with a 12-hour clock timestamp
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)

# Workspace settings taken from the process environment (a KeyError is raised if any is unset)
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")

In [2]:
## Transform functions

# Function to convert list represented as string to a list data type
def str_list_to_list(in_str, list_delim):
    """Split a delimited string into a list of its pieces.

    Args:
        in_str: The string to split.
        list_delim: Separator passed to ``str.split`` as ``sep``.

    Returns:
        The list produced by ``in_str.split(sep=list_delim)``.
    """
    return in_str.split(sep=list_delim)

# Function to concatenate a string value to each entry in a list (either 'prefix' or 'suffix')
def concat_str_to_list(in_str, in_list, delim='_', mode='prefix'):
    """Attach a string to every entry of a list, as a prefix or a suffix.

    Args:
        in_str: The string to attach.
        in_list: Iterable of strings to decorate.
        delim: Separator placed between ``in_str`` and each entry.
        mode: ``'prefix'`` yields ``in_str + delim + item``; ``'suffix'`` yields
            ``item + delim + in_str``; any other value leaves entries unchanged.

    Returns:
        A new list; ``in_list`` itself is not modified.
    """
    if mode == 'prefix':
        return [in_str + delim + item for item in in_list]
    if mode == 'suffix':
        # Must use the in_str parameter here; a misspelled name would raise NameError
        return [item + delim + in_str for item in in_list]
    # Unrecognised mode: pass entries through untouched
    return list(in_list)

# Function to convert non-null values from a list of columns into a list
def df_cols_to_list(in_list):
    """Collect the non-null entries of an iterable, preserving their order.

    Args:
        in_list: Iterable of scalar values; each is tested with ``pd.notnull``.

    Returns:
        A new list holding only the entries for which ``pd.notnull`` is true.
    """
    return [value for value in in_list if pd.notnull(value)]

In [3]:
def _read_entity_tsv(src_file_path, id_field):
    """Read a tab-delimited file and rename its 'entity:<id_field>' column to '<id_field>'."""
    return pd.read_csv(src_file_path, delimiter='\t').rename(columns={'entity:' + id_field: id_field})


def _present_columns(candidates, df):
    """Return the names in candidates that are columns of df, keeping candidates' order."""
    return [name for name in candidates if name in df.columns]


def _wrap_non_null(values):
    """Map each non-null value of a Series to a one-element list and each null to []."""
    return values.apply(lambda val: [val] if pd.notnull(val) else [])


def transform(params):
    """Build biosample.json from sample.tsv (and optionally subject.tsv) and copy it to the workspace bucket.

    Reads ``<ws_bucket>/<tf_input_dir>/sample.tsv``, optionally left-joins selected
    columns of ``subject.tsv`` on ``subject_id``, derives the mapped fields
    (``biosample_id``, ``donor_id``, ``sample_type``, ``xref``), packs the passthrough
    fields into a JSON string column ``additional_data``, writes the records as
    newline-delimited JSON to a local ``biosample.json``, copies that file to
    ``<ws_bucket>/<tf_output_dir>/`` with ``gsutil cp`` and removes the local copy.

    Args:
        params: Mapping with the keys ``"tf_input_dir"`` and ``"tf_output_dir"``
            (paths relative to the workspace bucket).

    Returns:
        None. Problems with the required source file are logged as errors and the
        function returns early; problems with the optional file are logged as
        warnings and the file is ignored; a failed upload is logged as an error.

    Raises:
        KeyError: If ``params`` lacks one of the two required keys.
    """
    # Retrieve parameters of interest
    tf_input_dir = params["tf_input_dir"]
    tf_output_dir = params["tf_output_dir"]
    input_prefix = ws_bucket + '/' + tf_input_dir + '/'

    # Read the required source file, distinguishing "missing" from other read failures
    src_file = 'sample.tsv'
    try:
        df_sample = _read_entity_tsv(input_prefix + src_file, 'sample_id')
    except FileNotFoundError:
        logging.error('Source file {src} not found.'.format(src = src_file))
        return
    except Exception as exc:
        logging.error('Source file {src} could not be read: {err}'.format(src = src_file, err = exc))
        return
    missing_key_fields = [field for field in ['sample_id'] if field not in df_sample.columns]
    if missing_key_fields:
        missing_fields_str = ', '.join(missing_key_fields)
        logging.error('Key source fields ({fields}) not found in file ({file}).'.format(fields = missing_fields_str, file = src_file))
        return

    # Read the optional source file; any problem with it only disables the join
    df = df_sample
    src_file = 'subject.tsv'
    df_subject = None
    try:
        df_subject = _read_entity_tsv(input_prefix + src_file, 'subject_id')
    except FileNotFoundError:
        logging.warning('Optional source file {src} not found. File will not be used.'.format(src = src_file))
    except Exception as exc:
        logging.warning('Optional source file {src} could not be read ({err}). File will not be used.'.format(src = src_file, err = exc))

    if df_subject is not None:
        missing_key_fields = [field for field in ['subject_id'] if field not in df_subject.columns]
        if missing_key_fields:
            missing_fields_str = ', '.join(missing_key_fields)
            logging.warning('Key source fields ({fields}) not found in optional file ({file}). File will not be used'.format(fields = missing_fields_str, file = src_file))
        elif df_subject['subject_id'].duplicated().any():
            # A non-unique join key would multiply sample rows
            logging.warning('Field subject_id is not unique in optional file subject.tsv. File will not be used.')
        elif 'subject_id' not in df_sample.columns:
            # Without the key on the sample side there is nothing to join on
            logging.warning('Field subject_id not found in sample.tsv. Optional file subject.tsv will not be used.')
        else:
            subject_cols = _present_columns(['subject_id', 'phenotype_group', 'phenotype_description'], df_subject)
            try:
                df = df_sample.merge(df_subject[subject_cols], on='subject_id', how='left', suffixes=(None,'_sub'))
            except Exception as exc:
                # Keep the join best-effort (e.g. incompatible key dtypes)
                logging.warning('Could not join optional file subject.tsv to sample.tsv ({err}). File will not be used.'.format(err = exc))
                df = df_sample

    # Transform mapped fields (appending new fields to end of existing DF for now)
    if 'sample_id' in df.columns:
        df['biosample_id'] = df['sample_id']
    # (source column, target column) pairs whose values become single-element lists
    list_field_map = (
        ('subject_id', 'donor_id'),
        ('sample_source', 'sample_type'),
        ('dbgap_sample_id', 'xref'),
    )
    for src_col, dest_col in list_field_map:
        if src_col in df.columns:
            # Column-wise apply avoids the dtype upcasting of row-wise DataFrame.apply
            df[dest_col] = _wrap_non_null(df[src_col])

    # Limit DF to transformed and passthrough fields
    mapped_columns = ['biosample_id', 'donor_id', 'sample_type', 'xref']
    passthrough_columns = ['phenotype_group', 'phenotype_description', 'tissue_affected_status']
    mapped_present = _present_columns(mapped_columns, df)
    passthrough_present = sorted(_present_columns(passthrough_columns, df))

    # Build passthrough string: one JSON object per row, keys in sorted column order
    add_data_df = df[passthrough_present].apply(lambda row: row.to_json(), axis=1).to_frame()
    add_data_df.columns = ['additional_data']

    # Merge mapped columns with additional data column to build final df (joined on the row index)
    df_final = df[mapped_present].join(add_data_df)

    # Convert dataframe to new-line delimited JSON and write out to file
    destination_dir = tf_output_dir
    destination_file = 'biosample.json'
    records_json = df_final.to_json(orient='records') # Converting to JSON string first to replace NaN with nulls
    records_list = json.loads(records_json)
    with open(destination_file, 'w') as outfile:
        # One record per line, no trailing newline; additional_data stays an escaped JSON string
        outfile.write('\n'.join(json.dumps(record) for record in records_list))

    # Copy file to workspace bucket, surfacing failures instead of discarding them
    destination_uri = ws_bucket + '/' + destination_dir + '/'
    try:
        result = subprocess.run(
            ['gsutil', 'cp', destination_file, destination_uri],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        if result.returncode != 0:
            logging.error('Failed to copy {file} to {dest}: {err}'.format(file = destination_file, dest = destination_uri, err = result.stderr.strip()))
    except OSError as exc:
        logging.error('Could not run gsutil to copy {file} to {dest}: {err}'.format(file = destination_file, dest = destination_uri, err = exc))
    finally:
        # Delete the local JSON file from the notebook env - the copy in the workspace bucket persists
        os.remove(destination_file)

In [4]:
# Test
# params = {}
# params["tf_input_dir"] = "ingest_pipeline/input/metadata"
# params["tf_output_dir"] = "ingest_pipeline/output/tim_core/metadata"
# transform(params)