In [10]:
## imports and environment variables
# imports
import pandas as pd
import json
import re
import os
from google.cloud import storage

# Raise the pandas row display limit so long result tables are shown in full
pd.set_option('display.max_rows', 1000)

# Workspace settings, read from environment variables (KeyError if one is unset)
ws_name, ws_project, ws_bucket = (
    os.environ[env_var]
    for env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bare bucket name: the bucket value with a leading 'gs://' removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Echo the resolved settings so the notebook output records them
print(f"workspace name = {ws_name}")
print(f"workspace project = {ws_project}")
print(f"workspace bucket = {ws_bucket}")
print(f"workspace bucket name = {ws_bucket_name}")

workspace name = anvil_cmg_ingest_resources
workspace project = dsp-data-ingest
workspace bucket = gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46
workspace bucket name = fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46


In [11]:
# Function to clean column name
def clean_column_name(string):
    """Return a normalized version of a raw column name.

    Every hyphen is turned into an underscore, then a leading run of digits
    followed by an underscore (for example ``01_``) is removed.

    Args:
        string: Raw column name.

    Returns:
        The cleaned column name.
    """
    hyphens_replaced = '_'.join(string.split('-'))
    return re.sub('^[0-9]+_', '', hyphens_replaced)

# Read the workspace_schemas.csv file into dataframe, clean column names, and convert to dict
ws_schema_filepath = ws_bucket + '/utility/transforms_compatibility/input/workspace_schemas.csv'
df = pd.read_csv(ws_schema_filepath)

# Null column names are passed through untouched; everything else is stringified and cleaned.
df['clean_column_name'] = df['column_name'].map(
    lambda name: clean_column_name(str(name)) if pd.notnull(name) else name
)

# Build the qualified 'table.column' name. Rows missing either part get None instead of
# raising TypeError on str + NaN, so a row with no column name no longer aborts the cell.
df['clean_table_column_name'] = [
    table + '.' + column if pd.notnull(table) and pd.notnull(column) else None
    for table, column in zip(df['table_name'], df['clean_column_name'])
]

# Collect the distinct tables and qualified columns per workspace. Nulls are dropped so the
# sets hold only strings and can be sorted below.
df_tabs = (
    df.groupby('workspace_name')['table_name']
    .apply(lambda names: set(names.dropna()))
    .reset_index(name='table_list')
)
df_cols = (
    df.groupby('workspace_name')['clean_table_column_name']
    .apply(lambda names: set(names.dropna()))
    .reset_index(name='column_list')
)
df_join = df_tabs.merge(df_cols, on='workspace_name')
int_ws_dict = df_join.to_dict(orient='records')

# Re-organize workspace dict to make it easier to work with:
# {workspace_name: {'table_list': [...sorted...], 'column_list': [...sorted...]}}
workspace_dict = {}
for entry in int_ws_dict:
    table_list = sorted(entry['table_list'])
    column_set = set(entry['column_list'])
    # Add the '<table>.<table>_id' column for every table when the schema does not list it
    column_set.update(table + '.' + table + '_id' for table in table_list)
    workspace_dict[entry['workspace_name']] = {
        'table_list': table_list,
        'column_list': sorted(column_set),
    }
    
# Download transforms.json from the workspace bucket and parse it into transforms_dict
transforms_blob_path = 'utility/transforms_compatibility/input/transforms.json'
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
blob = bucket.blob(transforms_blob_path)
transforms_payload = blob.download_as_string(client=None)
transforms_dict = json.loads(transforms_payload)

# Loop through workspaces and evaluate against transforms
def _split_by_presence(columns, available_columns):
    """Split `columns` into (present, missing) lists by membership in `available_columns`."""
    present = [col for col in columns if col in available_columns]
    missing = [col for col in columns if col not in available_columns]
    return present, missing


def _evaluate_transform(transform, available_tables, available_columns):
    """Check one transform definition against a workspace's tables and columns.

    Args:
        transform: Transform definition holding the 'required_tables',
            'required_columns', 'mapped_columns' and 'passthrough_columns' lists.
        available_tables: Set of table names present in the workspace.
        available_columns: Set of 'table.column' names present in the workspace.

    Returns:
        dict with 'can_run' (True when every required table and column is present;
        empty requirement lists count as satisfied) plus the present/missing lists
        and their counts for the mapped and passthrough columns.
    """
    req_tables_found = all(tab in available_tables for tab in transform['required_tables'])
    req_columns_found = all(col in available_columns for col in transform['required_columns'])
    mapped_present, mapped_missing = _split_by_presence(
        transform['mapped_columns'], available_columns)
    passthrough_present, passthrough_missing = _split_by_presence(
        transform['passthrough_columns'], available_columns)
    return {
        'can_run': req_tables_found and req_columns_found,
        'mapped_columns_present_count': len(mapped_present),
        'mapped_columns_present': mapped_present,
        'mapped_columns_missing_count': len(mapped_missing),
        'mapped_columns_missing': mapped_missing,
        'passthrough_columns_present_count': len(passthrough_present),
        'passthrough_columns_present': passthrough_present,
        'passthrough_columns_missing_count': len(passthrough_missing),
        'passthrough_columns_missing': passthrough_missing,
    }


results_dict = {}
for ws_key, ws_entry in workspace_dict.items():
    ws_tables = ws_entry['table_list']
    ws_columns = ws_entry['column_list']
    # Sets give constant-time membership checks; the lists keep the output ordering.
    ws_table_set = set(ws_tables)
    ws_column_set = set(ws_columns)

    detail_dict = {}
    can_run_list = []
    cannot_run_list = []
    columns_used_set = set()
    for t_key, transform in transforms_dict.items():
        int_detail_dict = _evaluate_transform(transform, ws_table_set, ws_column_set)
        if int_detail_dict['can_run']:
            can_run_list.append(t_key)
        else:
            cannot_run_list.append(t_key)
        # A workspace column counts as used when any transform maps it or passes it through
        columns_used_set.update(int_detail_dict['mapped_columns_present'])
        columns_used_set.update(int_detail_dict['passthrough_columns_present'])
        detail_dict[t_key] = int_detail_dict

    # Record unused tables and columns
    unused_columns_list = [col for col in ws_columns if col not in columns_used_set]
    # A table is unused when none of its own columns is used. Columns are matched on the
    # '<table>.' prefix; a plain substring test would also match other tables' columns
    # (e.g. 'sample' inside 'sample_set.sample_set_id' or 'subject.sample_id').
    unused_tables_list = [
        tab for tab in ws_tables
        if not any(col.startswith(tab + '.') for col in columns_used_set)
    ]

    # Build final results dict entry
    results_dict[ws_key] = {
        'transform_summary': {
            'can_run': can_run_list,
            'cannot_run': cannot_run_list,
            'unused_tables': unused_tables_list,
            'unused_columns': unused_columns_list,
        },
        'transform_details': detail_dict,
    }

# Write out results dict as json and transfer to GCS
destination_file = 'transforms_compatibility_results.json'
destination_dir = 'utility/transforms_compatibility/output'
# Upload straight from memory with the storage client. Unlike the shell copy with stderr
# redirected to a local file, a failed upload raises here instead of passing unnoticed,
# and no temporary local file has to be written and removed.
results_blob = storage.Client().bucket(ws_bucket_name).blob(f"{destination_dir}/{destination_file}")
results_blob.upload_from_string(json.dumps(results_dict), content_type='application/json')
    

In [12]:
# Loop through workspaces and collect base stats into a dataframe
results_list = []
for ws_key, ws_results in results_dict.items():
    summary = ws_results['transform_summary']
    # Only transforms that can run contribute their present mapped/passthrough columns
    present_column_count = sum(
        detail['mapped_columns_present_count'] + detail['passthrough_columns_present_count']
        for detail in ws_results['transform_details'].values()
        if detail['can_run']
    )
    # Row order must match the column names passed to pd.DataFrame below
    results_list.append([
        ws_key,
        len(summary['can_run']),
        summary['cannot_run'],
        present_column_count,
        len(summary['unused_columns']),
    ])
results_df = pd.DataFrame(results_list, columns = ['workspace', 'compatible_transforms', 'incompatible_transforms', 'transform_columns_found', 'unused_columns'])

# Order dataframe by transform_columns_found desc, unused_columns asc, and compatible_transforms desc.
# The sorted frame is the cell's last expression so it is displayed; results_df itself stays unsorted.
results_df.sort_values(['transform_columns_found', 'unused_columns', 'compatible_transforms'], ascending=[False, True, False], ignore_index=True)



Unnamed: 0,workspace,compatible_transforms,incompatible_transforms,transform_columns_found,unused_columns
0,cmg_input_specification,7,[],97,1
1,anvil_cmg_uwash_gru,6,[variantcall],51,3
2,anvil_cmg_broad_muscle_beggs_wes,7,[],46,189
3,anvil_cmg_uwash_hmb,6,[variantcall],45,0
4,anvil_cmg_broad_heart_seidman_wes,7,[],44,69
5,anvil_cmg_broad_muscle_ravenscroft_wes,7,[],44,69
6,anvil_cmg_broad_muscle_myoseq_wgs,7,[],44,71
7,anvil_cmg_broad_heart_pcgc-tristani_wgs,7,[],44,72
8,anvil_cmg_broad_kidney_hildebrandt_wgs,7,[],44,72
9,anvil_cmg_broad_orphan_manton_wgs,7,[],44,72


In [13]:
# Print detailed results (transform summary and per-transform details) for a specific workspace
print(json.dumps(results_dict['cmg_input_specification'], indent=2))

{
  "transform_summary": {
    "can_run": [
      "biosample",
      "donor",
      "diagnosis",
      "familymember",
      "file",
      "sequencingactivity",
      "variantcall"
    ],
    "cannot_run": [],
    "unused_tables": [],
    "unused_columns": [
      "family.solve_state"
    ]
  },
  "transform_details": {
    "biosample": {
      "can_run": true,
      "mapped_columns_present_count": 5,
      "mapped_columns_present": [
        "sample.sample_id",
        "sample.subject_id",
        "sample.sample_source",
        "sample.dbgap_sample_id",
        "subject.subject_id"
      ],
      "mapped_columns_missing_count": 0,
      "mapped_columns_missing": [],
      "passthrough_columns_present_count": 4,
      "passthrough_columns_present": [
        "subject.phenotype_group",
        "subject.phenotype_description",
        "sample.tissue_affected_status",
        "sample.sequencing_center"
      ],
      "passthrough_columns_missing_count": 0,
      "passthrough_columns_miss

In [16]:
# Print detailed results (transform summary and per-transform details) for the anvil_cmg_uwash_gru workspace
print(json.dumps(results_dict['anvil_cmg_uwash_gru'], indent=2))

{
  "transform_summary": {
    "can_run": [
      "biosample",
      "donor",
      "diagnosis",
      "familymember",
      "file",
      "sequencingactivity"
    ],
    "cannot_run": [
      "variantcall"
    ],
    "unused_tables": [],
    "unused_columns": [
      "sample.data_type",
      "sample.date_data_generation",
      "sample.sample_provider"
    ]
  },
  "transform_details": {
    "biosample": {
      "can_run": true,
      "mapped_columns_present_count": 5,
      "mapped_columns_present": [
        "sample.sample_id",
        "sample.subject_id",
        "sample.sample_source",
        "sample.dbgap_sample_id",
        "subject.subject_id"
      ],
      "mapped_columns_missing_count": 0,
      "mapped_columns_missing": [],
      "passthrough_columns_present_count": 2,
      "passthrough_columns_present": [
        "subject.phenotype_description",
        "sample.sequencing_center"
      ],
      "passthrough_columns_missing_count": 2,
      "passthrough_columns_missing":