The BigQuery Python client library provides a magic command that allows you to run queries with minimal code. Type the following Python code into the next cell to import the BigQuery Python client library and initialize a client. The BigQuery client is used to send and receive messages from the BigQuery API.

In [1]:
import sys
# Optional: uncomment the next line to install/upgrade the required packages
# into the environment of the running kernel.
#!{sys.executable} -m pip install --upgrade google-cloud-bigquery matplotlib numpy scipy
from google.cloud import bigquery
# BigQuery client used by the later cells to run the monitoring query.
client = bigquery.Client()
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import json
import re

import datetime
from datetime import timedelta

Enter the ID of the workflow you want to plot along with the IDs of its subworkflows, main workflow first. Also set the Cromwell execution bucket in which the workflow directory is expected to exist.

In [2]:
# Select exactly one preset below: the parent (main) workflow ID comes first,
# followed by the IDs of its sub-workflows. Every ID is wrapped in double quotes
# so that the comma-joined list can be placed directly inside a SQL IN (...) clause.

# Control 2020-03-20
#workflow_ids = ['"{}"'.format(wf_id) for wf_id in (
#    "8bcb5d62-434b-4982-9969-8d50d99a4a98",  # Control | main | small dataset | 47
#    "c3df9d5d-5ec4-4299-8d4a-ccf6c364340f",  # Control | sub | small dataset | 47 | ScatterAt32_18
#)]

# Control 05/09/2020
#workflow_ids = ['"{}"'.format(wf_id) for wf_id in (
#    "a0a92771-c925-49b4-887e-877deeacc742",  # Control | main | typical dataset | 47
#    "9837e566-0f37-444b-be3e-a135616f4c6e",  # Control | sub | typical dataset | 47 | ScatterAt32_18
#)]

# Spark 2020/05/08
#workflow_ids = ['"{}"'.format(wf_id) for wf_id in (
#    "93b8224b-e7f1-4226-adfa-01f3da15373d",  # Spark | main | typical dataset | 45
#    "2947c1f7-1a84-4724-abb8-c7c25c047dd5",  # Spark | sub | typical dataset | 45 PBCCSOnlySingleFlowcell.ShardUBAM
#    "6e0a7851-8618-4c40-a04d-a6c0ce125be6",  # Spark | sub | typical dataset | 45 ScatterAt38_18
#)]

# Bri 05/29/2020
workflow_ids = ['"{}"'.format(wf_id) for wf_id in (
    "8d61ce56-9d67-4d25-9c54-fe9d09a7830e",  # Bri | main | typical dataset | 47
    "237f38b5-0e01-442b-b876-a6b07a03ea47",  # Bri | sub | typical dataset | 47
)]
#-------------------------
# The parent workflow ID without its surrounding double quotes.
PARENT_WORKFLOW_ID = workflow_ids[0][1:-1]
# Comma-separated, quoted IDs ready for interpolation into the query.
formated_workflow_ids = ",".join(workflow_ids)

# Bucket that holds the Cromwell execution directories.
CROMWELL_EXEC_BUCKET = "broad-methods-cromwell-exec-bucket-v47"

Use the Client.query() method to run a query. The next cell retrieves all monitoring records for the selected workflow IDs and adds a `meta_duration_sec` column holding the number of seconds between `meta_start_time` and `meta_end_time`.

In [None]:
# Build the monitoring query: every column of the source table plus
# meta_duration_sec (seconds between meta_start_time and meta_end_time),
# restricted to rows whose runtime_workflow_id is one of the quoted IDs in
# formated_workflow_ids.
# The `#`-prefixed table name inside the string is an alternative source left
# commented out within the SQL text (presumably a 7-day view, judging by its
# name; confirm before switching).
sql = f"""
    
SELECT
  *,
  TIMESTAMP_DIFF(meta_end_time, meta_start_time, SECOND) meta_duration_sec
FROM
  #broad-dsde-methods.cromwell_monitoring.monitor_everything_back_7days
  broad-dsde-methods.cromwell_monitoring.monitor_everything_between_dates
WHERE
  runtime_workflow_id IN ({formated_workflow_ids})
    
"""
# Run the query and load the full result set into a DataFrame.
df_monitoring = client.query(sql).to_dataframe()
# Show the first rows of the result as the cell output.
df_monitoring.head()

In [None]:
# Collect the distinct task call names found in the monitoring data and
# print them one per line under a short header.
TaskNames = df_monitoring["meta_task_call_name"].unique()
print("The tasks in this workflow are:\n--------")
print("\n".join(str(name) for name in TaskNames))

The next cell plots the monitoring data from the dataframe per task and per shard (for tasks with more than 10 shards, only the 10 longest-running shards are plotted) and writes the plots to a PDF file.

In [None]:
### Change the plot size here
plt.rcParams["figure.figsize"] = (15, 20)

# Upper bound on the number of shards plotted per task; when a task has more
# shards than this, only the longest-running ones are plotted.
max_shards = 10


def _max_per_sample(column):
    """Return the maximum of each list-valued entry in *column* as a list."""
    return [np.asarray(sample).max() for sample in column]


def _requested_runtime_attrs(inputs):
    """Map cleaned-up input keys to their values.

    Entries whose key mentions "default_attr" or "runtime_attr_override" are
    skipped. For the remaining keys the brackets, the "runtime_attr" prefix and
    the first two double quotes are removed.
    """
    attrs = {}
    for element in inputs:
        key = element["key"]
        if "default_attr" in key or "runtime_attr_override" in key:
            continue
        cleaned = key.replace("[", "").replace("]", "").replace("runtime_attr", "").replace("\"", "", 2)
        attrs[cleaned] = element["value"]
    return attrs


def _finish_panel(y_label, duration_sec, show_legend=True):
    """Add legend, axis labels, the secondary duration axis and the grid to the current subplot."""
    if show_legend:
        plt.legend(loc='upper center', bbox_to_anchor=(1.20, 0.8), shadow=True, ncol=1)
    plt.ylabel(y_label)
    plt.xlabel("Date Time")
    # Secondary x axis on top, scaled from 0 to the sampled duration in seconds.
    duration_axis = plt.twiny()
    duration_axis.set_xlim(0, duration_sec)
    duration_axis.set_xlabel("Duration Time")
    plt.grid(True)


with PdfPages(PARENT_WORKFLOW_ID + '_resource_monitoring.pdf') as pdf:

    for task_name in TaskNames:

        # All monitoring rows of this task; filtered once and reused below.
        task_rows = df_monitoring.loc[df_monitoring['meta_task_call_name'] == task_name]
        shards = task_rows.meta_shard.sort_values().unique()

        if len(shards) > max_shards:
            # Keep only the longest-running shards. The table holds one row per
            # metrics sample, so shards must be de-duplicated before taking the
            # head; otherwise the rows of the single longest shard fill every slot
            # and the same shard is plotted over and over.
            shards = (
                task_rows.sort_values(by='meta_duration_sec', ascending=False)
                .meta_shard.drop_duplicates()
                .head(max_shards)
            )

        for shard in shards:

            shard_rows = task_rows.loc[task_rows['meta_shard'] == shard].sort_values(by='metrics_timestamp')
            first_row = shard_rows.iloc[0]
            task_shard_meta_duration = first_row.at['meta_duration_sec']
            # Time span covered by the collected samples, in seconds.
            task_shard_duration = (
                shard_rows['metrics_timestamp'].max() - shard_rows['metrics_timestamp'].min()
            ).total_seconds()
            timestamps = shard_rows.metrics_timestamp.astype('O')

            # Collapse the list-valued metric columns to one value per sample.
            cpu_used_percent_array = _max_per_sample(shard_rows.metrics_cpu_used_percent)
            disk_used_gb_array = _max_per_sample(shard_rows.metrics_disk_used_gb)
            disk_read_iops_array = _max_per_sample(shard_rows.metrics_disk_read_iops)
            disk_write_iops_array = _max_per_sample(shard_rows.metrics_disk_write_iops)

            runtime_dic = _requested_runtime_attrs(first_row.at['meta_inputs'])
            mem_total_gb = first_row.at['meta_mem_total_gb']
            disk_total_gb = max(first_row.at['meta_disk_total_gb'])

            # CPU panel. Requested values fall back to "N/A" so that a task
            # without the corresponding runtime attribute does not abort the report.
            plt.subplot(5, 1, 1)
            plt.title("Task Name: " + task_name + " Shard: " + str(shard) + " Duration: " +  str(task_shard_meta_duration), fontsize=20)
            plt.plot(timestamps, cpu_used_percent_array, label='CPU Used')
            plt.plot([], [], ' ', label='Obtained CPU Cores: {}' .format(first_row.at['meta_cpu']))
            plt.plot([], [], ' ', label='Requested CPU Cores: {}' .format(runtime_dic.get("cpu_cores", "N/A")))
            _finish_panel('CPU Percentage Used', task_shard_duration)

            # Memory panel.
            plt.subplot(5, 1, 2)
            plt.plot(timestamps, shard_rows.metrics_mem_used_gb, label='Memory Used')
            plt.axhline(y=mem_total_gb, color='r', label='Max Memory GB: %.2f' % mem_total_gb)
            plt.plot([], [], ' ', label='Requested Memory GB: {}' .format(runtime_dic.get("mem_gb", "N/A")))
            _finish_panel('Memory Used in GB', task_shard_duration)

            # Disk usage panel.
            plt.subplot(5, 1, 3)
            plt.plot(timestamps, disk_used_gb_array, label='Disk Used')
            plt.axhline(y=disk_total_gb, color='r', label='Max Disksize GB: %.2f' % disk_total_gb)
            plt.plot([], [], ' ', label='Requested Disksize GB: {}' .format(runtime_dic.get("disk_gb", "N/A")))
            _finish_panel('Diskspace Used in GB', task_shard_duration)

            # Disk read IOPS panel.
            plt.subplot(5, 1, 4)
            plt.plot(timestamps, disk_read_iops_array)
            _finish_panel('Disk Read IOps', task_shard_duration, show_legend=False)

            # Disk write IOPS panel.
            plt.subplot(5, 1, 5)
            plt.plot(timestamps, disk_write_iops_array)
            _finish_panel('Disk Write_IOps', task_shard_duration, show_legend=False)

            # Adjust the spacing before saving so the PDF page matches what is shown.
            plt.subplots_adjust(hspace = 0.5)
            pdf.savefig(bbox_inches='tight', pad_inches=0.5)
            plt.show()
            plt.close()

This cell copies the plot PDF into the workflow's directory in the Cromwell execution bucket.

In [None]:
#Requires that user (or Terra user proxy) has edit access to destination bucket
WORKFLOW_NAME=df_meta.iloc[0].at['workflow_name'] 
!gsutil cp ./{PARENT_WORKFLOW_ID}_resource_monitoring.pdf gs://{CROMWELL_EXEC_BUCKET}/{WORKFLOW_NAME}/{WORKFLOWID}/{WORKFLOWID}_resource_monitoring.pdf