In [80]:
# Install the notebook's dependencies with the same Python interpreter that is
# running this kernel (sys.executable), so the packages are importable below.
import sys
!{sys.executable} -m pip install google-cloud-bigquery matplotlib

In [1]:
from google.cloud import bigquery
import warnings

# --- Notebook configuration: edit these values before running the query ---
PROJECT = ""  # Enter your Google Project ID
LOCATION = "US"  # passed as `location` when the query job is submitted
DATASET = "cromwell_monitoring"  # dataset containing the metrics / runtime / metadata tables
DATE_RANGE = "30 DAY"  # interpolated after INTERVAL in the query's timestamp filters
MAX_GB_PROCESSED = 1  # the real query only runs if the dry-run estimate is at most this many GB
MAX_ROWS = 100000  # numerator of the RAND() sampling filter, i.e. the target sample size

# Hide the google.auth warning whose message matches the pattern below
# (emitted when end-user credentials from the Google Cloud SDK are in use).
warnings.filterwarnings(
    action='ignore',
    message='.*user credentials from Google Cloud SDK.*',
    module='google.auth',
)

def monitoring_query(dry_run=False):
    """Submit the task resource-usage sampling query to BigQuery.

    The query aggregates per-instance usage from the `metrics` table, joins it
    with the `runtime` and `metadata` tables of `{PROJECT}.{DATASET}`, keeps
    only rows newer than DATE_RANGE, randomly samples roughly MAX_ROWS of them
    with RAND(), and drops rows whose `inputs_size_gb` is NULL.

    Args:
        dry_run: Forwarded to `bigquery.QueryJobConfig(dry_run=...)`. The
            notebook uses a dry run to read `total_bytes_processed` before
            running the query for real.

    Returns:
        The job object returned by `bigquery.Client.query`.

    Raises:
        ValueError: If PROJECT has not been filled in. Without it the table
            references below would be rendered as `.<dataset>.<table>`.
    """
    # Fail early with an actionable message instead of sending a malformed
    # table name to BigQuery.
    if not PROJECT:
        raise ValueError(
            "PROJECT is empty: set PROJECT to your Google Project ID before running the query."
        )

    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig(dry_run=dry_run)
    query = f"""
        WITH metrics AS (
          SELECT
            instance_id,
            TIMESTAMP_DIFF(MAX(timestamp), MIN(timestamp), SECOND) runtime_duration_sec,
            AVG((SELECT AVG(p) FROM UNNEST(cpu_used_percent) p)) cpu_used_percent_avg,
            MAX(mem_used_gb) mem_used_gb_max,
            [MAX(disk_used_gb[OFFSET(0)]), MAX(disk_used_gb[SAFE_OFFSET(1)])] disk_used_gb_max,
            [AVG(disk_read_iops[OFFSET(0)]), AVG(disk_read_iops[SAFE_OFFSET(1)])] disk_read_iops_avg,
            [AVG(disk_write_iops[OFFSET(0)]), AVG(disk_write_iops[SAFE_OFFSET(1)])] disk_write_iops_avg
          FROM
            `{PROJECT}.{DATASET}.metrics`
          WHERE
            timestamp >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {DATE_RANGE})
          GROUP BY
            instance_id
        ),
        
        results AS (
            SELECT
              r.project_id, r.zone, r.preemptible,
              r.workflow_id, workflow_name, r.task_call_name, r.shard, r.attempt, execution_status,
              m.start_time metadata_start_time, TIMESTAMP_DIFF(m.end_time, m.start_time, SECOND) metadata_duration_sec, runtime_duration_sec,
              cpu_platform, r.cpu_count, cpu_used_percent_avg,
              r.mem_total_gb, mem_used_gb_max,
              r.disk_mounts, disk_types, r.disk_total_gb,
              (SELECT ARRAY_AGG(x IGNORE NULLS) FROM UNNEST(disk_used_gb_max) x) disk_used_gb_max,
              (SELECT ARRAY_AGG(x IGNORE NULLS) FROM UNNEST(disk_read_iops_avg) x) disk_read_iops_avg,
              (SELECT ARRAY_AGG(x IGNORE NULLS) FROM UNNEST(disk_write_iops_avg) x) disk_write_iops_avg,
              (SELECT SUM(CAST(value AS FLOAT64)) FROM UNNEST(inputs) WHERE type = 'file') inputs_size_gb,
              docker_image
            FROM
              `{PROJECT}.{DATASET}.runtime` r
            JOIN
              metrics
            USING (instance_id)
            JOIN
              `{PROJECT}.{DATASET}.metadata` m
            USING (instance_name)
            WHERE
              r.start_time >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {DATE_RANGE})
              AND
              m.start_time >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {DATE_RANGE})
            ORDER BY
              r.start_time DESC
        )
        
        SELECT *
        FROM results
        WHERE RAND() < {MAX_ROWS}/(SELECT COUNT(*) FROM results)
        AND inputs_size_gb IS NOT NULL
    """
    return client.query(
        query,
        location=LOCATION,
        job_config=job_config,
    )

# Dry-run the query first and read its byte estimate, so the real query is
# only submitted when it stays within MAX_GB_PROCESSED.
dry_run_job = monitoring_query(dry_run=True)
gb_processed = dry_run_job.total_bytes_processed / 1024**3
if gb_processed > MAX_GB_PROCESSED:
    # Raise instead of calling exit(): an exception stops the notebook run at
    # this cell without acting on the kernel itself.
    raise RuntimeError(
        f"This query will process {gb_processed:.1f} GB when run. Please adjust DATE_RANGE and retry."
    )

# `q` is only ever bound to the real (non-dry-run) job, which the plotting
# cell below iterates over.
q = monitoring_query()
print(f"Sample size: {q.result().total_rows} rows.")

In [2]:
import matplotlib.pyplot as plt

MIN_SAMPLE_SIZE = 5  # a task call needs at least this many 'Done' samples to be plotted

# One entry per subplot column:
# (y-axis label, label of the "used" series, base xkcd colour for 'Done' points,
#  function returning the used amount of a row, function returning the total amount).
PANELS = [
    ('CPU (cores)', 'avg used', 'green',
     lambda s: s.cpu_count * s.cpu_used_percent_avg / 100,
     lambda s: s.cpu_count),
    ('Memory (GB)', 'max used', 'blue',
     lambda s: s.mem_used_gb_max,
     lambda s: s.mem_total_gb),
    # Only the first disk of each row is plotted.
    ('Disk (GB)', 'max used', 'magenta',
     lambda s: s.disk_used_gb_max[0],
     lambda s: s.disk_total_gb[0]),
]


def plot_resource_panel(ax, name, failed, done, ylabel, used_label, color, used_of, total_of):
    """Plot used and total amounts of one resource against input size.

    Args:
        ax: Matplotlib Axes to draw on.
        name: Task call name, used as the panel title.
        failed: Result rows whose execution_status is 'Failed' (may be empty).
        done: Result rows whose execution_status is 'Done'.
        ylabel: Y-axis label.
        used_label: Legend prefix for the "used" series (e.g. 'max used').
        color: Base xkcd colour name for the 'Done' series; the "total"
            series uses the 'light' variant of the same colour.
        used_of: Function mapping a row to its used amount.
        total_of: Function mapping a row to its total amount.
    """
    if failed:
        x_failed = [s.inputs_size_gb for s in failed]
        ax.plot(x_failed, [used_of(s) for s in failed], 'o',
                label=f'{used_label} (Failed)', color='xkcd:orange')
        ax.plot(x_failed, [total_of(s) for s in failed], 'o',
                label='total (Failed)', color='xkcd:light orange')
    x_done = [s.inputs_size_gb for s in done]
    ax.plot(x_done, [used_of(s) for s in done], '.',
            label=f'{used_label} (Done)', color=f'xkcd:{color}')
    ax.plot(x_done, [total_of(s) for s in done], '.',
            label='total (Done)', color=f'xkcd:light {color}')
    ax.set_xlabel('Inputs (GB)')
    ax.set_ylabel(ylabel)
    ax.set_title(name)
    ax.legend()


# Group the sampled rows by task call name.
calls = {}
for row in q:
    calls.setdefault(row.task_call_name, []).append(row)

# Decide which calls are plotted BEFORE sizing the figure, so the grid has
# exactly one row per plotted call and no blank rows for skipped ones.
plotted = {}
for name, samples in calls.items():
    done = [s for s in samples if s.execution_status == 'Done']
    if len(done) < MIN_SAMPLE_SIZE:
        continue
    failed = [s for s in samples if s.execution_status == 'Failed']
    plotted[name] = (failed, done)

if not plotted:
    print(f"No task call has at least {MIN_SAMPLE_SIZE} 'Done' samples; nothing to plot.")
else:
    # squeeze=False keeps `axes` two-dimensional even when only one call is plotted.
    fig, axes = plt.subplots(
        len(plotted), len(PANELS),
        figsize=(30, 10 * len(plotted)),
        squeeze=False,
    )
    for row_axes, (name, (failed, done)) in zip(axes, plotted.items()):
        for ax, panel in zip(row_axes, PANELS):
            plot_resource_panel(ax, name, failed, done, *panel)
    plt.show()