In [2]:
import altair as alt
import pyspark
import pyspark.sql.functions as F
import json

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
metrics = spark.read.json(".private/app-20201007191711-0071")
# metrics.printSchema()

In [4]:
def collect_and_dictify(df):
    return [json.loads(row[0]) for row in df.selectExpr("to_json(*)").collect()]

def executor_info(df):
    info = df.select("Executor Info").dropna()
    return collect_and_dictify(info)

def plan_info(df):
    return collect_and_dictify(df.select("sparkPlanInfo").dropna())

def stageInfo(df):
    return collect_and_dictify(df.select("Stage Info").dropna())

In [5]:
# plan_info(metrics)

In [6]:
def wide_metrics(df, mcol='Task Info', idcol='Task ID', interesting_metrics=None):
    acc_cols = [F.col('Accumulable.%s' % s).alias('Metric %s' % s) for s in ['ID', 'Name', 'Value']]
    obs = df.select(mcol).dropna().select('%s.*' % mcol)
    cols = [F.col(elt) for elt in sorted(set(obs.columns) - set([idcol, 'Accumulables']))]
    
    if interesting_metrics is None:
        interesting_metrics = F.col('Metric Name').isin(
            'internal.metrics.resultSerializationTime',
            'write time',
            'shuffle write time',
            'join time',
            'GPU time',
            'GPU decode time',
            'fetch wait time',
            'internal.metrics.executorCpuTime',
            'internal.metrics.executorDeserializeTime',
            'internal.metrics.jvmGCTime',
            'internal.metrics.jvmGCTime'
        )
    
    return obs.select(
        idcol, 
        F.explode('Accumulables').alias('Accumulable'), 
        *cols
    ).select(
        idcol, 
        *(cols + acc_cols)
    ).where(interesting_metrics)

def wide_tasks(df):
    return wide_metrics(df, 'Task Info', 'Task ID')

def wide_stages(df):
    return wide_metrics(df, 'Stage Info', 'Stage ID')


In [7]:
task_metrics = wide_tasks(metrics).toPandas()

In [8]:
alt.Chart(task_metrics).mark_bar().encode(
    x='Task ID:N',
    y='Metric Value:Q',
    color='Metric Name:N',
    tooltip=['Metric Name', 'Metric Value', 'Task ID']
).interactive()

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

In [13]:
stage_metrics = wide_stages(metrics).toPandas()
alt.Chart(stage_metrics).mark_bar().encode(
    x='Stage ID:N',
    y='Metric Value:Q',
    color='Metric Name:N',
    tooltip=['Details', 'Metric Name', 'Metric Value', 'Stage ID']
).interactive()

In [None]:
stage_metrics