# Critical Validation Errors

[GH issue](https://github.com/cal-itp/data-infra/issues/513)

In [1]:
import warnings
warnings.filterwarnings("ignore")

import altair as alt
import pandas as pd

from datetime import date
from IPython.display import Markdown

import create_accessibility_data
import setup_charts
import utils
from shared_utils import geography_utils, styleguide
from shared_utils import calitp_color_palette as cp

# Stamp the report with today's date (MM-DD-YYYY). The same string is
# reused later in the notebook as a chart subtitle.
today_date_formatted = date.today().strftime('%m-%d-%Y')

# Build the banner first, then render it as rich Markdown output.
report_banner = Markdown(
    f"<b>Report updated / data available through: "
    f"{today_date_formatted}</b>"
)
display(report_banner)

<b>Report updated / data available through: 06-03-2022</b>

In [2]:
# Validation results table read from the project's GCS folder.
# NOTE(review): per the note in the next cell this is feed-level data
# produced by warehouse_queries -- confirm against that script.
validations_path = f"{utils.GCS_FILE_PATH}validations.parquet"
validations = pd.read_parquet(validations_path)

In [3]:
## Note:
# 1st df is by feed (comes from warehouse_queries)
# 2nd df is aggregated to date to get aggregate stats about % feeds with no errors
# 3rd df will need to be aggregated to date-code
def aggregate_no_critical_errors(df):
    """Summarize, per date, the feeds that have zero critical errors.

    Rows with ``critical_errors == 0`` are kept and passed to
    ``geography_utils.aggregate_by_geography``, grouped by
    ``date``, ``total_feeds`` and ``num_unique_errors`` with ``feed_key``
    as the nunique column.
    NOTE(review): presumably that helper counts distinct ``feed_key``
    values per group -- confirm in shared_utils.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``critical_errors``, ``date``, ``total_feeds``,
        ``num_unique_errors`` and ``feed_key``.

    Returns
    -------
    pandas.DataFrame
        The aggregated frame with ``feed_key`` renamed to ``feeds`` and a
        ``pct_no_errors`` column (``feeds / total_feeds``, rounded to
        3 decimals).
    """
    feeds_without_errors = df[df.critical_errors == 0]

    daily_counts = geography_utils.aggregate_by_geography(
        feeds_without_errors,
        group_cols=["date", "total_feeds", "num_unique_errors"],
        nunique_cols=["feed_key"],
    )

    # Share of that day's feeds with no critical errors.
    share_no_errors = (
        daily_counts.feed_key.divide(daily_counts.total_feeds).round(3)
    )

    return (
        daily_counts
        .assign(pct_no_errors=share_no_errors)
        .rename(columns={"feed_key": "feeds"})
    )


def aggregate_error_types(df):
    """Return the distinct (date, num_unique_errors) pairs for error rows.

    Only rows whose ``code`` is non-null are considered; every column
    other than ``date`` and ``num_unique_errors`` is dropped before
    de-duplicating.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``code``, ``date`` and ``num_unique_errors``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated ``date`` / ``num_unique_errors`` rows with a fresh
        0..n-1 index.
    """
    has_error_code = df.code.notna()
    wanted_columns = ["date", "num_unique_errors"]

    distinct_rows = df.loc[has_error_code, wanted_columns].drop_duplicates()

    return distinct_rows.reset_index(drop=True)



In [4]:
# Both summaries are derived independently from the same `validations` frame.

# Distinct (date, num_unique_errors) pairs among rows that carry an error code.
unique_errors = aggregate_error_types(validations)

# Per-date count and share of feeds with zero critical errors.
no_errors = aggregate_no_critical_errors(validations)

In [5]:
# Line chart of the `feeds` column from `no_errors`
# (feeds with zero critical errors).
chart = setup_charts.add_tooltip(
    setup_charts.base_line_chart(no_errors),
    "feeds",
)

# Single series, so use one fixed colour from the Cal-ITP palette.
chart = chart.encode(
    y=alt.Y("feeds:Q", title="# feeds"),
    color=alt.value(cp.CALITP_CATEGORY_BRIGHT_COLORS[0]),
)
chart = chart.properties(title="# Feeds with No Critical Errors")

# Apply the shared styleguide config before rendering.
chart = styleguide.preset_chart_config(chart)

display(chart)

In [6]:
# Line chart of the `pct_no_errors` column from `no_errors`
# (share of feeds with zero critical errors).
chart = setup_charts.add_tooltip(
    setup_charts.base_line_chart(no_errors),
    "pct_no_errors",
)

# pct_no_errors is stored as a fraction; the axis formats it as a percent.
chart = chart.encode(
    y=alt.Y(
        "pct_no_errors:Q",
        title="% feeds",
        axis=alt.Axis(format="%"),
    ),
    color=alt.value(cp.CALITP_CATEGORY_BRIGHT_COLORS[0]),
)
chart = chart.properties(title="% Feeds with No Critical Errors")

# Apply the shared styleguide config before rendering.
chart = styleguide.preset_chart_config(chart)

display(chart)

In [7]:
# Line chart of the `num_unique_errors` column from `unique_errors`.
chart = setup_charts.add_tooltip(
    setup_charts.base_line_chart(unique_errors),
    "num_unique_errors",
)

# Single series, so use one fixed colour from the Cal-ITP palette.
chart = chart.encode(
    y=alt.Y("num_unique_errors:Q", title="# Unique Errors"),
    color=alt.value(cp.CALITP_CATEGORY_BRIGHT_COLORS[0]),
)
chart = chart.properties(title="Unique Validation Errors Across Feeds")

# Apply the shared styleguide config before rendering.
chart = styleguide.preset_chart_config(chart)

display(chart)

In [8]:
# For each day, get the number of unique feeds that have a certain error
# Then for that day-code, calculate % error
errors_by_type = geography_utils.aggregate_by_geography(
    validations,
    group_cols = ["date", "code", "total_feeds"],
    nunique_cols = ["feed_key"]
)

# pct_error = feeds with this code on this date / total feeds that date,
# stored as a fraction rounded to 3 decimals.
errors_by_type = errors_by_type.assign(
    pct_error = round(errors_by_type.feed_key.divide(errors_by_type.total_feeds), 3)
)

# How many error codes to keep for the chart below.
# NOTE: the chart cell saves to 'validation_top15_errors.html'; keep that
# file name in sync if this value changes.
N_TOP_ERRORS = 15

# Rank error codes by their feed_key counts summed over every date,
# most frequent first.
top_errors = (errors_by_type.groupby(["code"])
              .agg({"feed_key": "sum"})
              .reset_index()
              .sort_values("feed_key", ascending=False)
              .reset_index(drop=True)
)

# Keep the N_TOP_ERRORS most frequent codes
TOP_ERRORS = list(top_errors.code.iloc[:N_TOP_ERRORS])

In [9]:
# Interactive legend reference:
#   https://altair-viz.github.io/gallery/interactive_legend.html
# Combining hover and click selections:
#   https://stackoverflow.com/questions/66108224/combine-hover-and-click-selections-in-altair

# Raise Altair's row limit for this notebook.
alt.data_transformers.enable('default', max_rows=10_000)

# Keep only the top error codes and make them legend-friendly
# by swapping underscores for spaces.
to_plot = (
    errors_by_type
    .loc[errors_by_type.code.isin(TOP_ERRORS)]
    .assign(code=lambda kept: kept.code.str.replace("_", " "))
)

chart = setup_charts.add_tooltip(
    setup_charts.base_line_chart(to_plot),
    "pct_error",
)

# Legend-bound selection on `code`; the selected codes stay fully opaque.
# (alt.selection_single with the same arguments is the alternative that
# was tried here.)
selection = alt.selection_multi(fields=['code'], bind='legend')

chart = chart.encode(
    y=alt.Y("pct_error:Q", title="% feeds", axis=alt.Axis(format="%")),
    color=alt.Color(
        "code:N",
        title="Validation Error",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    ),
    # Codes outside the selection are dimmed rather than hidden.
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    tooltip=[
        "date",
        "code",
        alt.Tooltip("pct_error", title="% error", format=".1%"),
    ],
)
chart = chart.properties(
    title={
        "text": "% Daily Feeds by Critical Validation Error Types",
        "subtitle": f"Updated: {today_date_formatted}",
    }
)
chart = chart.add_selection(selection).interactive()

# Apply the shared styleguide config, write a standalone HTML copy,
# then render inline.
chart = styleguide.preset_chart_config(chart)
chart.save('./img/validation_top15_errors.html')
display(chart)