# Validation of frc_eia923
This notebook runs sanity checks on the Fuel Receipts and Costs data reported in EIA Form 923. These are the same tests that are run by the PyTest-based frc_eia923 validation tests. The notebook and visualizations are meant to be used as a diagnostic tool, to help understand what's wrong when the PyTest-based data validations fail for some reason.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import pandas as pd
import sqlalchemy as sa
import pudl

In [None]:
import warnings
import logging
# Send root-logger output to stdout, showing only the message text.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Replace any handlers already attached to the root logger with ours.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

In [None]:
# Apply the plot style first, then override the figure defaults on top of it.
plt.style.use('ggplot')
mpl.rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
})

# Let pandas display up to 56 columns before truncating.
pd.options.display.max_columns = 56

In [None]:
# Look up the default workspace settings, then open one SQLAlchemy engine per
# database URL stored in them.
pudl_settings = pudl.workspace.setup.get_defaults()
ferc1_engine, pudl_engine = (
    sa.create_engine(pudl_settings[db_key]) for db_key in ('ferc1_db', 'pudl_db')
)
# Bare expression so the notebook displays the settings in the cell output.
pudl_settings

In [None]:
def bounds_histogram(df, data_col, weight_col, query,
                     low_q, hi_q, low_bound, hi_bound,
                     title=""):
    """Plot a weighted histogram of one column together with its bounds.

    Args:
        df: DataFrame containing ``data_col`` and ``weight_col``.
        data_col: Name of the column whose distribution is plotted.
        weight_col: Name of the column used to weight the histogram and the
            quantiles.
        query: Expression passed to ``DataFrame.query`` to select rows. An
            empty string means all rows are used.
        low_q: Quantile (as a fraction) drawn alongside ``low_bound``.
        hi_q: Quantile (as a fraction) drawn alongside ``hi_bound``.
        low_bound: Value drawn as the dashed "lower bound" line. ``None`` or
            ``False`` means no lower bound; any number, including 0, is drawn.
        hi_bound: Value drawn as the dashed "upper bound" line. ``None`` or
            ``False`` means no upper bound; any number, including 0, is drawn.
        title: Title of the plot.
    """
    if query != "":
        df = df.copy().query(query)
    values = df[data_col]
    weights = df[weight_col]

    # The x-axis is limited to the weighted 1%-99% range of the data
    # (presumably so extreme tails do not dominate the plot).
    xmin = pudl.validate.weighted_quantile(values, weights, 0.01)
    xmax = pudl.validate.weighted_quantile(values, weights, 0.99)

    plt.hist(values, weights=weights, range=(xmin, xmax), bins=50, color="black", label=data_col)

    # Only None/False mean "no bound". A plain truthiness test would also
    # drop a legitimate bound of 0.
    if low_bound is not None and low_bound is not False:
        plt.axvline(low_bound, lw=3, ls='--', color='red', label=f"lower bound for {low_q:.0%}")
        plt.axvline(pudl.validate.weighted_quantile(values, weights, low_q), lw=3, color="red", label=f"actual {low_q:.0%}")
    if hi_bound is not None and hi_bound is not False:
        plt.axvline(hi_bound, lw=3, ls='--', color='blue', label=f"upper bound for {hi_q:.0%}")
        plt.axvline(pudl.validate.weighted_quantile(values, weights, hi_q), lw=3, color="blue", label=f"actual {hi_q:.0%}")

    plt.title(title)
    plt.xlabel(data_col)
    plt.ylabel(weight_col)
    plt.legend()
    plt.show()
        
def historical_histogram(orig_df, test_df, data_col, weight_col, query="",
                         low_q=0.05, mid_q=0.5, hi_q=0.95,
                         low_bound=None, hi_bound=None,
                         title=""):
    """Plot a test distribution against the historical range of its quantiles.

    Args:
        orig_df: DataFrame holding the original data. It defines the x-axis
            range and the historical quantile ranges.
        test_df: DataFrame whose quantiles are drawn as vertical lines and
            whose histogram is overlaid on the original. If ``None``, the
            quantiles of ``orig_df`` itself are drawn.
        data_col: Name of the column whose distribution is plotted.
        weight_col: Name of the column used to weight the histograms and the
            quantiles.
        query: Expression passed to ``DataFrame.query`` on both frames. An
            empty string means all rows are used.
        low_q: Low quantile to show (as a fraction); a falsy value skips it.
        mid_q: Middle quantile to show; a falsy value skips it.
        hi_q: High quantile to show; a falsy value skips it.
        low_bound: Accepted but not used by this function, so that a
            validation-case dict can be splatted into the call.
        hi_bound: Accepted but not used by this function (same reason).
        title: Title of the plot.
    """
    if query != "":
        orig_df = orig_df.copy().query(query)
        # Only filter test_df when there is a query: DataFrame.query("")
        # raises ValueError, which broke the default query="" case.
        if test_df is not None:
            test_df = test_df.copy().query(query)

    # The x-axis is limited to the weighted 1%-99% range of the original data.
    xmin = pudl.validate.weighted_quantile(orig_df[data_col], orig_df[weight_col], 0.01)
    xmax = pudl.validate.weighted_quantile(orig_df[data_col], orig_df[weight_col], 0.99)

    orig_alpha = 1.0
    if test_df is not None:
        plt.hist(test_df[data_col], weights=test_df[weight_col], range=(xmin, xmax), bins=50, color="yellow", alpha=0.5, label="Test Distribution")
        # Make the original histogram translucent so the overlay stays visible.
        orig_alpha = 0.5
    else:
        # No separate test data: the "tested" quantiles come from orig_df.
        test_df = orig_df
    plt.hist(orig_df[data_col], weights=orig_df[weight_col], range=(xmin, xmax), bins=50, color="black", alpha=orig_alpha, label="Original Distribution")

    # (quantile, colour, first word of the span label). The label
    # capitalisation is kept exactly as it was for each quantile.
    quantile_styles = (
        (low_q, "red", "Historical"),
        (mid_q, "green", "historical"),
        (hi_q, "blue", "Historical"),
    )
    for quantile, color, range_word in quantile_styles:
        if not quantile:
            continue
        hist_range = pudl.validate.historical_distribution(orig_df, data_col, weight_col, quantile)
        plt.axvspan(min(hist_range), max(hist_range), color=color, alpha=0.2, label=f"{range_word} range of {quantile:.0%}")
        plt.axvline(pudl.validate.weighted_quantile(test_df[data_col], test_df[weight_col], quantile), color=color, label=f"Tested {quantile:.0%}")

    plt.title(title)
    plt.xlabel(data_col)
    plt.ylabel(weight_col)
    plt.legend()
    plt.show()

def validate_bounds(df, validation_cases):
    """Check ``df`` against each fixed-bounds case and plot every case.

    Args:
        df: DataFrame to validate.
        validation_cases: Iterable of dicts. Each dict is passed as keyword
            arguments to both ``pudl.validate.vs_bounds`` and
            ``bounds_histogram``.

    A ``ValueError`` from the validation is reported as a warning rather than
    raised, so the histogram for that case is still drawn.
    """
    for case in validation_cases:
        try:
            pudl.validate.vs_bounds(df, **case)
        except ValueError as err:
            # Include the exception text. It says which check failed, and it
            # stops the default warnings filter from collapsing repeated
            # identical messages raised from this line into one.
            warnings.warn(f"ERROR: Validation Failed: {err}")

        bounds_histogram(df, **case)

def validate_self(df, validation_cases):
    """Check ``df`` against its own historical distribution and plot each case.

    Args:
        df: DataFrame to validate.
        validation_cases: Iterable of dicts. Each dict is passed as keyword
            arguments to both ``pudl.validate.vs_self`` and
            ``historical_histogram``.

    A ``ValueError`` from the validation is reported as a warning rather than
    raised, so the histogram for that case is still drawn.
    """
    for case in validation_cases:
        try:
            pudl.validate.vs_self(df, **case)
        except ValueError as err:
            # Include the exception text. It says which check failed, and it
            # stops the default warnings filter from collapsing repeated
            # identical messages raised from this line into one.
            warnings.warn(f"ERROR: Validation Failed: {err}")

        historical_histogram(df, test_df=None, **case)
        
def validate_agg(orig_df, agg_df, validation_cases):
    """Check aggregated data against the original data and plot each case.

    Args:
        orig_df: DataFrame holding the original data.
        agg_df: DataFrame holding the aggregated data to test.
        validation_cases: Iterable of dicts. Each dict is passed as keyword
            arguments to both ``pudl.validate.vs_historical`` and
            ``historical_histogram``.

    A ``ValueError`` from the validation is reported as a warning rather than
    raised, so the histogram for that case is still drawn.
    """
    for case in validation_cases:
        try:
            pudl.validate.vs_historical(orig_df, agg_df, **case)
        except ValueError as err:
            # Include the exception text. It says which check failed, and it
            # stops the default warnings filter from collapsing repeated
            # identical messages raised from this line into one.
            warnings.warn(f"ERROR: Validation Failed: {err}")

        historical_histogram(orig_df, agg_df, **case)

## Get the original EIA 923 data
First we pull the original (post-ETL) EIA 923 data out of the database. We will use the values in this dataset as a baseline for checking that later aggregated data and derived values remain valid. We will also eyeball these values here to make sure they are within the expected range. This may take a minute or two depending on the speed of your machine.

In [None]:
# Build an output object with freq=None (no aggregation frequency requested)
# and pull the fuel receipts and costs table from it. Later cells use
# frc_eia923_orig as the baseline.
pudl_out_orig = pudl.output.pudltabl.PudlTabl(pudl_engine, freq=None)
frc_eia923_orig = pudl_out_orig.frc_eia923()

# Validation Against Fixed Bounds 
Some of the variables reported in this table have a fixed range of reasonable values, like the heat content per unit of a given fuel type. These variables can be tested for validity against external standards directly. In general we have two kinds of tests in this section:
* **Tails:** Are the extreme values too extreme? Typically, this is at the 5% and 95% level, but depending on the distribution, sometimes other thresholds are used.
* **Middle:** Is the central value of the distribution where it should be?

## Coal Heat Content (bounds)

In [None]:
# Pass the original table and the coal heat content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_coal_heat_content)

## Oil Heat Content (bounds)

In [None]:
# Pass the original table and the oil heat content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_oil_heat_content)

## Natural Gas Heat Content (bounds)
Validation fails because of a small portion of the distribution at 1/10th the real heat content.

In [None]:
# Pass the original table and the natural gas heat content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_gas_heat_content)

## Coal Ash Content (bounds)

In [None]:
# Pass the original table and the coal ash content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_coal_ash_content)

## Coal Sulfur Content (bounds)

In [None]:
# Pass the original table and the coal sulfur content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_coal_sulfur_content)

## Coal Mercury Content (bounds)
Currently validation fails for two reasons:
* Unrealistically high mercury content in some coal (9.0ppm)
* 70% of all coal reports... 0ppm mercury.

In [None]:
# Pass the original table and the coal mercury content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_coal_mercury_content)

## Coal Moisture Content (bounds)

In [None]:
# Pass the original table and the coal moisture content cases to plot_vs_bounds.
pudl.validate.plot_vs_bounds(frc_eia923_orig, pudl.validate.frc_eia923_coal_moisture_content)

# Validating Historical Distributions
As a sanity check of the testing process itself, we can check to see whether the entire historical distribution has attributes that place it within the extremes of a historical subsampling of the distribution. In this case, we sample each historical year, look at the range of values taken on by some quantile, and see whether the same quantile for the whole of the dataset fits within that range.

In [None]:
# Pass the original table and the self-consistency cases to plot_vs_self.
pudl.validate.plot_vs_self(frc_eia923_orig, pudl.validate.frc_eia923_self)

# Validate Monthly Aggregation
It's possible that the distribution will change as a function of aggregation, or we might make an error in the aggregation process. These tests check that a collection of quantiles for the original and the data aggregated by month have internally consistent values.

In [None]:
# Build an output object with freq="MS" (the monthly aggregation described
# above) and pull the fuel receipts and costs table from it.
pudl_out_month = pudl.output.pudltabl.PudlTabl(pudl_engine, freq="MS")
frc_eia923_month = pudl_out_month.frc_eia923()

In [None]:
# Pass the original table, the monthly table, and the aggregation cases to plot_vs_agg.
pudl.validate.plot_vs_agg(frc_eia923_orig, frc_eia923_month, pudl.validate.frc_eia923_agg)

# Validate Annual Aggregation
It's possible that the distribution will change as a function of aggregation, or we might make an error in the aggregation process. These tests check that a collection of quantiles for the original and the data aggregated by year have internally consistent values.

In [None]:
# Build an output object with freq="AS" (the annual aggregation described
# above) and pull the fuel receipts and costs table from it.
pudl_out_year = pudl.output.pudltabl.PudlTabl(pudl_engine, freq="AS")
frc_eia923_year = pudl_out_year.frc_eia923()

In [None]:
# Pass the original table, the annual table, and the aggregation cases to plot_vs_agg.
pudl.validate.plot_vs_agg(frc_eia923_orig, frc_eia923_year, pudl.validate.frc_eia923_agg)