# Validate EIA 923 Data
This notebook runs a bunch of sanity checks on the EIA 923 data.

In [None]:
# Load IPython's autoreload extension and enable mode 2, which reloads imported
# modules before each cell runs, so edits to library source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import pandas as pd
import sqlalchemy as sa
import pudl

In [None]:
import logging

# Emit bare log messages (no level / timestamp prefix) on stdout so that they
# appear in the notebook's cell output.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger to pass INFO and above to that single handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Replace the handler list outright (rather than addHandler) so that re-running
# this cell does not stack up duplicate handlers.
logger.handlers = [handler]

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline

In [None]:
# Apply the ggplot style sheet first, then override figure size and resolution
# on top of it (the order matters: the style sheet sets its own rcParams).
plt.style.use('ggplot')
mpl.rcParams.update({
    'figure.figsize': (10, 4),
    'figure.dpi': 150,
})

# Show up to 56 columns when a wide dataframe is displayed.
pd.set_option('display.max_columns', 56)

In [None]:
# Look up the default PUDL workspace settings, then open one SQLAlchemy engine
# per database referenced in them.
pudl_settings = pudl.workspace.setup.get_defaults()

ferc1_db_url = pudl_settings['ferc1_db']
ferc1_engine = sa.create_engine(ferc1_db_url)

pudl_db_url = pudl_settings['pudl_db']
pudl_engine = sa.create_engine(pudl_db_url)

# Display the settings as the cell output for reference.
pudl_settings

In [None]:
def plot_distribution(orig_df, test_df, data_col, weight_col, query="", low_q=0.05, mid_q=0.5, high_q=0.95, title=""):
    """
    Plot weighted histograms of original and test data with quantile markers.

    Both dataframes are optionally filtered with the same ``query`` and drawn
    as overlaid 100-bin histograms of ``data_col`` weighted by ``weight_col``.
    The x-axis range runs between the 0.005 and 0.995 weighted quantiles of the
    original data, as computed by ``pudl.validate.weighted_quantile``.

    For each quantile that is enabled, a shaded band spans the minimum and
    maximum of the values returned by ``pudl.validate.historical_distribution``
    for the original data, and a vertical line marks that weighted quantile of
    the test data.

    Args:
        orig_df (pandas.DataFrame): Baseline data; defines the x-axis range and
            the shaded historical bands.
        test_df (pandas.DataFrame): Data being compared against the baseline.
        data_col (str): Column whose distribution is plotted.
        weight_col (str): Column used to weight the histograms and quantiles.
        query (str): Optional ``DataFrame.query`` expression applied to both
            dataframes. An empty string or None applies no filtering.
        low_q (float): Lower quantile to mark (red). Falsy values skip it.
        mid_q (float): Middle quantile to mark (green). Falsy values skip it.
        high_q (float): Upper quantile to mark (blue). Falsy values skip it.
        title (str): Title for the figure.

    Returns:
        None. The figure is drawn on the current pyplot figure and shown.

    """
    # Any falsy query ("" or None) means "no filter". DataFrame.query() already
    # returns a new dataframe, so the inputs are never modified and there is no
    # need to copy them first.
    if query:
        orig_df = orig_df.query(query)
        test_df = test_df.query(query)

    # Clip the x-axis to the bulk of the original distribution so that extreme
    # outliers do not squash the histogram.
    xmin = pudl.validate.weighted_quantile(orig_df[data_col], orig_df[weight_col], 0.005)
    xmax = pudl.validate.weighted_quantile(orig_df[data_col], orig_df[weight_col], 0.995)
    hist_kwargs = dict(range=(xmin, xmax), bins=100, alpha=0.5)
    plt.hist(orig_df[data_col], weights=orig_df[weight_col], color="black", label="Original Distribution", **hist_kwargs)
    plt.hist(test_df[data_col], weights=test_df[weight_col], color="yellow", label="Test Distribution", **hist_kwargs)

    # (quantile, color, legend prefix) for each marker. The legend text is kept
    # exactly as it has always been rendered, including the lowercase prefix on
    # the mid-quantile band.
    quantile_bands = (
        (low_q, "red", "Historical"),
        (mid_q, "green", "historical"),
        (high_q, "blue", "Historical"),
    )
    for quantile, color, range_prefix in quantile_bands:
        if not quantile:
            continue
        hist_range = pudl.validate.historical_distribution(orig_df, data_col, weight_col, quantile)
        plt.axvspan(min(hist_range), max(hist_range), color=color, alpha=0.2, label=f"{range_prefix} range of {quantile:.0%}")
        plt.axvline(pudl.validate.weighted_quantile(test_df[data_col], test_df[weight_col], quantile), color=color, label=f"Tested {quantile:.0%}")

    plt.title(title)
    plt.xlabel(data_col)
    plt.ylabel(weight_col)
    plt.legend()
    plt.show()

## What these tests do
* Select records from each output dataframe which ought to have similar values in some variable.
* Calculate the range of lower, upper, and median values which that variable has experienced in the past.
* Check whether the aggregated and derived versions of that value fall within the same range as the original data.
* Create a visualization showing the original data, acceptable range of lower, mid, and upper values.
* Create a visualization showing each aggregated / derived data set, and how it measures up to the original.

## EIA 923 Original Data
First we pull the original (post-ETL) EIA 923 data out of the database. We will use the values in this dataset as a baseline for checking that later aggregated data and derived values remain valid. We will also eyeball these values here to make sure they are within the expected range.

In [None]:
%%time
# Build one PudlTabl output object per time resolution, all on the same PUDL
# database engine. The %%time cell magic reports how long the cell takes.
# freq=None is used below as the un-aggregated baseline.
pudl_out_orig = pudl.output.pudltabl.PudlTabl(pudl_engine, freq=None)
# "MS" and "AS" are the pandas offset aliases for month start and year start;
# presumably they select monthly and annual aggregation (confirm against PudlTabl).
pudl_out_month = pudl.output.pudltabl.PudlTabl(pudl_engine, freq="MS")
pudl_out_year = pudl.output.pudltabl.PudlTabl(pudl_engine, freq="AS")

### Fuel Receipts and Costs
For now we are just checking the original data against the monthly and yearly aggregations of that data...

## Fields to test:
 * `ash_content_pct` (coal only)
 * `chlorine_content_ppm` (coal only, new field)
 * `fuel_cost_per_mmbtu` (all fuels)
 * `heat_content_mmbtu_per_unit` (all fuels)
 * `mercury_content_ppm` (coal only)
 * `moisture_content_pct` (coal only, new field)
 * `sulfur_content_pct` (coal only, one-sided)
 
## Original data irregularities:
 * Half a billion tons of coal with extremely high mercury content. Rest near zero.
 * ~5% of gas deliveries have heat content 1/10 what it should be.
 * Small population of low heat content petroleum outliers.

## Ideas for other kinds of tests
 * Test relative values of various partitions, e.g. heat content of LIG < SUB < BIT.
 * Test absolute values of some partitioned values (e.g. mmbtu per unit of LIG / SUB / BIT)
 * Coal sulfur content is one-sided. Needs an absolute test.

In [None]:
# Baseline check: compare the original fuel receipts and costs data against
# itself for every absolute test case, plotting each one before validating it.
frc_eia923_orig = pudl_out_orig.frc_eia923()
baseline_frames = {"orig_df": frc_eia923_orig, "test_df": frc_eia923_orig}
for abs_args in pudl.validate.abs_test_args:
    plot_distribution(**baseline_frames, **abs_args)
    pudl.validate.vs_historical(**baseline_frames, **abs_args)

In [None]:
# Compare the monthly and yearly outputs against the original data for every
# aggregation test case. Both test frames are built up front, before plotting.
for frc_eia923_test in [pudl_out_month.frc_eia923(), pudl_out_year.frc_eia923()]:
    compared_frames = {"orig_df": frc_eia923_orig, "test_df": frc_eia923_test}
    for agg_args in pudl.validate.agg_test_args:
        plot_distribution(**compared_frames, **agg_args)
        pudl.validate.vs_historical(**compared_frames, **agg_args)