In [None]:
"""Example of diffing tables across multiple different SQLite DBs.

The tables must have the same name/schema. This is intended for use in
investigating validation test errors.
"""
import sqlite3
from pathlib import Path
from typing import Iterable

import pandas as pd

from pudl.helpers import diff_wide_tables, TableDiff
from pudl.metadata.classes import Resource
from pudl.metadata.fields import apply_pudl_dtypes


def table_diff(
    table_name: str,
    old_conn,
    new_conn,
    ignore_cols: Iterable[str] = ("plant_id_ferc1",),
    addl_key_cols: Iterable[str] = (),
) -> TableDiff:
    """Diff two versions of the same table that live in SQL databases.

    The table has to have the same name + columns in both DBs.

    Args:
        table_name: name of the table to read from both databases; also used
            to look up the table's primary key via ``Resource.from_id``.
        old_conn: connection to the database holding the old version.
        new_conn: connection to the database holding the new version.
        ignore_cols: columns to drop from both tables before diffing.
        addl_key_cols: extra columns to treat as key columns, in addition to
            the primary key declared in the table's schema.

    Returns:
        The result of ``diff_wide_tables`` for the old and new tables.
    """
    # NOTE: table_name is interpolated directly into the SQL text (identifiers
    # cannot be bound as query parameters), so only pass trusted table names.
    query = f"SELECT * FROM {table_name}"
    old_table = apply_pudl_dtypes(pd.read_sql(query, old_conn))
    new_table = apply_pudl_dtypes(pd.read_sql(query, new_conn))

    # Filter with a comprehension instead of set arithmetic so the columns
    # keep the table's own order; set ordering is arbitrary and can change
    # from one kernel session to the next.
    ignored = set(ignore_cols)
    cols = [col for col in old_table.columns if col not in ignored]

    # Schema key columns first, then the additional ones. dict.fromkeys drops
    # duplicates while keeping first-seen order, so the key order is stable.
    schema_key = Resource.from_id(table_name).schema.primary_key
    primary_key = list(dict.fromkeys([*schema_key, *addl_key_cols]))
    return diff_wide_tables(primary_key, old_table[cols], new_table[cols])



In [None]:
# Open both database snapshots to compare: the new one first, then the old one.
# The paths start with "~", which expanduser() resolves to the home directory.
new_db, old_db = (
    sqlite3.connect(Path(location).expanduser())
    for location in (
        "~/Downloads/pudl.sqlite",
        "~/Downloads/pudl (2).sqlite",
    )
)


In [None]:
# Table to compare between the old and new databases.
table_name = "denorm_plants_steam_ferc1"

# Columns left out of the comparison, and extra columns used as keys
# alongside the table's schema-defined primary key.
ignored_columns = ("plant_id_ferc1", "plant_id_pudl")
extra_key_columns = ("report_year", "utility_id_pudl")

diff = table_diff(
    table_name,
    old_db,
    new_db,
    ignore_cols=ignored_columns,
    addl_key_cols=extra_key_columns,
)
# Last expression of the cell, so the notebook displays it.
diff.changed