Merge pull request #2932 from catalyst-cooperative/phmsa-extractor
Extract raw PHMSA distribution and start of transmission data (Table A-D, H, I)
e-belfer committed Jan 19, 2024
2 parents b0de23f + 27dab3e commit 97e49ad
Showing 40 changed files with 1,783 additions and 47 deletions.
200 changes: 200 additions & 0 deletions devtools/debug-column-mapping.ipynb
@@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Testing column mapping for Excel spreadsheets\n",
"This notebook is designed to quickly test column maps for Excel spreadsheets. It will flag the following:\n",
"1) Column names that are input but don't exist in the actual data\n",
"2) Column names present in the raw data but not mapped\n",
"3) Invalid inputs for pages and files in `page_map.csv` and `file_map.csv`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, select the raw dataset you're going to be mapping and locate all relevant file directories."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pudl\n",
"from pudl.workspace.datastore import ZenodoDoiSettings\n",
"import os\n",
"import importlib\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from zipfile import ZipFile\n",
"import logging\n",
"import sys\n",
"\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"handler = logging.StreamHandler(stream=sys.stdout)\n",
"formatter = logging.Formatter('%(message)s')\n",
"handler.setFormatter(formatter)\n",
"logger.handlers = [handler]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = \"phmsagas\"\n",
"doi_path = getattr(ZenodoDoiSettings(), dataset).replace(\"/\", \"-\")\n",
"data_path = os.path.join(os.getenv(\"PUDL_INPUT\"),dataset,doi_path) # Get path to raw data\n",
"map_path = os.path.join(Path(pudl.package_data.__file__).parents[0], dataset) # Get path to mapping CSVs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, validate the file map. Make sure all file names included in the CSV actually exist in the raw data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_map = pd.read_csv(\n",
" os.path.join(map_path, \"file_map.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
"raw_files = os.listdir(data_path)\n",
"\n",
"# For each file, if zipfile get list of file names contained inside\n",
"all_files = []\n",
"for file in raw_files:\n",
" if file.endswith(\"zip\"):\n",
" file_path = os.path.join(data_path, file)\n",
" file_list = ZipFile(file_path).namelist()\n",
" all_files.append({file_path: file_list})\n",
"\n",
"for table_files in file_map.values.tolist(): # For each table with a list of files\n",
" for file in table_files: # For each file included in this table\n",
" if file not in str(all_files): # Search the list of files for the file text, flag if not.\n",
" logger.warning(f\"File '{file}' not found in actual raw data. Check file name.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, read in the column mapping CSVs. For each one, read in the raw data and make sure no columns are missing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sheet_name = pd.read_csv(\n",
" os.path.join(map_path, \"page_map.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
"skip_rows = pd.read_csv(\n",
" os.path.join(map_path, \"skiprows.csv\"), index_col=0, comment=\"#\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes we don't care about missing raw columns, or we only want to check a particular table. Set parameters here to fine tune what you're checking."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_check = False # If false, only check that mapped columns are found in the raw dataset.\n",
" # Useful when a table is split between several pages.\n",
"table_subset = [] # Leave list empty to check all tables\n",
"years_subset = [] # Use empty list if you want to check all years, otherwise supply a list of integers or a range"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_zip(file: str, dicts: list[dict[str,str]]) -> str:\n",
" for dic in dicts:\n",
" match = [i for i in dic if file in dic[i]]\n",
" if match == []:\n",
" continue\n",
" return match[0]\n",
"\n",
"ds = pudl.workspace.datastore.Datastore()\n",
"for page in file_map.index:\n",
" if not table_subset or page in table_subset:\n",
" column_maps = pd.read_csv(\n",
" os.path.join(map_path, \"column_maps\", f\"{page}.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
" for index in file_map.columns: \n",
" if not years_subset or int(index) in years_subset:\n",
" logger.info(f\"Checking column maps for {page}, {index}\")\n",
" file = file_map.loc[page,index] # Get file name\n",
" archive = ZipFile(find_zip(file, all_files)) # Open zipfile and read file\n",
" with archive.open(file) as excel_file:\n",
" raw_file = pd.read_excel(\n",
" excel_file,\n",
" sheet_name=sheet_name.loc[page,index],\n",
" skiprows=skip_rows.loc[page,index],\n",
" )\n",
" raw_file = pudl.helpers.simplify_columns(raw_file) # Add pre-processing step used before column rename\n",
" raw_columns = raw_file.columns # Get raw column names\n",
" mapped_columns = column_maps.loc[:, index].dropna()\n",
" raw_missing = [col for col in raw_columns if col not in mapped_columns.values]\n",
" mapped_missing = [col for col in mapped_columns if col not in raw_columns.values]\n",
" if raw_missing and raw_check:\n",
" logger.warning(f\"Raw columns {raw_missing} from {file} are not mapped.\")\n",
" if mapped_missing:\n",
" logger.warning(f\"Mapped columns {mapped_missing} do not exist in the raw data file {file}\")\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Go back and fix any incorrectly labelled columns. Then run the cell above again, until all columns are correctly labelled."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pudl-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
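For context on the CSVs this notebook reads: judging from how the cells above index them (file_map.loc[page, year], sheet_name.loc[page, year], column_maps.loc[:, year]), each mapping CSV is indexed by page (table), with one column per report year. The sketch below is purely illustrative — the page, file, and column names are hypothetical placeholders, not the actual PHMSA mapping values.

import pandas as pd

# Hypothetical file_map.csv: one row per page, one column per year; values are the
# Excel file names found inside the raw zip archives.
file_map = pd.DataFrame(
    {"2022": ["annual_distribution_2022.xlsx"]},
    index=pd.Index(["yearly_distribution"], name="page"),
)

# Hypothetical column_maps/yearly_distribution.csv: index is the standardized column
# name, one column per year; values are the simplified raw column names.
column_map = pd.DataFrame(
    {"2022": ["report_year", "operator_name"]},
    index=pd.Index(["report_date", "operator_name_phmsa"], name="pudl_column"),
)

print(file_map.loc["yearly_distribution", "2022"])     # file name for that page/year
print(column_map.loc[:, "2022"].dropna().tolist())     # raw columns mapped for that year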
11 changes: 10 additions & 1 deletion docs/conf.py
@@ -149,7 +149,15 @@ def data_dictionary_metadata_to_rst(app):
def data_sources_metadata_to_rst(app):
"""Export data source metadata to RST for inclusion in the documentation."""
print("Exporting data source metadata to RST.")
included_sources = ["eia860", "eia861", "eia923", "ferc1", "ferc714", "epacems"]
included_sources = [
"eia860",
"eia861",
"eia923",
"ferc1",
"ferc714",
"epacems",
"phmsagas",
]
package = Package.from_resource_ids()
extra_etl_groups = {"eia860": ["entity_eia"], "ferc1": ["glue"]}
for name in included_sources:
@@ -197,6 +205,7 @@ def cleanup_rsts(app, exception):
(DOCS_DIR / "data_sources/ferc1.rst").unlink()
(DOCS_DIR / "data_sources/ferc714.rst").unlink()
(DOCS_DIR / "data_sources/epacems.rst").unlink()
(DOCS_DIR / "data_sources/phmsagas.rst").unlink()


def cleanup_csv_dir(app, exception):
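The body of data_sources_metadata_to_rst is mostly elided in this hunk. As a rough, hypothetical sketch of the pattern the visible lines imply — iterate over included_sources, write one generated RST page per source under docs/data_sources/, and have cleanup_rsts unlink those pages after the build — the render_source_rst helper below is an illustrative stand-in, not PUDL's actual rendering call:

from pathlib import Path

DOCS_DIR = Path("docs")  # assumption: mirrors the DOCS_DIR already defined in conf.py

INCLUDED_SOURCES = ["eia860", "eia861", "eia923", "ferc1", "ferc714", "epacems", "phmsagas"]

def render_source_rst(name: str) -> str:
    """Hypothetical stand-in for the real metadata-to-RST rendering."""
    return f"{name}\n{'=' * len(name)}\n\n(Generated data source page.)\n"

def data_sources_metadata_to_rst(app):
    """Sketch: export one RST page per included data source before the Sphinx build."""
    for name in INCLUDED_SOURCES:
        (DOCS_DIR / "data_sources" / f"{name}.rst").write_text(render_source_rst(name))

def cleanup_rsts(app, exception):
    """Sketch: remove the generated pages after the build, matching the unlink calls above."""
    for name in INCLUDED_SOURCES:
        (DOCS_DIR / "data_sources" / f"{name}.rst").unlink(missing_ok=True)

The point is simply that a new data source has to be registered in both places: the export list and the cleanup hook, which is what this PR does for phmsagas.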
1 change: 1 addition & 0 deletions docs/data_sources/index.rst
@@ -13,6 +13,7 @@ Data Sources
epacems
ferc1
ferc714
phmsagas

.. toctree::
:caption: Work in Progress & Future Datasets
