Merge pull request #2932 from catalyst-cooperative/phmsa-extractor
Extract raw PHMSA distribution and start of transmission data (Table A-D, H, I)
e-belfer committed Jan 19, 2024
2 parents b0de23f + 27dab3e commit 97e49ad
Showing 40 changed files with 1,783 additions and 47 deletions.
200 changes: 200 additions & 0 deletions devtools/debug-column-mapping.ipynb
@@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Testing column mapping for Excel spreadsheets\n",
"This notebook is designed to quickly test column maps for Excel spreadsheets. It will flag the following:\n",
"1) Column names that are input but don't exist in the actual data\n",
"2) Column names present in the raw data but not mapped\n",
"3) Invalid inputs for pages and files in `page_map.csv` and `file_map.csv`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, select the raw dataset you're going to be mapping and locate all relevant file directories."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pudl\n",
"from pudl.workspace.datastore import ZenodoDoiSettings\n",
"import os\n",
"import importlib\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from zipfile import ZipFile\n",
"import logging\n",
"import sys\n",
"\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"handler = logging.StreamHandler(stream=sys.stdout)\n",
"formatter = logging.Formatter('%(message)s')\n",
"handler.setFormatter(formatter)\n",
"logger.handlers = [handler]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = \"phmsagas\"\n",
"doi_path = getattr(ZenodoDoiSettings(), dataset).replace(\"/\", \"-\")\n",
"data_path = os.path.join(os.getenv(\"PUDL_INPUT\"),dataset,doi_path) # Get path to raw data\n",
"map_path = os.path.join(Path(pudl.package_data.__file__).parents[0], dataset) # Get path to mapping CSVs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, validate the file map. Make sure all file names included in the CSV actually exist in the raw data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_map = pd.read_csv(\n",
" os.path.join(map_path, \"file_map.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
"raw_files = os.listdir(data_path)\n",
"\n",
"# For each file, if zipfile get list of file names contained inside\n",
"all_files = []\n",
"for file in raw_files:\n",
" if file.endswith(\"zip\"):\n",
" file_path = os.path.join(data_path, file)\n",
" file_list = ZipFile(file_path).namelist()\n",
" all_files.append({file_path: file_list})\n",
"\n",
"for table_files in file_map.values.tolist(): # For each table with a list of files\n",
" for file in table_files: # For each file included in this table\n",
" if file not in str(all_files): # Search the list of files for the file text, flag if not.\n",
" logger.warning(f\"File '{file}' not found in actual raw data. Check file name.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, read in the column mapping CSVs. For each one, read in the raw data and make sure no columns are missing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sheet_name = pd.read_csv(\n",
" os.path.join(map_path, \"page_map.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
"skip_rows = pd.read_csv(\n",
" os.path.join(map_path, \"skiprows.csv\"), index_col=0, comment=\"#\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes we don't care about missing raw columns, or we only want to check a particular table. Set parameters here to fine tune what you're checking."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_check = False # If false, only check that mapped columns are found in the raw dataset.\n",
" # Useful when a table is split between several pages.\n",
"table_subset = [] # Leave list empty to check all tables\n",
"years_subset = [] # Use empty list if you want to check all years, otherwise supply a list of integers or a range"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_zip(file: str, dicts: list[dict[str,str]]) -> str:\n",
" for dic in dicts:\n",
" match = [i for i in dic if file in dic[i]]\n",
" if match == []:\n",
" continue\n",
" return match[0]\n",
"\n",
"ds = pudl.workspace.datastore.Datastore()\n",
"for page in file_map.index:\n",
" if not table_subset or page in table_subset:\n",
" column_maps = pd.read_csv(\n",
" os.path.join(map_path, \"column_maps\", f\"{page}.csv\"), index_col=0, comment=\"#\"\n",
" )\n",
" for index in file_map.columns: \n",
" if not years_subset or int(index) in years_subset:\n",
" logger.info(f\"Checking column maps for {page}, {index}\")\n",
" file = file_map.loc[page,index] # Get file name\n",
" archive = ZipFile(find_zip(file, all_files)) # Open zipfile and read file\n",
" with archive.open(file) as excel_file:\n",
" raw_file = pd.read_excel(\n",
" excel_file,\n",
" sheet_name=sheet_name.loc[page,index],\n",
" skiprows=skip_rows.loc[page,index],\n",
" )\n",
" raw_file = pudl.helpers.simplify_columns(raw_file) # Add pre-processing step used before column rename\n",
" raw_columns = raw_file.columns # Get raw column names\n",
" mapped_columns = column_maps.loc[:, index].dropna()\n",
" raw_missing = [col for col in raw_columns if col not in mapped_columns.values]\n",
" mapped_missing = [col for col in mapped_columns if col not in raw_columns.values]\n",
" if raw_missing and raw_check:\n",
" logger.warning(f\"Raw columns {raw_missing} from {file} are not mapped.\")\n",
" if mapped_missing:\n",
" logger.warning(f\"Mapped columns {mapped_missing} do not exist in the raw data file {file}\")\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Go back and fix any incorrectly labelled columns. Then run the cell above again, until all columns are correctly labelled."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pudl-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
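For context on the CSVs this notebook reads: judging from how the cells above index them (file_map.loc[page, year], sheet_name.loc[page, year], column_maps.loc[:, year]), each mapping CSV is indexed by page (table), with one column per report year. The sketch below is purely illustrative — the page, file, and column names are hypothetical placeholders, not the actual PHMSA mapping values.

import pandas as pd

# Hypothetical file_map.csv: one row per page, one column per year; values are the
# Excel file names found inside the raw zip archives.
file_map = pd.DataFrame(
    {"2022": ["annual_distribution_2022.xlsx"]},
    index=pd.Index(["yearly_distribution"], name="page"),
)

# Hypothetical column_maps/yearly_distribution.csv: index is the standardized column
# name, one column per year; values are the simplified raw column names.
column_map = pd.DataFrame(
    {"2022": ["report_year", "operator_name"]},
    index=pd.Index(["report_date", "operator_name_phmsa"], name="pudl_column"),
)

print(file_map.loc["yearly_distribution", "2022"])     # file name for that page/year
print(column_map.loc[:, "2022"].dropna().tolist())     # raw columns mapped for that year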
11 changes: 10 additions & 1 deletion docs/conf.py
@@ -149,7 +149,15 @@ def data_dictionary_metadata_to_rst(app):
def data_sources_metadata_to_rst(app):
"""Export data source metadata to RST for inclusion in the documentation."""
print("Exporting data source metadata to RST.")
included_sources = ["eia860", "eia861", "eia923", "ferc1", "ferc714", "epacems"]
included_sources = [
"eia860",
"eia861",
"eia923",
"ferc1",
"ferc714",
"epacems",
"phmsagas",
]
package = Package.from_resource_ids()
extra_etl_groups = {"eia860": ["entity_eia"], "ferc1": ["glue"]}
for name in included_sources:
@@ -197,6 +205,7 @@ def cleanup_rsts(app, exception):
(DOCS_DIR / "data_sources/ferc1.rst").unlink()
(DOCS_DIR / "data_sources/ferc714.rst").unlink()
(DOCS_DIR / "data_sources/epacems.rst").unlink()
(DOCS_DIR / "data_sources/phmsagas.rst").unlink()


def cleanup_csv_dir(app, exception):
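The body of data_sources_metadata_to_rst is mostly elided in this hunk. As a rough, hypothetical sketch of the pattern the visible lines imply — iterate over included_sources, write one generated RST page per source under docs/data_sources/, and have cleanup_rsts unlink those pages after the build — the render_source_rst helper below is an illustrative stand-in, not PUDL's actual rendering call:

from pathlib import Path

DOCS_DIR = Path("docs")  # assumption: mirrors the DOCS_DIR already defined in conf.py

INCLUDED_SOURCES = ["eia860", "eia861", "eia923", "ferc1", "ferc714", "epacems", "phmsagas"]

def render_source_rst(name: str) -> str:
    """Hypothetical stand-in for the real metadata-to-RST rendering."""
    return f"{name}\n{'=' * len(name)}\n\n(Generated data source page.)\n"

def data_sources_metadata_to_rst(app):
    """Sketch: export one RST page per included data source before the Sphinx build."""
    for name in INCLUDED_SOURCES:
        (DOCS_DIR / "data_sources" / f"{name}.rst").write_text(render_source_rst(name))

def cleanup_rsts(app, exception):
    """Sketch: remove the generated pages after the build, matching the unlink calls above."""
    for name in INCLUDED_SOURCES:
        (DOCS_DIR / "data_sources" / f"{name}.rst").unlink(missing_ok=True)

The point is simply that a new data source has to be registered in both places: the export list and the cleanup hook, which is what this PR does for phmsagas.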
1 change: 1 addition & 0 deletions docs/data_sources/index.rst
@@ -13,6 +13,7 @@ Data Sources
epacems
ferc1
ferc714
phmsagas

.. toctree::
:caption: Work in Progress & Future Datasets
