Skip to content

Commit

Permalink
Merge pull request #3242 from catalyst-cooperative/phmsa-transmission…
Browse files Browse the repository at this point in the history
…-f-g

Extract raw tables for PHMSA transmission data Part F & G
  • Loading branch information
e-belfer committed Jan 19, 2024
2 parents 8503c65 + 24634a0 commit 72ce726
Show file tree
Hide file tree
Showing 9 changed files with 133 additions and 19 deletions.
40 changes: 22 additions & 18 deletions devtools/debug-column-mapping.ipynb
Expand Up @@ -120,7 +120,7 @@
"metadata": {},
"outputs": [],
"source": [
"raw_check = False # If false, only check that mapped columns are found in the raw dataset.\n",
"raw_check = True # If false, only check that mapped columns are found in the raw dataset.\n",
" # Useful when a table is split between several pages.\n",
"table_subset = [] # Leave list empty to check all tables\n",
"years_subset = [] # Use empty list if you want to check all years, otherwise supply a list of integers or a range"
Expand All @@ -147,24 +147,28 @@
" )\n",
" for index in file_map.columns: \n",
" if not years_subset or int(index) in years_subset:\n",
" logger.info(f\"Checking column maps for {page}, {index}\")\n",
" file = file_map.loc[page,index] # Get file name\n",
" archive = ZipFile(find_zip(file, all_files)) # Open zipfile and read file\n",
" with archive.open(file) as excel_file:\n",
" raw_file = pd.read_excel(\n",
" excel_file,\n",
" sheet_name=sheet_name.loc[page,index],\n",
" skiprows=skip_rows.loc[page,index],\n",
" )\n",
" raw_file = pudl.helpers.simplify_columns(raw_file) # Add pre-processing step used before column rename\n",
" raw_columns = raw_file.columns # Get raw column names\n",
" mapped_columns = column_maps.loc[:, index].dropna()\n",
" raw_missing = [col for col in raw_columns if col not in mapped_columns.values]\n",
" mapped_missing = [col for col in mapped_columns if col not in raw_columns.values]\n",
" if raw_missing and raw_check:\n",
" logger.warning(f\"Raw columns {raw_missing} from {file} are not mapped.\")\n",
" if mapped_missing:\n",
" logger.warning(f\"Mapped columns {mapped_missing} do not exist in the raw data file {file}\")\n",
" if file == \"-1\":\n",
" logger.info(f\"No data for year {index}\")\n",
" else:\n",
" logger.info(f\"Checking column maps for {page}, {index}\")\n",
" archive = ZipFile(find_zip(file, all_files)) # Open zipfile and read file\n",
" with archive.open(file) as excel_file:\n",
" raw_file = pd.read_excel(\n",
" excel_file,\n",
" sheet_name=sheet_name.loc[page,index],\n",
" skiprows=skip_rows.loc[page,index],\n",
" )\n",
" raw_file = pudl.helpers.simplify_columns(raw_file) # Add pre-processing step used before column rename\n",
" raw_columns = raw_file.columns # Get raw column names\n",
" mapped_columns = column_maps.loc[:, index].dropna()\n",
" raw_missing = [col for col in raw_columns if col not in mapped_columns.values]\n",
" mapped_missing = [col for col in mapped_columns if col not in raw_columns.values]\n",
" if raw_missing and raw_check:\n",
" logger.warning(f\"Raw columns {raw_missing} from {file} are not mapped.\")\n",
" if mapped_missing:\n",
" logger.warning(f\"Mapped columns {mapped_missing} do not exist in the raw data file {file}\")\n",
" \n",
" "
]
},
Expand Down
3 changes: 2 additions & 1 deletion docs/release_notes.rst
Expand Up @@ -34,7 +34,8 @@ Data Coverage
CEMS instead of the annual files. Integrates CEMS through 2023Q3. See issue
:issue:`2973` & PR :pr:`3096`.
* Began integration of PHMSA gas distribution and transmission tables into PUDL,
extracting raw data from 1990-present. See epic :issue:`2848`, and PR :pr:`3242`.
extracting raw data from 1990-present. See epic :issue:`2848`, and PRs :pr:`2932`,
:pr:`3242`.
* Updated the EIA Bulk Electricity data archive so that the available data now runs
through 2023-10-01. See :pr:`3252`. Also added this dataset to the set of data that
will automatically generate archives each month. See `This PUDL Archiver PR
Expand Down
1 change: 1 addition & 0 deletions src/pudl/extract/phmsagas.py
Expand Up @@ -55,6 +55,7 @@ def process_final_page(self, df, page):
"raw_phmsagas__yearly_transmission_gathering_summary_by_commodity",
"raw_phmsagas__yearly_miles_of_gathering_pipe_by_nps",
"raw_phmsagas__yearly_miles_of_transmission_pipe_by_nps",
"raw_phmsagas__yearly_inspections_and_assessments",
)

phmsagas_raw_dfs = excel.raw_df_factory(Extractor, name="phmsagas")
Expand Down

0 comments on commit 72ce726

Please sign in to comment.