# Testing column mapping for Excel spreadsheets
This notebook is designed to quickly test column maps for Excel spreadsheets. It will flag the following:
1) Column names that are listed in the column map but don't exist in the actual data
2) Column names present in the raw data but not mapped
3) Invalid inputs for pages and files in `page_map.csv` and `file_map.csv`

First, select the raw dataset you're going to be mapping and locate all relevant file directories.

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which (per the
# IPython docs) re-imports modules before running each cell.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import importlib
from pathlib import Path
import pandas as pd
from zipfile import ZipFile
import sys
import types

import pudl
from pudl.workspace.datastore import ZenodoDoiSettings
from pudl.extract.phmsagas import Extractor

# Name the logger after this module via the __name__ variable rather than the
# literal string "__name__".
logger = pudl.logging_helpers.get_logger(__name__)

In [None]:
# Dataset whose mapping CSVs are being checked.
dataset = "phmsagas"

pudl_paths = pudl.workspace.setup.PudlPaths()
ds = pudl.workspace.datastore.Datastore(pudl_paths.pudl_input)

# The DOI is used as a directory name below, so "/" is replaced with "-".
doi_path = getattr(ZenodoDoiSettings(), dataset).replace("/", "-")

# Path to the raw data.
data_path = os.path.join(pudl_paths.pudl_input, dataset, doi_path)

# Path to the mapping CSVs, which sit next to the pudl.package_data module.
package_data_dir = Path(pudl.package_data.__file__).parent
map_path = os.path.join(package_data_dir, dataset)

## File Check

First, validate the file map. Make sure all file names included in the CSV actually exist in the raw data.

In [None]:
# Load the file map. Its row labels are the tables (pages) and its column
# labels are the years; each cell holds a raw file name or "-1" for no data.
file_map = pd.read_csv(
    os.path.join(map_path, "file_map.csv"), index_col=0, comment="#"
)
raw_files = os.listdir(data_path)

# For each zip archive in the raw data directory, record the names of the
# files it contains. The list-of-dicts shape ({archive_path: member_names}) is
# kept because later cells look archives up through it.
all_files = []
for raw_file_name in raw_files:
    if raw_file_name.endswith("zip"):
        file_path = os.path.join(data_path, raw_file_name)
        # Use a context manager so the archive handle is closed after listing.
        with ZipFile(file_path) as zipped:
            all_files.append({file_path: zipped.namelist()})

# Flatten the archive listings into a set so each mapped name is checked by
# exact match. A substring search over str(all_files) would accept partial
# names and anything that happens to appear in a directory path.
known_files = {
    member
    for archive_contents in all_files
    for members in archive_contents.values()
    for member in members
}

for table_files in file_map.values.tolist():  # For each table with a list of files
    for file in table_files:  # For each file included in this table
        # Blank cells and the "-1" placeholder mean "no data", not a file name.
        if pd.isna(file) or str(file) == "-1":
            continue
        if str(file) not in known_files:
            logger.warning(f"File '{file}' not found in actual raw data. Check file name.")

Next, read in the page map and skip-rows CSVs. The column map check below uses them to read in the raw data for each table and make sure no columns are missing.

In [None]:
# Both maps share the same layout and read options, so load them the same way:
# page_map.csv gives the sheet to read and skiprows.csv the number of rows to
# skip, for each table and year.
sheet_name, skip_rows = (
    pd.read_csv(os.path.join(map_path, csv_name), index_col=0, comment="#")
    for csv_name in ("page_map.csv", "skiprows.csv")
)

## Column Map Check

Sometimes we don't care about missing raw columns, or we only want to check a particular table. Set parameters here to fine-tune what you're checking.

In [None]:
# Parameters controlling the column map check in the next cell.

# If True, also warn about raw columns that have no mapping.
# If False, only check that mapped columns are found in the raw dataset.
# Useful when a table is split between several pages.
raw_check = True
# Tables to check, compared against the row labels of file_map.csv.
# Leave list empty to check all tables.
table_subset = []
# Years to check, compared against the file_map.csv column labels cast to int.
# Use empty list if you want to check all years, otherwise supply a list of integers or a range.
years_subset = []

In [None]:
def find_zip(file: str, dicts: list[dict[str,str]]) -> str:
    for dic in dicts:
        match = [i for i in dic if file in dic[i]]
        if match == []:
            continue
        return match[0]


# For every table (row of file_map) and year (column of file_map), read the
# raw sheet and compare its column names with the column map for that table.
for page in file_map.index:
    if table_subset and page not in table_subset:
        continue
    column_maps = pd.read_csv(
        os.path.join(map_path, "column_maps", f"{page}.csv"), index_col=0, comment="#"
    )
    for index in file_map.columns:
        if years_subset and int(index) not in years_subset:
            continue
        file = file_map.loc[page, index]  # Get file name
        if str(file) == "-1":
            logger.info(f"No data for year {index}")
            continue
        logger.info(f"Checking column maps for {page}, {index}")
        zip_path = find_zip(file, all_files)
        if zip_path is None:
            # Without this guard ZipFile(None) fails with an unhelpful error.
            logger.warning(
                f"File '{file}' for {page}, {index} not found in any raw zip archive. Skipping."
            )
            continue
        # Open the archive and the member together so both handles are closed
        # once the sheet has been read.
        with ZipFile(zip_path) as archive, archive.open(file) as excel_file:
            raw_file = pd.read_excel(
                excel_file,
                sheet_name=sheet_name.loc[page, index],
                skiprows=skip_rows.loc[page, index],
            )
        # Apply the pre-processing step used before the column rename.
        raw_file = pudl.helpers.simplify_columns(raw_file)
        raw_columns = raw_file.columns  # Get raw column names
        mapped_columns = column_maps.loc[:, index].dropna()
        raw_missing = [col for col in raw_columns if col not in mapped_columns.values]
        mapped_missing = [col for col in mapped_columns if col not in raw_columns.values]
        if raw_missing and raw_check:
            logger.warning(f"Raw columns {raw_missing} from {file} are not mapped.")
        if mapped_missing:
            logger.warning(f"Mapped columns {mapped_missing} do not exist in the raw data file {file}")
                
        

Go back and fix any incorrectly labelled columns. Then rerun the cell above until all columns are correctly labelled.

## Extractor Check

In [None]:
## SETTINGS FOR EXTRACTOR
extractor_phmsagas = Extractor(ds=ds)

# Recommend setting the log level to WARNING here so only problems are reported.
pudl.logging_helpers.configure_root_logger(loglevel="WARNING")

# Years to extract. Narrow this range if you want to restrict the years.
working_years = list(range(1990, 2023))

# Pages to extract. Give a list of page names to restrict the extraction;
# leave the list empty to get the standard pages.
pages_you_want_to_extract = []

# Capture the standard pages before get_all_pages is overridden below.
all_pages = extractor_phmsagas._metadata.get_all_pages()


def _new_page_getter(self):
    """Return the user-selected pages, or the standard pages if none were selected."""
    return pages_you_want_to_extract or all_pages


# Override get_all_pages on the metadata object. Bind the function to that
# same object so ``self`` is the instance the method is attached to.
extractor_phmsagas._metadata.get_all_pages = types.MethodType(
    _new_page_getter, extractor_phmsagas._metadata
)

In [None]:
## RUN THE EXTRACTOR
# Run the extractor configured in the settings cell over working_years and
# keep the result in extracted_dfs for inspection.
extracted_dfs = extractor_phmsagas.extract(year=working_years)