## Load the data

In [None]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, standardize_phone_column, standardize_na_values
import numpy as np
import re
from pudl.metadata.dfs import POLITICAL_SUBDIVISIONS

In [None]:
# Load the value of the raw PHMSA gas yearly distribution asset through the
# PUDL Dagster definitions; every cell below starts from this frame.
raw_df = defs.load_asset_value(AssetKey("raw_phmsagas__yearly_distribution"))

In [None]:
# Count-style columns that appear, in this order, in both the keep list and
# the integer-cast list below.
_COUNT_COLUMNS = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
]

# Column groupings that drive the operator-table cleaning cell:
#   columns_to_keep            - columns selected from the raw table
#   columns_to_convert_to_ints - columns cast to nullable Int64
#   capitalization_exclusion   - string columns that must not be title-cased
YEARLY_DISTRIBUTION_OPERATORS_COLUMNS = {
    "columns_to_keep": [
        "report_date",
        "report_number",  # not in pudl/metadata/fields.py
        "report_submission_type",  # not in pudl/metadata/fields.py
        "report_year",
        # None of the columns below are in pudl/metadata/fields.py
        "operator_id_phmsa",
        "operator_name_phmsa",
        "office_address_street",
        "office_address_city",
        "office_address_state",
        "office_address_zip",
        "office_address_county",
        "headquarters_address_street",
        "headquarters_address_city",
        "headquarters_address_state",
        "headquarters_address_zip",
        "headquarters_address_county",
        *_COUNT_COLUMNS,
        "federal_land_leaks_repaired_or_scheduled",
        "percent_unaccounted_for_gas",
        "additional_information",
        "preparer_email",
        "preparer_fax",
        "preparer_name",
        "preparer_phone",
        "preparer_title",
    ],
    "columns_to_convert_to_ints": [
        "report_year",
        "report_number",
        "operator_id_phmsa",
        *_COUNT_COLUMNS,
    ],
    "capitalization_exclusion": ["headquarters_address_state", "office_address_state"],
}

In [None]:
# Consolidated cleaning pipeline for the yearly distribution operators table.
# Work on an independent copy that holds only the operator-level columns.
df = raw_df.loc[
    :, YEARLY_DISTRIBUTION_OPERATORS_COLUMNS["columns_to_keep"]
].copy()

# Standardize NAs
df = standardize_na_values(df)

# Initial string cleaning
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].str.strip()

# Specify the columns to convert to integer type
cols_to_convert = YEARLY_DISTRIBUTION_OPERATORS_COLUMNS[
    "columns_to_convert_to_ints"
]

# Fill NaN values with pd.NA, then cast to "Int64" nullable integer type
df[cols_to_convert] = df[cols_to_convert].fillna(pd.NA).astype("Int64")

# Ensure all "report_year" values have four digits
mask = df["report_year"] < 100

# Convert 2-digit years to appropriate 4-digit format (assume cutoff at year 50)
# We could also use the first 4 digits of the "report_number" but there was at least one anomaly here with an invalid year
# Years 00-49 become 20xx and years 50-99 become 19xx. The century has to be
# ADDED to the two-digit year in both branches; replacing the year itself with
# 1900 before adding 2000 would yield 3900 for every year >= 50.
two_digit_years = df.loc[mask, "report_year"]
df.loc[mask, "report_year"] = (two_digit_years + 1900).where(
    two_digit_years >= 50, two_digit_years + 2000
)

# Standardize case for city, county, operator name, etc.
# Capitalize the first letter of each word in a list of columns
cap_cols = df.select_dtypes(include=["object"]).columns.difference(
    YEARLY_DISTRIBUTION_OPERATORS_COLUMNS["capitalization_exclusion"]
)
for col in cap_cols:
    df[col] = df[col].str.title()

# Standardize state abbreviations
# Map full state names to their codes ...
state_to_abbr = {
    x.subdivision_name: x.subdivision_code
    for x in POLITICAL_SUBDIVISIONS.itertuples()
    if x.country_code == "USA" and x.subdivision_type == "state"
}
# ... and map every code to itself so already-abbreviated values are kept.
state_to_abbr.update(
    {
        x.subdivision_code: x.subdivision_code
        for x in POLITICAL_SUBDIVISIONS.itertuples()
        if x.country_code == "USA" and x.subdivision_type == "state"
    }
)
valid_state_codes = set(state_to_abbr.values())

for state_col in ["headquarters_address_state", "office_address_state"]:
    standardized_states = df[state_col].str.strip().replace(state_to_abbr)
    # Validate the values AFTER replacement. Checking the original column
    # would null out every full state name that was just mapped to its code.
    df[state_col] = standardized_states.where(
        standardized_states.isin(valid_state_codes), pd.NA
    )

# Standardize zip codes
df["office_address_zip"] = zero_pad_numeric_string(
    df["office_address_zip"], n_digits=5
)
df["headquarters_address_zip"] = zero_pad_numeric_string(
    df["headquarters_address_zip"], n_digits=5
)

# Standardize telephone and fax number format and drop (000)-000-0000
df = standardize_phone_column(df, ["preparer_phone", "preparer_fax"])

In [None]:
# Display the first rows of the current operator table as a quick sanity check.
df.head()

In [None]:
# Peek at the first values of the unaccounted-for gas percentage column.
df["percent_unaccounted_for_gas"].head()

In [None]:
# Share of the non-zero percent_unaccounted_for_gas values that are negative.
# The counts are computed here rather than borrowed from a later cell, so the
# notebook still runs top-to-bottom without a NameError.
pct_unaccounted = df["percent_unaccounted_for_gas"]
positive_count = (pct_unaccounted > 0).sum()
negative_count = (pct_unaccounted < 0).sum()
print(negative_count / (positive_count + negative_count))

In [None]:
import matplotlib.pyplot as plt

# Count how many percent_unaccounted_for_gas values fall on each side of zero.
pct_unaccounted = df["percent_unaccounted_for_gas"]
positive_count = (pct_unaccounted > 0).sum()
negative_count = (pct_unaccounted < 0).sum()

# Pair each bar label with its count, then split them for plotting.
sign_counts = {"Positive": positive_count, "Negative": negative_count}
labels = list(sign_counts)
counts = list(sign_counts.values())

# Draw a two-bar chart comparing the positive and negative counts.
plt.bar(labels, counts, color=["green", "red"])
plt.title("Distribution of Positive vs Negative Values")
plt.xlabel("Value Type")
plt.ylabel("Count")
plt.xticks(rotation=0)  # Rotate labels if necessary
plt.show()


In [None]:
# Show the rows whose unaccounted-for gas percentage is negative.
df.loc[df["percent_unaccounted_for_gas"] < 0]

In [None]:
# List the column labels currently present in the operator table.
df.columns

## core_phmsagas__yearly_distribution_operators

In [None]:
# Select the operator-level columns from the raw table.
# Take an explicit copy so that the in-place column assignments in the cells
# below operate on an independent frame and do not trigger pandas'
# SettingWithCopyWarning.
df = raw_df.loc[
    :,
    [
        "report_date",
        "report_number",
        "report_submission_type",
        "report_year",
        "operator_id_phmsa",
        "operator_name_phmsa",
        "office_address_street",
        "office_address_city",
        "office_address_state",
        "office_address_zip",
        "office_address_county",
        "headquarters_address_street",
        "headquarters_address_city",
        "headquarters_address_state",
        "headquarters_address_zip",
        "headquarters_address_county",
        "excavation_damage_excavation_practices",
        "excavation_damage_locating_practices",
        "excavation_damage_one_call_notification",
        "excavation_damage_other",
        "excavation_damage_total",
        "excavation_tickets",
        "services_efv_in_system",
        "services_efv_installed",
        "services_shutoff_valve_in_system",
        "services_shutoff_valve_installed",
        "federal_land_leaks_repaired_or_scheduled",
        "percent_unaccounted_for_gas",
        "additional_information",
        "preparer_email",
        "preparer_fax",
        "preparer_name",
        "preparer_phone",
        "preparer_title",
    ],
].copy()

In [None]:
# Cast identifier and count columns to the nullable "Int64" dtype.
# "federal_land_leaks_repaired_or_scheduled" is deliberately left out of this
# list because a couple of rows hold decimal values.
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
]
for int_col in columns_to_convert:
    df[int_col] = df[int_col].astype("Int64")

In [None]:
# Any "report_year" below 100 is a two-digit year that needs a century.
mask = df["report_year"] < 100

# Pivot at 50: 00-49 are read as 20xx, 50-99 as 19xx.
# The first 4 digits of "report_number" were considered as an alternative
# source, but at least one report number carries an invalid year.
two_digit_years = df.loc[mask, "report_year"]
df.loc[mask, "report_year"] = (two_digit_years + 1900).where(
    two_digit_years >= 50, two_digit_years + 2000
)

### Operator Table Tasks

#### Standardize NAs

In [None]:
# These columns are plain counts, so a missing value is filled with zero.
# Caveat: the individual "excavation_damage..." columns are supposed to add up
# to "excavation_damage_total", but thousands of rows do not actually do so.
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
]
for count_col in columns_to_fill:
    df[count_col] = df[count_col].fillna(0)

# Run the shared PUDL helper that standardizes NA-like values (bad strings)
df = standardize_na_values(df)

#### Standardize case for city, county, operator name, etc.

In [None]:
# Title-case every object-type column except the state columns, which are
# handled separately.
exclude_columns = ["headquarters_address_state", "office_address_state"]
title_case_cols = df.select_dtypes(include=["object"]).columns.difference(
    exclude_columns
)
df[title_case_cols] = df[title_case_cols].apply(lambda col: col.str.title())

In [None]:
# Standardize state abbreviations
# Collect the USA rows of type "state" once, then build the lookup from them.
us_states = [
    row
    for row in POLITICAL_SUBDIVISIONS.itertuples()
    if row.country_code == "USA" and row.subdivision_type == "state"
]
# Full state name -> abbreviation
state_to_abbr = {row.subdivision_name: row.subdivision_code for row in us_states}
# Abbreviation -> itself, so values that are already codes are recognized too
state_to_abbr.update(
    {row.subdivision_code: row.subdivision_code for row in us_states}
)

def standardize_state(state):
    """Return the abbreviation for a state name or code, or NaN if unknown.

    Missing inputs are returned unchanged. Any other input is stripped of
    surrounding whitespace and looked up in the module-level ``state_to_abbr``
    mapping; a value that does not resolve to one of that mapping's
    abbreviations becomes ``np.nan``.
    """
    if pd.isna(state):
        return state
    cleaned = state.strip()
    candidate = state_to_abbr.get(cleaned, cleaned)
    return candidate if candidate in state_to_abbr.values() else np.nan

In [None]:
# Normalize both state columns with the same element-wise helper.
for state_col in ("headquarters_address_state", "office_address_state"):
    df[state_col] = df[state_col].apply(standardize_state)

In [None]:
# Trim all the object-type columns
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).applymap(lambda x: x.strip() if isinstance(x, str) else x)

#### Standardize telephone and fax number format and drop (000)-000-0000


In [None]:
# Run the PUDL phone-number helper over the preparer phone and fax columns.
df = standardize_phone_column(df, ["preparer_phone", "preparer_fax"])

### Other cleanup

In [None]:
# Standardize zip codes
# Zero-pad both zip code columns to five digits with the PUDL helper.
for zip_col in ("office_address_zip", "headquarters_address_zip"):
    df[zip_col] = zero_pad_numeric_string(df[zip_col], n_digits=5)

In [None]:
# Strip whitespace from all object (string) columns
# Remove surrounding whitespace from the string values in every object
# column; non-string entries are left exactly as they are.
def _strip_if_str(value):
    return value.strip() if isinstance(value, str) else value


string_frame = df.select_dtypes(include=["object"])
df[string_frame.columns] = string_frame.apply(lambda col: col.map(_strip_if_str))


Below is code that can be used to analyze missing values in a dataset:

In [None]:

def analyze_missing_values(
    df: pd.DataFrame, custom_missing_values: list[str] = None
) -> list[str]:
    """Analyze columns of a DataFrame for missing or invalid values.

    PLEASE NOTE: No calls to this method should be included in any final
    transformation scripts. This is purely for analysis and does not perform
    any data transformation or cleaning.

    This function checks each column for missing or custom missing values
    and logs a summary of the findings for string (object), numeric, and
    datetime columns.

    Args:
        df: The DataFrame to analyze.
        custom_missing_values: Optional list of custom values to consider
            as "missing" (e.g., empty strings, specific strings like "NA",
            "NULL", etc.). If not provided, defaults to a standard set.

    Returns:
        exception_cols: List of names of columns that couldn't be analyzed
            due to a caught exception.
    """
    nan_cols = []
    exception_cols = []

    # Use a default set of custom missing values if none are provided
    if custom_missing_values is None:
        custom_missing_values = [
            "",
            " ",
            "NA",
            "N/A",
            "NULL",
            "-",
            "None",
            "NaN",
            "?",
            "*",
            "#",
        ]

    # Analyze columns for missing values
    for col in df.columns:
        try:
            logger.info(f"Analyzing column: {col}")

            # Get the column values
            col_data = df[col]

            # Check if the column is of string (object) type
            if col_data.dtype == "object":
                # Count rows where the value is NaN, None, empty string, or custom missing values
                none_count = col_data.isna().sum()  # Count None (NaN)
                empty_string_count = (
                    col_data.str.strip() == ""
                ).sum()  # Count empty strings
                custom_missing_count = col_data.isin(
                    custom_missing_values
                ).sum()  # Count custom missing values

                total_nan_count = none_count + empty_string_count + custom_missing_count

                if total_nan_count > 0:
                    nan_cols.append(col)

                # Output counts
                logger.info(f"Column '{col}' is a string type.")
                if none_count > 0:
                    logger.warning(f"Rows with None values: {none_count}")
                    logger.warning(df[df[col].isna()].head())
                if empty_string_count > 0:
                    logger.warning(f"Rows with empty strings: {empty_string_count}")
                    logger.warning(df[df[col].str.strip() == ""].head())
                if custom_missing_count > 0:
                    logger.warning(
                        f"Rows with custom missing values: {custom_missing_count}"
                    )
                    logger.warning(df[df[col].isin(custom_missing_values)].head())
                if (
                    none_count == 0
                    and empty_string_count == 0
                    and custom_missing_count == 0
                ):
                    logger.info("Found nothing worth reporting here")

            # Check if the column is numeric (int or float)
            elif pd.api.types.is_numeric_dtype(col_data):
                # Count NA values in the column
                na_count = col_data.isna().sum()
                # Count custom missing values in numeric columns (if applicable)
                custom_missing_numeric_count = col_data.isin(
                    [0]
                ).sum()  # Assuming 0 is considered a missing value

                if na_count > 0 or custom_missing_numeric_count > 0:
                    nan_cols.append(col)

                # Handle the non-NA data for further analysis
                col_data_cleaned = col_data.dropna()

                if not col_data_cleaned.empty:
                    # Calculate min and max
                    min_val = col_data_cleaned.min()
                    max_val = col_data_cleaned.max()

                    if min_val < 0 or na_count > 0 or custom_missing_numeric_count > 0:
                        logger.warning(f"Min value: {min_val}")
                        logger.warning(f"Max value: {max_val}")
                    if na_count > 0:
                        logger.warning(f"Rows with NA values: {na_count}")
                        logger.warning(df[df[col].isna()].head())
                    if custom_missing_numeric_count > 0:
                        logger.warning(
                            f"Custom missing values (e.g., 0): {custom_missing_numeric_count}"
                        )
                        logger.warning(df[df[col].isin([0])].head())
                    if (
                        min_val > 0
                        and na_count == 0
                        and custom_missing_numeric_count == 0
                    ):
                        logger.info("Found nothing worth reporting here")
                else:
                    logger.warning(
                        f"Column '{col}' is numeric but contains only NA values."
                    )

            # Check if the column is a datetime type
            elif pd.api.types.is_datetime64_any_dtype(col_data):
                # Count NA values in the datetime column
                na_count = col_data.isna().sum()
                # Assuming custom missing values might be present in string form before conversion
                custom_missing_count = col_data.isin(custom_missing_values).sum()

                if na_count > 0 or custom_missing_count > 0:
                    nan_cols.append(col)

                # Handle the non-NA data for further analysis
                col_data_cleaned = col_data.dropna()

                if not col_data_cleaned.empty:
                    # Output min and max datetime values
                    min_date = col_data_cleaned.min()
                    max_date = col_data_cleaned.max()

                    if na_count > 0 or custom_missing_count > 0:
                        logger.warning(f"Min date: {min_date}")
                        logger.warning(f"Max date: {max_date}")
                        logger.warning(f"Rows with NA values: {na_count}")
                        logger.warning(df[df[col].isna()].head())
                        logger.warning(f"Custom missing values: {custom_missing_count}")
                        logger.warning(df[df[col].isin(custom_missing_values)].head())
                    if na_count == 0 and custom_missing_count == 0:
                        logger.info("Found nothing worth reporting here")
                else:
                    logger.warning(
                        f"Column '{col}' is datetime but contains only NA values."
                    )

            # If the column is of some other type, simply note the type
            else:
                logger.info(f"Column '{col}' is of type {col_data.dtype}.")

        except Exception as e:
            exception_cols.append(col)
            logger.warning(f"Caught exception for column {col}: {e}\n")
            continue

    logger.info(f"Columns with NaNs or custom missing values: {nan_cols}")
    logger.info(f"Columns with exceptions during processing: {exception_cols}")

    return exception_cols
