## Load the data

In [1]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, organize_cols, standardize_phone_column, fix_na, analyze_missing_values, standardize_state_columns
import numpy as np
import re

In [2]:
# Load the raw asset named below through the Dagster definitions object.
raw_asset_key = AssetKey("raw_phmsagas__yearly_distribution")
raw_df = defs.load_asset_value(raw_asset_key)

No dagster instance configuration file (dagster.yaml) found at /Users/sam/Documents/pudl-data/dagster_home. Defaulting to loading and storing all metadata with /Users/sam/Documents/pudl-data/dagster_home. If this is the desired behavior, create an empty dagster.yaml file in /Users/sam/Documents/pudl-data/dagster_home.
2024-10-11 17:22:13 -0400 - dagster - DEBUG - system - Loading file from: /Users/sam/Documents/pudl-data/dagster_home/storage/raw_phmsagas__yearly_distribution using PickledObjectFilesystemIOManager...


## core_phmsagas__yearly_distribution_operators

In [3]:
# Columns kept for the operators table.
operator_columns = [
    "report_date",
    "report_number",
    "report_submission_type",
    "report_year",
    "operator_id_phmsa",
    "operator_name_phmsa",
    "office_address_street",
    "office_address_city",
    "office_address_state",
    "office_address_zip",
    "office_address_county",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
    "percent_unaccounted_for_gas",
    "additional_information",
    "preparer_email",
    "preparer_fax",
    "preparer_name",
    "preparer_phone",
    "preparer_title",
    # Adding these fields temporarily for transformation cleanup
    "headquarters_address_city",
    "headquarters_address_county",
    "headquarters_address_state",
    "headquarters_address_street",
    "headquarters_address_zip",
]

# Take an explicit copy: later cells assign columns into `df`, and assigning into a
# bare selection of `raw_df` is the pattern that triggers pandas'
# SettingWithCopyWarning. The copy also guarantees `raw_df` is left untouched, so
# this cell can be re-run to get a clean starting frame.
df = raw_df[operator_columns].copy()

In [None]:
# Columns stored as nullable integers ("Int64").
# "federal_land_leaks_repaired_or_scheduled" is deliberately left out of this list
# because a couple of rows hold decimal values.
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
]
df[columns_to_convert] = df[columns_to_convert].astype("Int64")


def _to_four_digit_year(year):
    """Expand a two-digit year: values below 50 become 20xx, all others 19xx."""
    century = 2000 if year < 50 else 1900
    return century + year


# Select the rows whose "report_year" is below 100, i.e. not yet a four-digit year.
# The first four digits of "report_number" could have been used instead, but at
# least one report number carries an invalid year.
mask = df["report_year"] < 100
short_years = df.loc[mask, "report_year"]
df.loc[mask, "report_year"] = short_years.apply(_to_four_digit_year)

### Operator Table Tasks

#### Standardize NAs

In [None]:
# These columns are plain counts, so a missing value is replaced with zero.
# Caveat: the individual "excavation_damage_..." columns are supposed to add up to
# "excavation_damage_total", but thousands of rows do not actually satisfy that.
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
]
zero_filled_counts = df[columns_to_fill].fillna(0)
df[columns_to_fill] = zero_filled_counts

# Hand the frame to the PUDL helper that fills in bad strings.
df = fix_na(df)

#### Standardize case for city, county, operator name, etc.

In [None]:
def _map_strings(column, transform):
    """Apply ``transform`` to every ``str`` element of ``column``.

    Non-string elements (numbers, missing values, ...) are returned unchanged.
    The ``.str`` accessor is avoided on purpose: on an object column with mixed
    content it silently turns every non-string element into NaN.
    """
    return column.map(
        lambda value: transform(value) if isinstance(value, str) else value
    )


# Capitalize the first letter of each word in all object-type columns except the
# state columns, which are standardized separately below.
# NOTE(review): this title-cases every remaining object column, which may include
# free-text and e-mail fields such as "preparer_email" - confirm that is intended.
exclude_columns = ['headquarters_address_state', 'office_address_state']
title_case_columns = df.select_dtypes(include=['object']).columns.difference(exclude_columns)
for column_name in title_case_columns:
    df[column_name] = _map_strings(df[column_name], str.title)

# List of state columns to standardize
state_columns_to_standardize = ['headquarters_address_state', 'office_address_state']
df = standardize_state_columns(df, state_columns_to_standardize)

# Trim surrounding whitespace in all object-type columns. The column list is
# recomputed because the helper above returned a new frame. A per-column
# ``Series.map`` is used instead of ``DataFrame.applymap``, which is deprecated in
# recent pandas releases.
for column_name in df.select_dtypes(include=['object']).columns:
    df[column_name] = _map_strings(df[column_name], str.strip)

#### Standardize usage of office vs HQ address over time

- **TODO:** Make sure to use `clean_eia_counties` from `pudl.helpers` at some point.

In [10]:
# Build a cleaned, object-typed copy of each zip code column by passing it through
# the PUDL zero-padding helper with a width of 5.
zip_columns = {
    "office_address_zip": "office_address_zip_clean",
    "headquarters_address_zip": "headquarters_address_zip_clean",
}
for raw_zip_col, clean_zip_col in zip_columns.items():
    padded_zip = zero_pad_numeric_string(df[raw_zip_col], 5)
    df[clean_zip_col] = padded_zip.astype("object")

In [11]:
# Columns needed to compare office and headquarters addresses per operator and year.
address_cols = [
    'operator_id_phmsa', 'report_year',
    'office_address_street', 'office_address_city', 'office_address_state', 'office_address_zip', 'office_address_zip_clean',
    'headquarters_address_street', 'headquarters_address_city', 'headquarters_address_state', 'headquarters_address_zip', 'headquarters_address_zip_clean'
]

# Create the address frame as an explicit copy: later cells add columns to it, and
# adding columns to a bare selection of `df` triggers pandas' SettingWithCopyWarning.
df_addresses = df[address_cols].copy()

In [None]:
def _concat_address(addresses, street, city, state, zip_code):
    """Join four address columns into a single "street, city, state zip" string.

    Args:
        addresses: frame holding the four component columns.
        street, city, state, zip_code: names of the component columns.

    Returns:
        A Series of concatenated address strings. Rows in which all four
        components are missing are returned as missing rather than as a
        placeholder string built from the text form of the nulls, so that
        ``pd.isna`` checks and ``nunique`` counts treat them as absent.
    """
    components = addresses[[street, city, state, zip_code]]
    joined = (
        components[street].astype(str) + ', '
        + components[city].astype(str) + ', '
        + components[state].astype(str) + ' '
        + components[zip_code].astype(str)
    )
    # NOTE: rows with only some components missing still contain the text form of
    # the missing value in the corresponding position.
    return joined.mask(components.isna().all(axis=1))


# Add the concatenated office and headquarters address strings. `assign` returns a
# new frame, so this never writes into a selection of another frame.
df_addresses = df_addresses.assign(
    office_address=_concat_address(
        df_addresses,
        'office_address_street',
        'office_address_city',
        'office_address_state',
        'office_address_zip_clean',
    ),
    headquarters_address=_concat_address(
        df_addresses,
        'headquarters_address_street',
        'headquarters_address_city',
        'headquarters_address_state',
        'headquarters_address_zip_clean',
    ),
)


In [None]:
def standardize_address(address: str) -> str:
    """Normalize an address string so equivalent addresses compare equal.

    The address is upper-cased, post office box spellings ("P O BOX",
    "P.O. BOX", "P. O. BOX", ...) become "PO BOX", the abbreviations "ST." and
    "RD." become "STREET" and "ROAD", commas are removed, and runs of whitespace
    are collapsed to a single space.

    Abbreviations are matched only at the start of a word, so text that merely
    ends in the same letters (for example "WEST." or "1ST.") is left alone.

    Args:
        address: The address to normalize, or a missing value.

    Returns:
        The normalized address, or the input unchanged if it is missing.
    """
    if pd.isna(address):  # Handle missing addresses
        return address
    address = address.upper()
    # Tolerate optional periods and any amount of spacing between the letters.
    address = re.sub(r'\bP\.?\s*O\.?\s*BOX\b', 'PO BOX', address)
    # Word boundaries keep the replacement from firing inside longer words.
    address = re.sub(r'\bST\.', 'STREET', address)
    address = re.sub(r'\bRD\.', 'ROAD', address)
    address = address.replace(',', '')  # Remove commas for consistent comparison
    return re.sub(r'\s+', ' ', address).strip()

# Derive a standardized ("_std") version of each concatenated address column.
std_address_columns = {
    'office_address': 'office_address_std',
    'headquarters_address': 'headquarters_address_std',
}
for source_col, std_col in std_address_columns.items():
    df_addresses[std_col] = df_addresses[source_col].apply(standardize_address)


In [None]:
# Count the distinct office and headquarters addresses reported by each operator.
# The counts use the standardized ("_std") address strings, so spellings that
# differ only in case, commas, spacing, or the abbreviations handled by
# `standardize_address` are not counted as separate addresses.
df_unique_addresses = (
    df_addresses
    .groupby('operator_id_phmsa')
    .agg(
        unique_office_addresses=('office_address_std', 'nunique'),
        unique_headquarters_addresses=('headquarters_address_std', 'nunique'),
    )
    .reset_index()
)

df_unique_addresses.head()

In [None]:
# Per-column non-null counts for operators that have more distinct headquarters
# addresses than office addresses.
has_more_hq_addresses = (
    df_unique_addresses["unique_headquarters_addresses"]
    > df_unique_addresses["unique_office_addresses"]
)
df_unique_addresses.loc[has_more_hq_addresses].count()

In [None]:
# Preview operators that have more distinct headquarters addresses than office
# addresses.
hq_exceeds_office = (
    df_unique_addresses["unique_headquarters_addresses"]
    > df_unique_addresses["unique_office_addresses"]
)
df_unique_addresses.loc[hq_exceeds_office].head()

In [None]:
# Distinct headquarters address strings reported by operator 728.
is_operator_728 = df_addresses["operator_id_phmsa"] == 728
df_addresses.loc[is_operator_728, "headquarters_address"].unique()

In [None]:
# Distinct office address strings reported by operator 728.
rows_for_operator_728 = df_addresses["operator_id_phmsa"] == 728
df_addresses.loc[rows_for_operator_728, "office_address"].unique()

In [None]:
# Print the headquarters addresses of operator 728 without truncating the strings.
# The context manager lifts the column-width limit only for this block and then
# restores whatever value was set before, even if printing raises, instead of
# resetting the option to the pandas default.
with pd.option_context('display.max_colwidth', None):
    print(df_addresses[df_addresses.operator_id_phmsa == 728][["headquarters_address"]])

In [20]:
# # Group by operator and sort by year
# df_addresses = df_addresses.sort_values(by=['operator_id_phmsa', 'report_year'])

# # Create flags for overlap or switch
# df_addresses['address_overlap'] = df_addresses['office_address'] == df_addresses['headquarters_address']
# df_addresses['address_switch'] = df_addresses.groupby('operator_id_phmsa').apply(
#     lambda x: (x['office_address'].shift() == x['headquarters_address']) & 
#               (x['headquarters_address'].shift() == x['office_address'])
# ).reset_index(drop=True)

# # Display the results where overlap or switch occurred
# df_overlap_switch = df_addresses[(df_addresses['address_overlap'] == True) | (df_addresses['address_switch'] == True)]


#### Standardize telephone and fax number format and drop (000)-000-0000


In [21]:
# df = standardize_phone_column(df, "preparer_phone")
# df[df.preparer_phone.notna()]

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Report years in which at least one row has a non-missing "preparer_fax".
has_fax_value = df["preparer_fax"].notna()
df.loc[has_fax_value, "report_year"].unique()

In [None]:
# Distinct "preparer_fax" values that do not contain an "@". Missing values are
# treated as not containing it, so they are kept in the result.
fax_contains_at = df['preparer_fax'].str.contains('@', na=False)
df.loc[~fax_contains_at, 'preparer_fax'].unique()