## Load the data

In [1]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, organize_cols, standardize_phone_column, fix_na, analyze_missing_values
import numpy as np
import re

raw_df = defs.load_asset_value(AssetKey("raw_phmsagas__yearly_distribution"))

No dagster instance configuration file (dagster.yaml) found at /Users/sam/Documents/pudl-data/dagster_home. Defaulting to loading and storing all metadata with /Users/sam/Documents/pudl-data/dagster_home. If this is the desired behavior, create an empty dagster.yaml file in /Users/sam/Documents/pudl-data/dagster_home.
2024-10-05 22:18:17 -0400 - dagster - DEBUG - system - Loading file from: /Users/sam/Documents/pudl-data/dagster_home/storage/raw_phmsagas__yearly_distribution using PickledObjectFilesystemIOManager...


Earliest reported year available in asset: 94.0
Latest reported year available in asset: 2023.0


array([1990., 1991., 1992., 1993.,   94.,   95.,   96.,   97., 1998.,
       1999., 2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007.,
       2008., 2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016.,
       2017., 2018., 2019., 2020., 2021., 2022., 2023.])

## core_phmsagas__yearly_distribution_operators

In [7]:
df = raw_df[[
    "report_date",
    "report_number",
    "report_submission_type",
    "report_year",
    "operator_id_phmsa",
    "operator_name_phmsa",
    "office_address_street",
    "office_address_city",
    "office_address_state",
    "office_address_zip",
    "office_address_county",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
    "percent_unaccounted_for_gas",
    "additional_information",
    "preparer_email",
    "preparer_fax",
    "preparer_name",
    "preparer_phone",
    "preparer_title",
    # Adding these fields temporarily for transformation cleanup
    "headquarters_address_city",
    "headquarters_address_county",
    "headquarters_address_state",
    "headquarters_address_street",
    "headquarters_address_zip"
]]

In [9]:
# Clean up office zip codes
df["office_address_zip"] = zero_pad_numeric_string(df["office_address_zip"], 5)
df.head()

# Convert columns to ints
# Would have included "federal_land_leaks_repaired_or_scheduled" in this list but there were a couple rows with decimal values
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed"
]
df[columns_to_convert] = df[columns_to_convert].astype("Int64")

# Ensure all "report_year" values have four digits
mask = df["report_year"] < 100
df.loc[mask, "report_year"] = df.loc[mask, "report_year"].astype(int).astype(str).str[:4].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["office_address_zip"] = zero_pad_numeric_string(df["office_address_zip"], 5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_convert] = df[columns_to_convert].astype("Int64")


### Operator Table Tasks

#### Standardize NAs

In [16]:
# Fill NA values with zeroes because these columns are simply counts.
# Note that "excavation_damage..." columns should sum up to the value in "excavation_damage_total". However, many rows
# (on the scale of thousands) do not actually sum up to "excavation_damage_total".
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled"
]
df[columns_to_fill] = df[columns_to_fill].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill] = df[columns_to_fill].fillna(0)


#### Standardize usage of office vs HQ address over time

- Make sure to use "clean_eia_counties" from helpers at some point
- Also maybe "zero_pad_numeric_string"


In [18]:
# Selecting the relevant columns
address_cols = [
    'operator_id_phmsa', 'report_year',
    'office_address_street', 'office_address_city', 'office_address_state', 'office_address_zip',
    'headquarters_address_street', 'headquarters_address_city', 'headquarters_address_state', 'headquarters_address_zip'
]

df_addresses = df[address_cols]

In [19]:
df_addresses.dtypes

operator_id_phmsa                       Int64
report_year                             Int64
office_address_street                  object
office_address_city                    object
office_address_state                   object
office_address_zip             string[python]
headquarters_address_street            object
headquarters_address_city              object
headquarters_address_state             object
headquarters_address_zip               object
dtype: object

In [17]:
# Create a concatenated address string for easier comparison
df_addresses['office_address'] = df_addresses['office_address_street'] + ', ' + df_addresses['office_address_city'] + ', ' + df_addresses['office_address_state'] + ' ' + df_addresses['office_address_zip']
df_addresses['headquarters_address'] = df_addresses['headquarters_address_street'] + ', ' + df_addresses['headquarters_address_city'] + ', ' + df_addresses['headquarters_address_state'] + ' ' + df_addresses['headquarters_address_zip']

# Group by operator and sort by year
df_addresses = df_addresses.sort_values(by=['operator_id_phmsa', 'report_year'])

# Create flags for overlap or switch
df_addresses['address_overlap'] = df_addresses['office_address'] == df_addresses['headquarters_address']
df_addresses['address_switch'] = df_addresses.groupby('operator_id_phmsa').apply(
    lambda x: (x['office_address'].shift() == x['headquarters_address']) & 
              (x['headquarters_address'].shift() == x['office_address'])
).reset_index(drop=True)

# Display the results where overlap or switch occurred
df_overlap_switch = df_addresses[(df_addresses['address_overlap'] == True) | (df_addresses['address_switch'] == True)]


TypeError: unsupported operand type(s) for +: 'int' and 'str'

#### Standardize case for city, county, operator name, etc.

#### Standardize telephone and fax number format and drop (000)-000-0000


In [21]:
df = standardize_phone_column(df, "preparer_phone")
df[df.preparer_phone.notna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].apply(standardize_phone_number)


Unnamed: 0,report_date,report_number,report_submission_type,report_year,operator_id_phmsa,operator_name_phmsa,office_address_street,office_address_city,office_address_state,office_address_zip,office_address_county,excavation_damage_excavation_practices,excavation_damage_locating_practices,excavation_damage_one_call_notification,excavation_damage_other,excavation_damage_total,excavation_tickets,services_efv_in_system,services_efv_installed,services_shutoff_valve_in_system,services_shutoff_valve_installed,federal_land_leaks_repaired_or_scheduled,percent_unaccounted_for_gas,additional_information,preparer_email,preparer_fax,preparer_name,preparer_phone,preparer_title,headquarters_address_city,headquarters_address_county,headquarters_address_state,headquarters_address_street,headquarters_address_zip
21069,2009-12-03 00:00:00,20081542,,2004,7130,HASKELL PUBLIC WORKS AUTH,"P.O. BOX 9, 109 S. BROADWAY",HASKELL,OK,74436,MUSKOGEE,0,0,0,0,0,0,0,0,0,0,0.0,3.0,,9184825518.0,,MARILYNN TUCKER,918-482-3148,TOWNOFHASKELL@VALORNET.COM,,,,,
21070,2009-12-16 00:00:00,20081562,,2004,19090,"TANGIPAHOA, VILLAGE OF",12616 JACKSON ST.,TANGIPAHOA,LA,70465,TANGIPAHOA,0,0,0,0,0,0,0,0,0,0,0.0,0.0,,9852298300.0,,"FREDRIKA M. RUFFIN, CLERK",985-229-4423,ROTPD@BELLSOUTH.NET,,,,,
21071,2009-07-08 00:00:00,20081404,,2004,18160,"SEALY GAS DEPT, CITY OF",415 MAIN ST.,SEALY,TX,77474,AUSTIN,0,0,0,0,0,0,0,0,0,0,0.0,1.0,,9798853511.0,,"JOHN MARESH, CITY MANAGER",979-885-3513,JMARESH@CI.SEALY.TX.US,,,,,
21073,2005-03-21 00:00:00,20041246,,2004,5040,CITY OF FAIRHOPE,555 SOUTH SECTION STREET,FAIRHOPE,AL,36532,BALDWIN,0,0,0,0,0,0,0,0,0,0,0.0,3.0,,2519288003.0,,STEVE SEAY SUPERINTENDENT,251-990-0388,STEVE.SEAY@COFAIRHOPE.COM,,,,,
21074,2005-02-24 00:00:00,20040546,,2004,12510,VILLAGE OF MILFORD,15 S WEST AVE,MILFORD,IL,60953,IROQUOIS,0,0,0,0,0,0,0,0,0,0,0.0,4.0,,8158894249.0,,TRICIA HUFF,815-889-5589,MILFORDCITY@DTNSPEED.NET,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50640,2024-03-15 11:54:14,20241206,INITIAL,2023,40711,BRIGHTMARK KIRKMAN RNG LLC,9720 CYPRESSWOOD DRIVE,HOUSTON,TX,77070,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,,(000) 000-0000,schuyler.dickerson@everlineus.com,Schuyler Dickerson,832-386-5484,Regulatory Consultant,SAN FRANCISCO,,CA,1725 MONTGOMERY ST FL 3,94111
50641,2024-03-15 12:56:29,20241215,INITIAL,2023,40712,BRIGHTMARK VAN ESS RNG LLC,9720 CYPRESSWOOD DRIVE,HOUSTON,TX,77070,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,,(000) 000-0000,schuyler.dickerson@everlineus.com,Schuyler Dickerson,832-386-5484,Regulator Consultant,SAN FRANCISCO,,CA,1725 MONTGOMERY ST FL 3,94111
50642,2024-03-15 12:50:23,20241214,INITIAL,2023,40713,BRIGHTMARK ROORDA RNG LLC,3405 Eide Drive,Sioux Falls,SD,57107,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,,(000) 000-0000,schuyler.dickerson@everlineus.com,Schuyler Dickerson,832-386-5484,Regulatory Consultant,SAN FRANCISCO,,CA,1725 MONTGOMERY ST FL 3,94111
50643,2024-07-15 15:54:36,20241437,INITIAL,2023,40732,ABBS VALLEY PIPELINE,P.O. BOX 225,PINEVILLE,WV,24874,WYOMING,0,0,0,0,0,0,0,0,0,0,0.0,-0.005,,(000) 000-0000,sgillespie@bgpartners.net,Stevens Gillespie,703-963-0048,Agent,CORPUS CHRISTI,,TX,615 N UPPER BROADWAY STE. 525,78401
