# Update PUDL to pandas 2.0 #2320
Changes from 52 commits
```diff
@@ -245,7 +245,7 @@ def drop_invalid_rows(self, df):
                 "`drop_invalid_rows`. Adding empty columns for: "
                 f"{missing_required_cols}"
             )
-            df.loc[:, missing_required_cols] = pd.NA
+            df.loc[:, list(missing_required_cols)] = pd.NA
         return super().drop_invalid_rows(df)
```

> **Review comment:** Can't use sets as indexers in pandas 2.0.
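A minimal sketch of the failure mode, on a toy frame with hypothetical column names. pandas 2.0 rejects sets as `.loc` indexers outright, while a list of the same labels works:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
missing_required_cols = {"b", "c"}  # hypothetical set of absent required columns

# pandas 2.0 raises: TypeError: Passing a set as an indexer is not supported.
# df.loc[:, missing_required_cols] = pd.NA

# Converting the set to a list restores the old behavior:
df.loc[:, list(missing_required_cols)] = pd.NA
```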
```diff
@@ -423,7 +423,7 @@ def get_utility_most_recent_capacity(pudl_engine) -> pd.DataFrame:
         == gen_caps["report_date"]
     )
     most_recent_gens = gen_caps.loc[most_recent_gens_idx]
-    utility_caps = most_recent_gens.groupby("utility_id_eia").sum()
+    utility_caps = most_recent_gens.groupby("utility_id_eia")["capacity_mw"].sum()
     return utility_caps
```

> **Review comment:** We only want to sum one column, not the whole dataframe.
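A sketch of why the column selection matters, on toy data. pandas 2.0 no longer silently drops columns that can't be summed (the old `numeric_only` behavior), so summing the whole frame can raise on a column like `report_date`:

```python
import pandas as pd

gen_caps = pd.DataFrame({
    "utility_id_eia": [1, 1, 2],
    "report_date": pd.to_datetime(["2021-01-01"] * 3),
    "capacity_mw": [10.0, 20.0, 5.0],
})

# gen_caps.groupby("utility_id_eia").sum() would try to sum report_date too,
# raising a TypeError in pandas 2.0; select the target column instead:
utility_caps = gen_caps.groupby("utility_id_eia")["capacity_mw"].sum()
```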
```diff
@@ -26,7 +26,7 @@
 from pandas._libs.missing import NAType

 import pudl.logging_helpers
-from pudl.metadata.fields import get_pudl_dtypes
+from pudl.metadata.fields import apply_pudl_dtypes, get_pudl_dtypes

 sum_na = partial(pd.Series.sum, skipna=False)
 """A sum function that returns NA if the Series includes any NA values.
```
```diff
@@ -364,22 +364,23 @@ def is_doi(doi):
     return bool(re.match(doi_regex, doi))


-def convert_col_to_datetime(df, date_col_name):
-    """Convert a column in a dataframe to a datetime.
+def convert_col_to_datetime(df: pd.DataFrame, date_col_name: str) -> pd.DataFrame:
+    """Convert a non-datetime column in a dataframe to a datetime64[s].

     If the column isn't a datetime, it needs to be converted to a string type
     first so that integer years are formatted correctly.

     Args:
-        df (pandas.DataFrame): Dataframe with column to convert.
-        date_col_name (string): name of the column to convert.
+        df: Dataframe with column to convert.
+        date_col_name: name of the datetime column to convert.

     Returns:
         Dataframe with the converted datetime column.
     """
-    if pd.api.types.is_datetime64_ns_dtype(df[date_col_name]) is False:
+    if not pd.api.types.is_datetime64_dtype(df[date_col_name]):
         logger.warning(
-            f"{date_col_name} is {df[date_col_name].dtype} column. Converting to datetime."
+            f"{date_col_name} is {df[date_col_name].dtype} column. "
+            "Converting to datetime64[ns]."
         )
         df[date_col_name] = pd.to_datetime(df[date_col_name].astype("string"))
     return df
```

> **Review comment:** Use a more liberal type check here, since what we're really trying to do is detect non-datetime columns (e.g. int, string) to convert.
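A sketch of the distinction between the two checks. pandas 2.0 added non-nanosecond datetime resolutions, so a genuine datetime column is no longer guaranteed to be `datetime64[ns]`, and the stricter check would wrongly flag it for conversion:

```python
import pandas as pd

s = pd.Series(["2020-01-01"]).astype("datetime64[s]")  # second resolution

pd.api.types.is_datetime64_ns_dtype(s)  # False: the unit is "s", not "ns"
pd.api.types.is_datetime64_dtype(s)     # True: any datetime64 unit passes
```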
```diff
@@ -618,17 +619,21 @@ def expand_timeseries(
             f"{fill_through_freq} is not a valid frequency to fill through."
         )
     end_dates["drop_row"] = True
-    df = pd.concat([df, end_dates.reset_index()])
     df = (
-        df.set_index(date_col)
+        pd.concat([df, end_dates.reset_index()])
+        .set_index(date_col)
         .groupby(key_cols)
         .resample(freq)
         .ffill()
         .drop(key_cols, axis=1)
         .reset_index()
     )
-    df = df[df.drop_row.isnull()].drop("drop_row", axis=1).reset_index(drop=True)
-    return df
+    return (
+        df[df.drop_row.isnull()]
+        .drop(columns="drop_row")
+        .reset_index(drop=True)
+        .pipe(apply_pudl_dtypes)
+    )


 def organize_cols(df, cols):
```
```diff
@@ -984,26 +989,19 @@ def convert_to_date(
     return df


-def fix_eia_na(df):
+def fix_eia_na(df: pd.DataFrame) -> pd.DataFrame:
     """Replace common ill-posed EIA NA spreadsheet values with np.nan.

     Currently replaces empty string, single decimal points with no numbers,
     and any single whitespace character with np.nan.

     Args:
-        df (pandas.DataFrame): The DataFrame to clean.
+        df: The DataFrame to clean.

     Returns:
-        pandas.DataFrame: The cleaned DataFrame.
+        DataFrame with regularized NA values.
     """
-    return df.replace(
-        to_replace=[
-            r"^\.$",  # Nothing but a decimal point
-            r"^\s*$",  # The empty string and entirely whitespace strings
-        ],
-        value=np.nan,
-        regex=True,
-    )
+    return df.replace(regex=r"(^\.$|^\s*$)", value=np.nan)


 def simplify_columns(df):
```

> **Review comment:** This was a regression in pandas. See pandas-dev/pandas#54399.
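A sketch of the workaround on a toy frame. Rather than passing a list of regex patterns, which hits the regression linked above, the two patterns are folded into a single alternation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [".", " ", "", "1.5"]})

# Matches a lone decimal point, the empty string, or all-whitespace strings:
cleaned = df.replace(regex=r"(^\.$|^\s*$)", value=np.nan)
# cleaned["x"] -> [NaN, NaN, NaN, "1.5"]
```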
```diff
@@ -1025,14 +1023,18 @@ def simplify_columns(df):
     Todo:
         Update docstring.
     """
-    df.columns = (
-        df.columns.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
-        .str.strip()
-        .str.lower()
-        .str.replace(r"\s+", " ", regex=True)
-        .str.replace(" ", "_")
-    )
-    return df
+    # Do nothing, if empty dataframe (e.g. mocked for tests)
+    if df.shape[0] == 0:
+        return df
+    else:
+        df.columns = (
+            df.columns.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
+            .str.strip()
+            .str.lower()
+            .str.replace(r"\s+", " ", regex=True)
+            .str.replace(" ", "_")
+        )
+        return df


 def drop_tables(engine: sa.engine.Engine, clobber: bool = False):
```
```diff
@@ -1220,11 +1222,11 @@ def generate_rolling_avg(
     # to get the backbone/complete date range/groups
     bones = (
         date_range.merge(groups)
-        .drop("tmp", axis=1)  # drop the temp column
+        .drop(columns="tmp")  # drop the temp column
         .merge(df, on=group_cols + ["report_date"])
         .set_index(group_cols + ["report_date"])
         .groupby(by=group_cols + ["report_date"])
-        .mean()
+        .mean(numeric_only=True)
     )
     # with the aggregated data, get a rolling average
     roll = bones.rolling(window=window, center=True, **kwargs).agg({data_col: "mean"})
```

> **Review comment:** Change in pandas default behavior.
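A sketch of the default-behavior change on toy data. pandas < 2.0 silently excluded non-numeric columns from aggregations like `mean()`; pandas 2.0 raises instead, unless `numeric_only=True` is passed explicitly:

```python
import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b"], "note": ["x", "y", "z"], "val": [1.0, 3.0, 5.0]})

df.groupby("g").mean(numeric_only=True)  # averages only "val"
# df.groupby("g").mean()  # pandas 2.0: TypeError because of "note"
```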
```diff
@@ -1600,7 +1602,7 @@ def convert_df_to_excel_file(df: pd.DataFrame, **kwargs) -> pd.ExcelFile:
     writer = pd.ExcelWriter(bio, engine="xlsxwriter")
     df.to_excel(writer, **kwargs)

-    writer.save()
+    writer.close()

     bio.seek(0)
     workbook = bio.read()
```

> **Review comment:** Change in API?
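Answering the question above: `ExcelWriter.save()` was deprecated in pandas 1.5 and removed in 2.0; `close()` (or the context-manager form, which calls it on exit) finalizes the workbook. A minimal sketch:

```python
import io

import pandas as pd

bio = io.BytesIO()

# Exiting the context manager closes the writer and writes the workbook:
with pd.ExcelWriter(bio, engine="xlsxwriter") as writer:
    pd.DataFrame({"a": [1]}).to_excel(writer)

bio.seek(0)
workbook = bio.read()
```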
```diff
@@ -663,7 +663,7 @@ def to_pandas_dtype(self, compact: bool = False) -> str | pd.CategoricalDtype:
             return "float32"
         return FIELD_DTYPES_PANDAS[self.type]

-    def to_sql_dtype(self) -> sa.sql.visitors.VisitableType:
+    def to_sql_dtype(self) -> type:
         """Return SQLAlchemy data type."""
         if self.constraints.enum and self.type == "string":
             return sa.Enum(*self.constraints.enum)
```

> **Review comment:** To work with SQLAlchemy 2.0.
```diff
@@ -28,7 +28,7 @@
     "year": pa.int32(),
 }

-FIELD_DTYPES_SQL: dict[str, sa.sql.visitors.VisitableType] = {
+FIELD_DTYPES_SQL: dict[str, type] = {
     "boolean": sa.Boolean,
     "date": sa.Date,
     # Ensure SQLite's string representation of datetime uses only whole seconds:
```

> **Review comment:** For compatibility with SQLAlchemy 2.0.
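For context on both annotation changes above: `sa.sql.visitors.VisitableType` no longer exists in SQLAlchemy 2.0, and the values being annotated are ordinary Python classes anyway, so plain `type` suffices. A minimal check:

```python
import sqlalchemy as sa

FIELD_DTYPES_SQL: dict[str, type] = {
    "boolean": sa.Boolean,
    "date": sa.Date,
}

# SQLAlchemy type objects such as sa.Boolean are plain classes:
assert all(isinstance(v, type) for v in FIELD_DTYPES_SQL.values())
```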
```diff
@@ -42,17 +42,17 @@ def get_layer(layer, dp1_engine):
     table_name = f"{layer}_2010census_dp1"
     df = pd.read_sql(
         """
         SELECT geom_cols.f_table_name as table_name,
             geom_cols.f_geometry_column as geom_col,
             crs.auth_name as auth_name,
             crs.auth_srid as auth_srid
         FROM geometry_columns geom_cols
         INNER JOIN spatial_ref_sys crs
             ON geom_cols.srid = crs.srid
         WHERE table_name = ?
         """,
         dp1_engine,
-        params=[table_name],
+        params=(table_name,),
     )
     if len(df) != 1:
         raise AssertionError(
```

> **Review comment:** Something changed about how pandas passes query params to SQLAlchemy. Tuple is okay, list is not.
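A sketch of the constraint, against a throwaway in-memory SQLite database (table name and values are hypothetical stand-ins). When pandas 2.0 hands a raw SQL string to a SQLAlchemy 2.0 connectable, it goes through `exec_driver_sql`, which treats a list as a sequence of parameter *sets*; a single set of positional `?` parameters must be a tuple:

```python
import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")  # in-memory DB for illustration
with engine.begin() as conn:
    conn.exec_driver_sql("CREATE TABLE layers (name TEXT)")
    conn.exec_driver_sql("INSERT INTO layers VALUES ('state_2010census_dp1')")

# params=["state_2010census_dp1"] raises; a tuple is one parameter set:
df = pd.read_sql(
    "SELECT * FROM layers WHERE name = ?",
    engine,
    params=("state_2010census_dp1",),
)
```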
```diff
@@ -922,8 +922,7 @@ def _core_eia860__boiler_emissions_control_equipment_assn(
         raw_eia860__boiler_particulate,
     ]

-    bece_df = pd.DataFrame({})
-
+    dfs = []
     for table in raw_tables:
         # There are some utilities that report the same emissions control equipment.
         # Drop duplicate rows where the only difference is utility.
```
```diff
@@ -948,7 +947,8 @@ def _core_eia860__boiler_emissions_control_equipment_assn(
             var_name="emission_control_id_type",
             value_name="emission_control_id_eia",
         )
-        bece_df = bece_df.append(table)
+        dfs.append(table)
+    bece_df = pd.concat(dfs)

     # The report_year column must be report_date in order for the harvesting process
     # to work on this table. It later gets converted back to report_year.
```

> **Review comment:** I also switched to doing a single big concatenation rather than many incremental ones.
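`DataFrame.append` was removed in pandas 2.0. The list-then-concat pattern is also much faster, since each incremental append copies everything accumulated so far. A minimal sketch of the pattern:

```python
import pandas as pd

dfs = []
for i in range(3):
    dfs.append(pd.DataFrame({"x": [i]}))  # cheap: just collects references

# One concatenation at the end copies each frame exactly once:
result = pd.concat(dfs)
```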
> **Review comment:** Displays all columns. Not sure why -1 ever worked.
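The diff this comment is attached to wasn't rendered above; assuming it concerns pandas' `display.max_columns` option, a hedged sketch of the distinction:

```python
import pandas as pd

# None means "no limit", so every column is displayed; pandas 2.0 rejects -1
# because the option validator requires a nonnegative integer or None:
pd.set_option("display.max_columns", None)
```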