# Update PUDL to pandas 2.0 #2320
Changes from 52 commits
```diff
@@ -245,7 +245,7 @@ def drop_invalid_rows(self, df):
                 "`drop_invalid_rows`. Adding empty columns for: "
                 f"{missing_required_cols}"
             )
-            df.loc[:, missing_required_cols] = pd.NA
+            df.loc[:, list(missing_required_cols)] = pd.NA
         return super().drop_invalid_rows(df)
```

> **Review comment:** Can't use sets as indexers in pandas 2.0.
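A minimal sketch of the failure mode, on a toy frame with hypothetical column names. pandas 2.0 rejects sets as `.loc` indexers outright, while a list of the same labels works:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
missing_required_cols = {"b", "c"}  # hypothetical set of absent required columns

# pandas 2.0 raises: TypeError: Passing a set as an indexer is not supported.
# df.loc[:, missing_required_cols] = pd.NA

# Converting the set to a list restores the old behavior:
df.loc[:, list(missing_required_cols)] = pd.NA
```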
```diff
@@ -423,7 +423,7 @@ def get_utility_most_recent_capacity(pudl_engine) -> pd.DataFrame:
         == gen_caps["report_date"]
     )
     most_recent_gens = gen_caps.loc[most_recent_gens_idx]
-    utility_caps = most_recent_gens.groupby("utility_id_eia").sum()
+    utility_caps = most_recent_gens.groupby("utility_id_eia")["capacity_mw"].sum()
     return utility_caps
```

> **Review comment:** We only want to sum one column, not the whole dataframe.
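A sketch of why the column selection matters, on toy data. pandas 2.0 no longer silently drops columns that can't be summed (the old `numeric_only` behavior), so summing the whole frame can raise on a column like `report_date`:

```python
import pandas as pd

gen_caps = pd.DataFrame({
    "utility_id_eia": [1, 1, 2],
    "report_date": pd.to_datetime(["2021-01-01"] * 3),
    "capacity_mw": [10.0, 20.0, 5.0],
})

# gen_caps.groupby("utility_id_eia").sum() would try to sum report_date too,
# raising a TypeError in pandas 2.0; select the target column instead:
utility_caps = gen_caps.groupby("utility_id_eia")["capacity_mw"].sum()
```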
```diff
@@ -26,7 +26,7 @@
 from pandas._libs.missing import NAType

 import pudl.logging_helpers
-from pudl.metadata.fields import get_pudl_dtypes
+from pudl.metadata.fields import apply_pudl_dtypes, get_pudl_dtypes

 sum_na = partial(pd.Series.sum, skipna=False)
 """A sum function that returns NA if the Series includes any NA values.
```
```diff
@@ -364,22 +364,23 @@ def is_doi(doi):
     return bool(re.match(doi_regex, doi))


-def convert_col_to_datetime(df, date_col_name):
-    """Convert a column in a dataframe to a datetime.
+def convert_col_to_datetime(df: pd.DataFrame, date_col_name: str) -> pd.DataFrame:
+    """Convert a non-datetime column in a dataframe to a datetime64[s].

     If the column isn't a datetime, it needs to be converted to a string type
     first so that integer years are formatted correctly.

     Args:
-        df (pandas.DataFrame): Dataframe with column to convert.
-        date_col_name (string): name of the column to convert.
+        df: Dataframe with column to convert.
+        date_col_name: name of the datetime column to convert.

     Returns:
         Dataframe with the converted datetime column.
     """
-    if pd.api.types.is_datetime64_ns_dtype(df[date_col_name]) is False:
+    if not pd.api.types.is_datetime64_dtype(df[date_col_name]):
         logger.warning(
-            f"{date_col_name} is {df[date_col_name].dtype} column. Converting to datetime."
+            f"{date_col_name} is {df[date_col_name].dtype} column. "
+            "Converting to datetime64[ns]."
         )
         df[date_col_name] = pd.to_datetime(df[date_col_name].astype("string"))
     return df
```

> **Review comment:** Use a more liberal type check here, since what we're really trying to do is detect non-datetime columns (e.g. int, string) to convert.
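A sketch of the distinction between the two checks. pandas 2.0 added non-nanosecond datetime resolutions, so a genuine datetime column is no longer guaranteed to be `datetime64[ns]`, and the stricter check would wrongly flag it for conversion:

```python
import pandas as pd

s = pd.Series(["2020-01-01"]).astype("datetime64[s]")  # second resolution

pd.api.types.is_datetime64_ns_dtype(s)  # False: the unit is "s", not "ns"
pd.api.types.is_datetime64_dtype(s)     # True: any datetime64 unit passes
```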
```diff
@@ -618,17 +619,21 @@ def expand_timeseries(
             f"{fill_through_freq} is not a valid frequency to fill through."
         )
     end_dates["drop_row"] = True
-    df = pd.concat([df, end_dates.reset_index()])
     df = (
-        df.set_index(date_col)
+        pd.concat([df, end_dates.reset_index()])
+        .set_index(date_col)
         .groupby(key_cols)
         .resample(freq)
         .ffill()
         .drop(key_cols, axis=1)
         .reset_index()
     )
-    df = df[df.drop_row.isnull()].drop("drop_row", axis=1).reset_index(drop=True)
-    return df
+    return (
+        df[df.drop_row.isnull()]
+        .drop(columns="drop_row")
+        .reset_index(drop=True)
+        .pipe(apply_pudl_dtypes)
+    )


 def organize_cols(df, cols):
```
```diff
@@ -984,26 +989,19 @@ def convert_to_date(
     return df


-def fix_eia_na(df):
+def fix_eia_na(df: pd.DataFrame) -> pd.DataFrame:
     """Replace common ill-posed EIA NA spreadsheet values with np.nan.

     Currently replaces empty string, single decimal points with no numbers,
     and any single whitespace character with np.nan.

     Args:
-        df (pandas.DataFrame): The DataFrame to clean.
+        df: The DataFrame to clean.

     Returns:
-        pandas.DataFrame: The cleaned DataFrame.
+        DataFrame with regularized NA values.
     """
-    return df.replace(
-        to_replace=[
-            r"^\.$",  # Nothing but a decimal point
-            r"^\s*$",  # The empty string and entirely whitespace strings
-        ],
-        value=np.nan,
-        regex=True,
-    )
+    return df.replace(regex=r"(^\.$|^\s*$)", value=np.nan)


 def simplify_columns(df):
```

> **Review comment:** This was a regression in pandas. See pandas-dev/pandas#54399.
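A sketch of the workaround on a toy frame. Rather than passing a list of regex patterns, which hits the regression linked above, the two patterns are folded into a single alternation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [".", " ", "", "1.5"]})

# Matches a lone decimal point, the empty string, or all-whitespace strings:
cleaned = df.replace(regex=r"(^\.$|^\s*$)", value=np.nan)
# cleaned["x"] -> [NaN, NaN, NaN, "1.5"]
```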
```diff
@@ -1025,14 +1023,18 @@ def simplify_columns(df):
     Todo:
         Update docstring.
     """
-    df.columns = (
-        df.columns.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
-        .str.strip()
-        .str.lower()
-        .str.replace(r"\s+", " ", regex=True)
-        .str.replace(" ", "_")
-    )
-    return df
+    # Do nothing, if empty dataframe (e.g. mocked for tests)
+    if df.shape[0] == 0:
+        return df
+    else:
+        df.columns = (
+            df.columns.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
+            .str.strip()
+            .str.lower()
+            .str.replace(r"\s+", " ", regex=True)
+            .str.replace(" ", "_")
+        )
+        return df


 def drop_tables(engine: sa.engine.Engine, clobber: bool = False):
```
```diff
@@ -1220,11 +1222,11 @@ def generate_rolling_avg(
     # to get the backbone/complete date range/groups
     bones = (
         date_range.merge(groups)
-        .drop("tmp", axis=1)  # drop the temp column
+        .drop(columns="tmp")  # drop the temp column
         .merge(df, on=group_cols + ["report_date"])
         .set_index(group_cols + ["report_date"])
         .groupby(by=group_cols + ["report_date"])
-        .mean()
+        .mean(numeric_only=True)
     )
     # with the aggregated data, get a rolling average
     roll = bones.rolling(window=window, center=True, **kwargs).agg({data_col: "mean"})
```

> **Review comment:** Change in pandas default behavior.
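A sketch of the default-behavior change on toy data. pandas < 2.0 silently excluded non-numeric columns from aggregations like `mean()`; pandas 2.0 raises instead, unless `numeric_only=True` is passed explicitly:

```python
import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b"], "note": ["x", "y", "z"], "val": [1.0, 3.0, 5.0]})

df.groupby("g").mean(numeric_only=True)  # averages only "val"
# df.groupby("g").mean()  # pandas 2.0: TypeError because of "note"
```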
```diff
@@ -1600,7 +1602,7 @@ def convert_df_to_excel_file(df: pd.DataFrame, **kwargs) -> pd.ExcelFile:
     writer = pd.ExcelWriter(bio, engine="xlsxwriter")
     df.to_excel(writer, **kwargs)

-    writer.save()
+    writer.close()

     bio.seek(0)
     workbook = bio.read()
```

> **Review comment:** Change in API?
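Answering the question above: `ExcelWriter.save()` was deprecated in pandas 1.5 and removed in 2.0; `close()` (or the context-manager form, which calls it on exit) finalizes the workbook. A minimal sketch:

```python
import io

import pandas as pd

bio = io.BytesIO()

# Exiting the context manager closes the writer and writes the workbook:
with pd.ExcelWriter(bio, engine="xlsxwriter") as writer:
    pd.DataFrame({"a": [1]}).to_excel(writer)

bio.seek(0)
workbook = bio.read()
```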
```diff
@@ -663,7 +663,7 @@ def to_pandas_dtype(self, compact: bool = False) -> str | pd.CategoricalDtype:
             return "float32"
         return FIELD_DTYPES_PANDAS[self.type]

-    def to_sql_dtype(self) -> sa.sql.visitors.VisitableType:
+    def to_sql_dtype(self) -> type:
         """Return SQLAlchemy data type."""
         if self.constraints.enum and self.type == "string":
             return sa.Enum(*self.constraints.enum)
```

> **Review comment:** To work with SQLAlchemy 2.0.
```diff
@@ -28,7 +28,7 @@
     "year": pa.int32(),
 }

-FIELD_DTYPES_SQL: dict[str, sa.sql.visitors.VisitableType] = {
+FIELD_DTYPES_SQL: dict[str, type] = {
     "boolean": sa.Boolean,
     "date": sa.Date,
     # Ensure SQLite's string representation of datetime uses only whole seconds:
```

> **Review comment:** For compatibility with SQLAlchemy 2.0.
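For context on both annotation changes above: `sa.sql.visitors.VisitableType` no longer exists in SQLAlchemy 2.0, and the values being annotated are ordinary Python classes anyway, so plain `type` suffices. A minimal check:

```python
import sqlalchemy as sa

FIELD_DTYPES_SQL: dict[str, type] = {
    "boolean": sa.Boolean,
    "date": sa.Date,
}

# SQLAlchemy type objects such as sa.Boolean are plain classes:
assert all(isinstance(v, type) for v in FIELD_DTYPES_SQL.values())
```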
```diff
@@ -42,17 +42,17 @@ def get_layer(layer, dp1_engine):
     table_name = f"{layer}_2010census_dp1"
     df = pd.read_sql(
         """
         SELECT geom_cols.f_table_name as table_name,
             geom_cols.f_geometry_column as geom_col,
             crs.auth_name as auth_name,
             crs.auth_srid as auth_srid
         FROM geometry_columns geom_cols
         INNER JOIN spatial_ref_sys crs
             ON geom_cols.srid = crs.srid
         WHERE table_name = ?
         """,
         dp1_engine,
-        params=[table_name],
+        params=(table_name,),
     )
     if len(df) != 1:
         raise AssertionError(
```

> **Review comment:** Something changed about how pandas passes query params to SQLAlchemy. Tuple is okay, list is not.
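A sketch of the constraint, against a throwaway in-memory SQLite database (table name and values are hypothetical stand-ins). When pandas 2.0 hands a raw SQL string to a SQLAlchemy 2.0 connectable, it goes through `exec_driver_sql`, which treats a list as a sequence of parameter *sets*; a single set of positional `?` parameters must be a tuple:

```python
import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")  # in-memory DB for illustration
with engine.begin() as conn:
    conn.exec_driver_sql("CREATE TABLE layers (name TEXT)")
    conn.exec_driver_sql("INSERT INTO layers VALUES ('state_2010census_dp1')")

# params=["state_2010census_dp1"] raises; a tuple is one parameter set:
df = pd.read_sql(
    "SELECT * FROM layers WHERE name = ?",
    engine,
    params=("state_2010census_dp1",),
)
```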
```diff
@@ -922,8 +922,7 @@ def _core_eia860__boiler_emissions_control_equipment_assn(
         raw_eia860__boiler_particulate,
     ]

-    bece_df = pd.DataFrame({})
-
+    dfs = []
     for table in raw_tables:
         # There are some utilities that report the same emissions control equipment.
         # Drop duplicate rows where the only difference is utility.
```
```diff
@@ -948,7 +947,8 @@ def _core_eia860__boiler_emissions_control_equipment_assn(
             var_name="emission_control_id_type",
             value_name="emission_control_id_eia",
         )
-        bece_df = bece_df.append(table)
+        dfs.append(table)
+    bece_df = pd.concat(dfs)

     # The report_year column must be report_date in order for the harvesting process
     # to work on this table. It later gets converted back to report_year.
```

> **Review comment:** I also switched to doing a single big concatenation rather than many incremental ones.
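`DataFrame.append` was removed in pandas 2.0. The list-then-concat pattern is also much faster, since each incremental append copies everything accumulated so far. A minimal sketch of the pattern:

```python
import pandas as pd

dfs = []
for i in range(3):
    dfs.append(pd.DataFrame({"x": [i]}))  # cheap: just collects references

# One concatenation at the end copies each frame exactly once:
result = pd.concat(dfs)
```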
> **Review comment:** Displays all columns. Not sure why -1 ever worked.
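The diff this comment is attached to wasn't rendered above; assuming it concerns pandas' `display.max_columns` option, a hedged sketch of the distinction:

```python
import pandas as pd

# None means "no limit", so every column is displayed; pandas 2.0 rejects -1
# because the option validator requires a nonnegative integer or None:
pd.set_option("display.max_columns", None)
```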