Commit
Work around typing issue in examples and providers (#35494)
* Work around typing issue in workday examples

Since this is just an example, we ignore the error rather than
restructure the code (see the sketch after the workday.py hunk below).

* Rewrite Pandas column-renaming

The assignment syntax does not appear to be type-safe (although it works
at runtime), so let's use the more semantic rename() method instead (see
the sketch after the presto hook hunk below).

* Fix common.sql arg type to Pandas

Some of the parameters were too lax for Pandas and need to be tightened
up.

* More column rename fixes

* Work around strict Numpy typing

This seems to be a definition mismatch between Numpy and Pandas. Not
sure, but since the code works at runtime, let's not dig too deep:
simply annotating the variable as Any is enough to work around the
problem (see the sketch after the sql_to_s3 hunk below).

* Work around complex Pandas typing

This is just too much to type precisely. Let's cast and not worry about
it (also covered in the sql_to_s3 sketch below).

* More parameter type fixes
uranusjr committed Nov 7, 2023
1 parent 865b3a5 commit 11bdfe4
Showing 10 changed files with 36 additions and 22 deletions.
airflow/example_dags/plugins/workday.py (2 changes: 1 addition & 1 deletion)

@@ -38,7 +38,7 @@
     holiday_calendar = USFederalHolidayCalendar()
 except ImportError:
     log.warning("Could not import pandas. Holidays will not be considered.")
-    holiday_calendar = None
+    holiday_calendar = None  # type: ignore[assignment]
 
 
 class AfterWorkdayTimetable(Timetable):
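The ignore is needed because mypy pins a module-level variable to the type inferred from its first assignment, so the None fallback in the except branch no longer type-checks even though it is fine at runtime. A minimal standalone sketch of the pattern (illustrative, not the Airflow module itself):

# Sketch: mypy infers the variable's type from the first assignment.
try:
    from pandas.tseries.holiday import USFederalHolidayCalendar

    holiday_calendar = USFederalHolidayCalendar()
except ImportError:
    # Without the ignore, mypy reports roughly:
    #   Incompatible types in assignment (expression has type "None",
    #   variable has type "USFederalHolidayCalendar")
    holiday_calendar = None  # type: ignore[assignment]

# Callers still treat the variable as optional at runtime:
if holiday_calendar is not None:
    print(holiday_calendar.holidays(start="2023-01-01", end="2023-12-31"))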
airflow/providers/amazon/aws/transfers/sql_to_s3.py (16 changes: 9 additions & 7 deletions)

@@ -20,7 +20,7 @@
 import enum
 from collections import namedtuple
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Iterable, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence, cast
 
 from typing_extensions import Literal

@@ -105,7 +105,7 @@ def __init__(
         s3_key: str,
         sql_conn_id: str,
         sql_hook_params: dict | None = None,
-        parameters: None | Mapping | Iterable = None,
+        parameters: None | Mapping[str, Any] | list | tuple = None,
         replace: bool = False,
         aws_conn_id: str = "aws_default",
         verify: bool | str | None = None,

@@ -158,7 +158,7 @@ def _fix_dtypes(df: pd.DataFrame, file_format: FILE_FORMAT) -> None:
 
             if "float" in df[col].dtype.name and df[col].hasnans:
                 # inspect values to determine if dtype of non-null values is int or float
-                notna_series = df[col].dropna().values
+                notna_series: Any = df[col].dropna().values
                 if np.equal(notna_series, notna_series.astype(int)).all():
                     # set to dtype that retains integers and supports NaNs
                     # The type ignore can be removed here if https://github.com/numpy/numpy/pull/23690

@@ -196,10 +196,12 @@ def _partition_dataframe(self, df: pd.DataFrame) -> Iterable[tuple[str, pd.DataF
         """Partition dataframe using pandas groupby() method."""
         if not self.groupby_kwargs:
             yield "", df
-        else:
-            grouped_df = df.groupby(**self.groupby_kwargs)
-            for group_label in grouped_df.groups:
-                yield group_label, grouped_df.get_group(group_label).reset_index(drop=True)
+            return
+        for group_label in (grouped_df := df.groupby(**self.groupby_kwargs)).groups:
+            yield (
+                cast(str, group_label),
+                cast("pd.DataFrame", grouped_df.get_group(group_label).reset_index(drop=True)),
+            )
 
     def _get_hook(self) -> DbApiHook:
         self.log.debug("Get connection for %s", self.sql_conn_id)
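For reference, a hedged standalone sketch of the two workarounds this file now uses, the Any annotation and the runtime no-op cast(); the DataFrame contents are invented for illustration:

from typing import Any, cast

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, None], "g": ["a", "a", "b"]})

# Workaround 1: annotate as Any so strict numpy stubs stop complaining
# about operations on the extracted ndarray.
notna_series: Any = df["x"].dropna().values
print(np.equal(notna_series, notna_series.astype(int)).all())  # True

# Workaround 2: cast() where pandas' groupby overloads are too complex
# to type precisely; cast() has no effect at runtime.
for group_label in (grouped_df := df.groupby("g")).groups:
    part = cast(pd.DataFrame, grouped_df.get_group(group_label).reset_index(drop=True))
    print(group_label, len(part))  # a 2, then b 1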
airflow/providers/common/sql/hooks/sql.py (21 changes: 17 additions & 4 deletions)

@@ -22,6 +22,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
+    Generator,
     Iterable,
     Mapping,
     Protocol,

@@ -41,6 +42,8 @@
 from airflow.version import version
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
+
     from airflow.providers.openlineage.extractors import OperatorLineage
     from airflow.providers.openlineage.sqlparser import DatabaseInfo
 

@@ -198,7 +201,12 @@ def get_sqlalchemy_engine(self, engine_kwargs=None):
             engine_kwargs = {}
         return create_engine(self.get_uri(), **engine_kwargs)
 
-    def get_pandas_df(self, sql, parameters: Iterable | Mapping[str, Any] | None = None, **kwargs):
+    def get_pandas_df(
+        self,
+        sql,
+        parameters: list | tuple | Mapping[str, Any] | None = None,
+        **kwargs,
+    ) -> DataFrame:
         """
         Execute the sql and returns a pandas dataframe.

@@ -218,14 +226,19 @@ def get_pandas_df(self, sql, parameters: Iterable | Mapping[str, Any] | None = N
         return psql.read_sql(sql, con=conn, params=parameters, **kwargs)
 
     def get_pandas_df_by_chunks(
-        self, sql, parameters: Iterable | Mapping[str, Any] | None = None, *, chunksize: int | None, **kwargs
-    ):
+        self,
+        sql,
+        parameters: list | tuple | Mapping[str, Any] | None = None,
+        *,
+        chunksize: int,
+        **kwargs,
+    ) -> Generator[DataFrame, None, None]:
         """
         Execute the sql and return a generator.
 
         :param sql: the sql statement to be executed (str) or a list of sql statements to execute
         :param parameters: The parameters to render the SQL query with
         :param chunksize: number of rows to include in each chunk
         :param kwargs: (optional) passed into pandas.io.sql.read_sql method
         """
         try:
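A minimal usage sketch of the generator contract the new annotations describe, using an in-memory SQLite database rather than an Airflow connection (the helper name frames is made up):

import sqlite3
from typing import Generator

import pandas as pd

def frames(con: sqlite3.Connection, sql: str, chunksize: int) -> Generator[pd.DataFrame, None, None]:
    # pandas returns an iterator of DataFrames when chunksize is passed,
    # which is why chunksize is now required (int, not int | None).
    yield from pd.read_sql(sql, con=con, chunksize=chunksize)

con = sqlite3.connect(":memory:")
pd.DataFrame({"a": range(10)}).to_sql("t", con, index=False)
for chunk in frames(con, "SELECT a FROM t", chunksize=4):
    print(len(chunk))  # 4, 4, 2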
airflow/providers/presto/hooks/presto.py (2 changes: 1 addition & 1 deletion)

@@ -171,7 +171,7 @@ def get_pandas_df(self, sql: str = "", parameters=None, **kwargs):
         column_descriptions = cursor.description
         if data:
             df = pd.DataFrame(data, **kwargs)
-            df.columns = [c[0] for c in column_descriptions]
+            df.rename(columns={n: c[0] for n, c in zip(df.columns, column_descriptions)}, inplace=True)
         else:
             df = pd.DataFrame(**kwargs)
         return df
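Both renaming styles behave the same at runtime; only the attribute assignment is flagged by type checkers under pandas-stubs. A small sketch with stand-in cursor metadata (the column_descriptions value is invented):

import pandas as pd

# Stand-in for cursor.description: one tuple per column, name first.
column_descriptions = [("id", None), ("name", None)]
df = pd.DataFrame([(1, "a"), (2, "b")])

# df.columns = [c[0] for c in column_descriptions]  # works, but flagged by mypy
df.rename(columns={n: c[0] for n, c in zip(df.columns, column_descriptions)}, inplace=True)
print(list(df.columns))  # ['id', 'name']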
airflow/providers/salesforce/hooks/salesforce.py (3 changes: 1 addition & 2 deletions)

@@ -367,8 +367,7 @@ def object_to_df(
         # that's because None/np.nan cannot exist in an integer column
         # we should write all of our timestamps as FLOATS in our final schema
         df = pd.DataFrame.from_records(query_results, exclude=["attributes"])
-
-        df.columns = [column.lower() for column in df.columns]
+        df.rename(columns=str.lower, inplace=True)
 
         # convert columns with datetime strings to datetimes
         # not all strings will be datetimes, so we ignore any errors that occur
airflow/providers/slack/transfers/base_sql_to_slack.py (4 changes: 2 additions & 2 deletions)

@@ -16,7 +16,7 @@
 # under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Iterable, Mapping
+from typing import TYPE_CHECKING, Any, Mapping
 
 from airflow.exceptions import AirflowException
 from airflow.hooks.base import BaseHook

@@ -50,7 +50,7 @@ def __init__(
         sql: str,
         sql_conn_id: str,
         sql_hook_params: dict | None = None,
-        parameters: Iterable | Mapping[str, Any] | None = None,
+        parameters: list | tuple | Mapping[str, Any] | None = None,
         slack_proxy: str | None = None,
         slack_timeout: int | None = None,
         slack_retry_handlers: list[RetryHandler] | None = None,
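The tightening matters because these parameters are ultimately forwarded to pandas, whose read_sql accepts a list, tuple, or dict but not an arbitrary Iterable such as a generator. A short sketch of the accepted shapes, assuming SQLite's qmark placeholders:

import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (a INTEGER)")
con.executemany("INSERT INTO t VALUES (?)", [(1,), (2,), (3,)])

# A tuple (or list/dict) satisfies both the new hint and pandas' signature.
df = pd.read_sql("SELECT a FROM t WHERE a > ?", con=con, params=(1,))
print(len(df))  # 2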
airflow/providers/slack/transfers/sql_to_slack.py (4 changes: 2 additions & 2 deletions)

@@ -18,7 +18,7 @@
 
 import warnings
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Any, Iterable, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Mapping, Sequence
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.slack.hooks.slack import SlackHook

@@ -74,7 +74,7 @@ def __init__(
         sql: str,
         sql_conn_id: str,
         sql_hook_params: dict | None = None,
-        parameters: Iterable | Mapping[str, Any] | None = None,
+        parameters: list | tuple | Mapping[str, Any] | None = None,
         slack_conn_id: str = SlackHook.default_conn_name,
         slack_filename: str,
         slack_channels: str | Sequence[str] | None = None,
airflow/providers/slack/transfers/sql_to_slack_webhook.py (2 changes: 1 addition & 1 deletion)

@@ -85,7 +85,7 @@ def __init__(
         slack_channel: str | None = None,
         slack_message: str,
         results_df_name: str = "results_df",
-        parameters: Iterable | Mapping[str, Any] | None = None,
+        parameters: list | tuple | Mapping[str, Any] | None = None,
         **kwargs,
     ) -> None:
         if slack_conn_id := kwargs.pop("slack_conn_id", None):
airflow/providers/trino/hooks/trino.py (2 changes: 1 addition & 1 deletion)

@@ -189,7 +189,7 @@ def get_pandas_df(self, sql: str = "", parameters: Iterable | Mapping[str, Any]
         column_descriptions = cursor.description
         if data:
             df = pd.DataFrame(data, **kwargs)
-            df.columns = [c[0] for c in column_descriptions]
+            df.rename(columns={n: c[0] for n, c in zip(df.columns, column_descriptions)}, inplace=True)
         else:
             df = pd.DataFrame(**kwargs)
         return df
tests/plugins/workday.py (2 changes: 1 addition & 1 deletion)

@@ -34,7 +34,7 @@
     holiday_calendar = USFederalHolidayCalendar()
 except ImportError:
     log.warning("Could not import pandas. Holidays will not be considered.")
-    holiday_calendar = None
+    holiday_calendar = None  # type: ignore[assignment]
 
 
 class AfterWorkdayTimetable(Timetable):
