# What we have and what we can do

## Intro

In [2]:
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Example dataset.
# dtype=object keeps every cell's original Python type. Without it NumPy
# promotes the mixed float/str rows to a fixed-width string array, so 1.0
# becomes '1.0' and np.nan becomes the literal text 'nan' before the imputer
# ever sees the data.
X = np.array([
    [1., 'red', 12.5],
    [2., 'blue', 15.3],
    [np.nan, 'green', 14.8]
], dtype=object)
y = [0, 1, 0]


# Column positions in X, grouped by how they are preprocessed.
NUMERICAL_FEATURES = [0, 2]
CATEGORICAL_FEATURES = [1]


# Numeric columns: fill missing values with the column mean, then standardize.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Categorical columns: one-hot encode.
categorical_pipeline = make_pipeline(
    OneHotEncoder()
)

# Route each group of columns to its own sub-pipeline.
column_transformer = make_column_transformer(
    (numerical_pipeline, NUMERICAL_FEATURES),
    (categorical_pipeline, CATEGORICAL_FEATURES)
)

# Full model: preprocessing followed by the classifier.
pipeline = make_pipeline(
    column_transformer,
    LogisticRegression()
)

pipeline.fit(X, y)

predicted = pipeline.predict(X)

The code above makes sense. Can we use the same pattern to refactor this?

https://gitlab.com/v.gruzauskas/inostart-model-training/-/merge_requests/1/diffs

## Custom transformer

In [4]:
# Small demo frame: five consecutive days, each paired with an increasing value.
df = pd.DataFrame(
    {
        "Date": pd.date_range("2021-01-01", periods=5, freq="D"),
        "Value": list(range(10, 60, 10)),
    }
)
df

Unnamed: 0,Date,Value
0,2021-01-01,10
1,2021-01-02,20
2,2021-01-03,30
3,2021-01-04,40
4,2021-01-05,50


In [6]:
class DateFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Transformer that swaps a 'Date' column for 'Year', 'Month' and 'Day' columns."""

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X) -> pd.DataFrame:
        """Return a copy of ``X`` with 'Date' replaced by its year, month and day.

        Raises:
            ValueError: if the input has no 'Date' column.
        """
        frame = pd.DataFrame(X.copy())

        if 'Date' not in frame:
            raise ValueError("DataFrame does not contain a 'Date' column.")

        # Parse once, then read the calendar parts through the .dt accessor.
        parts = pd.to_datetime(frame['Date']).dt
        return frame.drop(columns="Date").assign(
            Year=parts.year,
            Month=parts.month,
            Day=parts.day,
        )


date_features_extractor = DateFeaturesExtractor()
transformed_df = date_features_extractor.transform(df)
transformed_df

Unnamed: 0,Value,Year,Month,Day
0,10,2021,1,1
1,20,2021,1,2
2,30,2021,1,3
3,40,2021,1,4
4,50,2021,1,5


## Functional transformer

There is a cleaner way to create transformers!

In [7]:
def extract_date_features(X: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame where 'Date' is replaced by 'Year', 'Month' and 'Day'.

    The input frame is left untouched; all other columns are carried over as-is.

    Raises:
        ValueError: if ``X`` has no 'Date' column.
    """
    if 'Date' not in X:
        raise ValueError("DataFrame does not contain a 'Date' column.")

    # Parse once, then read the calendar parts through the .dt accessor.
    parts = pd.to_datetime(X['Date']).dt
    return X.drop(columns="Date").assign(
        Year=parts.year,
        Month=parts.month,
        Day=parts.day,
    )

# `feature_names_out` must be a callable, "one-to-one" or None; a plain list of
# names is not a supported value. The callable mirrors what
# `extract_date_features` does to the columns: drop "Date", keep every other
# input column, then append the three derived columns.
date_features_extractor = FunctionTransformer(
    extract_date_features,
    feature_names_out=lambda self, input_features: [x for x in input_features.tolist() if x != "Date"]
    + ["Year", "Month", "Day"],
)
# fit_transform instead of a bare transform: fitting records the input column
# names, which the callable above receives as `input_features`.
date_features_extractor.fit_transform(df)


Unnamed: 0,Value,Year,Month,Day
0,10,2021,1,1
1,20,2021,1,2
2,30,2021,1,3
3,40,2021,1,4
4,50,2021,1,5


## Ugly transformer

Turns out the cleaner way can become ugly...

In [None]:
# ruff: noqa: F821
# type: ignore
# Excerpt of a single pipeline step; the helper, settings and data frames it
# references are not defined in the cells shown here.
# NOTE(review): the trailing comma after the closing parenthesis makes this
# expression a one-element tuple when the cell is evaluated on its own.
FunctionTransformer(
    ensure_cluster_id_is_available,  
    # Extra keyword arguments forwarded to the wrapped function on every call.
    kw_args={
        "gmaps_client": googlemaps.Client(key=settings.GOOGLE_MAPS_API_KEY),
        "cluster_to_region": cluster_to_region,
        "address_df_transaction": address_df_transaction,
    },
    # Output names: every input name except "Address", "lat" and "lng",
    # followed by "Region".
    feature_names_out=lambda self, input_features: [
        x for x in input_features.tolist() if x not in ["Address", "lat", "lng"]
    ]
    + ["Region"],
),



In [None]:
# type: ignore
# Excerpt of a single pipeline step; `split_date_to_year_quarter_month` is not
# defined in the cells shown here.
# NOTE(review): the trailing comma after the closing parenthesis makes this
# expression a one-element tuple when the cell is evaluated on its own.
FunctionTransformer(
    split_date_to_year_quarter_month,
    # Output names: every input name except "Date", followed by "Year",
    # "Quarter" and "Month".
    feature_names_out=lambda self, input_features: [x for x in input_features.tolist() if x != "Date"]
    + ["Year", "Quarter", "Month"],
),


## However, the pipeline is quite dense now.

Here is how most of it looks.

In [None]:

@timeit
def indicators_pipeline(cluster_to_region, indicator_df, address_df_transaction) -> Pipeline:
    """Build (without fitting) a five-step preprocessing ``Pipeline``.

    The steps, in order, are named "selector", "ensure_cluster_id_is_available",
    "region_name_shortener", "date" and "indicators". All but the first wrap a
    project helper in a ``FunctionTransformer``; what those helpers do is not
    visible in this cell.

    Args:
        cluster_to_region: forwarded unchanged as a keyword argument to
            ``ensure_cluster_id_is_available``.
        indicator_df: long-format indicator table. It is pivoted here using the
            columns "Region", "Year", "Quarter", "Month", "Indicator" and
            "Value".
        address_df_transaction: forwarded unchanged as a keyword argument to
            ``ensure_cluster_id_is_available``.

    Returns:
        The assembled ``Pipeline``; ``fit`` is not called here.
    """
    # One row per (Region, Year, Quarter, Month), one column per Indicator,
    # each cell the mean of "Value". Gaps are filled backward first, then forward.
    indicators_df_pivoted = (
        pd.pivot_table(
            indicator_df,
            index=["Region", "Year", "Quarter", "Month"],
            columns="Indicator",
            values="Value",
            aggfunc="mean",
        )
        .bfill()
        .ffill()
    )
    pipeline = Pipeline(
        [
            ("selector", ColumnSelector(settings.COLUMN_SELECTOR_FEATURES)),
            (
                "ensure_cluster_id_is_available",
                FunctionTransformer(
                    ensure_cluster_id_is_available,
                    # The Google Maps client is constructed when this function
                    # runs, i.e. when the pipeline is built.
                    kw_args={
                        "gmaps_client": googlemaps.Client(key=settings.GOOGLE_MAPS_API_KEY),
                        "cluster_to_region": cluster_to_region,
                        "address_df_transaction": address_df_transaction,
                    },
                    # Output names: inputs minus "Address", "lat", "lng", plus "Region".
                    feature_names_out=lambda self, input_features: [
                        x for x in input_features.tolist() if x not in ["Address", "lat", "lng"]
                    ]
                    + ["Region"],
                ),
            ),
            (
                "region_name_shortener",
                FunctionTransformer(
                    func=shorten_region_names,
                    kw_args={"pattern": re.compile("(ajono )|(ivaldybė)")},
                    # Output names are the input names, unchanged.
                    feature_names_out="one-to-one",
                ),
            ),
            (
                "date",
                FunctionTransformer(
                    split_date_to_year_quarter_month,
                    # Output names: inputs minus "Date", plus "Year", "Quarter", "Month".
                    feature_names_out=lambda self, input_features: [x for x in input_features.tolist() if x != "Date"]
                    + ["Year", "Quarter", "Month"],
                ),
            ),
            (
                "indicators",
                FunctionTransformer(
                    merge_indicators,
                    kw_args={"indicator_pivoted": indicators_df_pivoted},
                    # Output names: inputs minus "Region", "Year", "TransID",
                    # plus one name per pivoted indicator column.
                    # NOTE(review): "Quarter" and "Month" are part of the pivot
                    # index but are not dropped here — confirm this matches
                    # what merge_indicators returns.
                    feature_names_out=lambda self, input_features: [
                        x for x in input_features.tolist() if x not in ["Region", "Year", "TransID"]
                    ]
                    + indicators_df_pivoted.columns.tolist(),
                ),
            ),
        ],
    )

    return pipeline


## Links

https://gitlab.com/aidiss/va-model/-/blob/main/src/va_model/preprocessing/pipelines/indicators_pipe.py?ref_type=heads
