In [None]:
#| default_exp transformer

In [None]:
#| export
from fastcore.all import *
import pandas as pd
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
#| export
class DataFrameTransformer(TransformerMixin, BaseEstimator):
    """Apply `transformer` to `input_cols` of a pandas DataFrame and return the result as a DataFrame.

    Parameters
    ----------
    transformer : object providing `fit`, `transform` and `fit_transform`.
    input_cols : key used as `X[input_cols]` to select the data handed to `transformer`
        (a list selects a 2-D DataFrame, a single label selects a 1-D Series). When None and
        `prev_step` is given, it is read from `prev_step.output_cols` at fit time.
    output_cols : column labels of the output DataFrame. When None they are read from
        `transformer.get_feature_names_out()` after fitting, if the transformer provides it.
    prev_step : optional object whose `output_cols` attribute supplies `input_cols`.
    append : if True, the transformed columns are concatenated to the right of `X`
        instead of being returned on their own.
    print_input_cols, print_output_cols, print_out_df_cols : debugging switches that print the
        resolved input columns, the output columns and the final DataFrame columns respectively.
    """

    def __init__(self, transformer=None, input_cols=None, output_cols=None, prev_step=None, append=False,
                 print_input_cols=False, print_output_cols=False, print_out_df_cols=False):
        store_attr()

    def before_fit(self):
        """Resolve `input_cols` from `prev_step` when they were not given explicitly."""
        # Columns count as "inferred" when still unset or when they are the very object stored
        # by a previous fit; only then are they refreshed, so explicit user values are never
        # overwritten while a refit still picks up the previous step's new output columns.
        infer = self.input_cols is None or self.input_cols is getattr(self, '_inferred_input_cols', None)
        if self.prev_step is not None and infer and hasattr(self.prev_step, 'output_cols'):
            self.input_cols = self._inferred_input_cols = self.prev_step.output_cols
        if self.print_input_cols: print(L(self.input_cols, use_list=not isinstance(self.input_cols, str)))

    def after_fit(self):
        """Resolve `output_cols` from the fitted transformer when they were not given explicitly."""
        # Same identity rule as in `before_fit`: names inferred by an earlier fit are recomputed,
        # otherwise refitting on data with different features would keep stale column labels.
        infer = self.output_cols is None or self.output_cols is getattr(self, '_inferred_output_cols', None)
        if infer and hasattr(self.transformer, 'get_feature_names_out'):
            self.output_cols = self._inferred_output_cols = self.transformer.get_feature_names_out()
        if self.print_output_cols: print(L(self.output_cols, use_list=not isinstance(self.output_cols, str)))

    def after_transform(self, out, X):
        """Wrap the raw transformer output `out` in a DataFrame that shares the index of `X`."""
        # `toarray` yields a plain ndarray, whereas `todense` returns the discouraged np.matrix.
        if issparse(out): out = out.toarray()
        out_df = pd.DataFrame(out, columns=self.output_cols, index=X.index)
        if self.append: out_df = pd.concat([X, out_df], axis=1)
        if self.print_out_df_cols: print(L(out_df.columns, use_list=True))
        return out_df

    def transform(self, X):
        """Transform `X[input_cols]` with the fitted transformer and return a DataFrame."""
        out = self.transformer.transform(X[self.input_cols])
        return self.after_transform(out, X)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on `X[input_cols]` and return `self`."""
        self.before_fit()
        self.transformer.fit(X[self.input_cols], y=y)
        self.after_fit()
        # Estimators return themselves from `fit` so that `est.fit(X).transform(X)` chains
        # through this wrapper rather than through the bare inner transformer.
        return self

    def fit_transform(self, X, y=None):
        """Fit the wrapped transformer on `X[input_cols]` and return the transformed DataFrame."""
        self.before_fit()
        out = self.transformer.fit_transform(X[self.input_cols], y=y)
        self.after_fit()
        return self.after_transform(out, X)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Toy book data, written one row per book so it reads like the rendered table.
X = pd.DataFrame(
    [('London', "His Last Bow", 5, 4),
     ('London', "How Watson Learned the Trick", 3, 5),
     ('Paris', "A Moveable Feast", 4, 4),
     ('Sallisaw', "The Grapes of Wrath", 5, 3)],
    columns=['city', 'title', 'expert_rating', 'user_rating'])
X

Unnamed: 0,city,title,expert_rating,user_rating
0,London,His Last Bow,5,4
1,London,How Watson Learned the Trick,3,5
2,Paris,A Moveable Feast,4,4
3,Sallisaw,The Grapes of Wrath,5,3


The `OneHotEncoder` expects a two-dimensional array as input, so we set `input_cols` to a list of columns. `DataFrameTransformer` uses the transformer's `get_feature_names_out()` to name the output columns when `output_cols` is not given.

In [None]:
# One-hot encode `city`; the list selector hands the encoder a 2-D frame.
enc_city = DataFrameTransformer(
    transformer=OneHotEncoder(dtype='int'), input_cols=['city'], append=True,
)
enc_city.fit_transform(X)

Unnamed: 0,city,title,expert_rating,user_rating,city_London,city_Paris,city_Sallisaw
0,London,His Last Bow,5,4,1,0,0
1,London,How Watson Learned the Trick,3,5,1,0,0
2,Paris,A Moveable Feast,4,4,0,1,0
3,Sallisaw,The Grapes of Wrath,5,3,0,0,1


`CountVectorizer` expects a one-dimensional array as input, so we set `input_cols` to a string, which retrieves a one-dimensional `Series` from the input `DataFrame`.

In [None]:
# Bag-of-words encode `title`; the string selector hands the vectorizer a 1-D Series.
enc_title = DataFrameTransformer(
    transformer=CountVectorizer(),
    input_cols='title',
    append=True,
)
enc_title.fit_transform(X)

Unnamed: 0,city,title,expert_rating,user_rating,bow,feast,grapes,his,how,last,learned,moveable,of,the,trick,watson,wrath
0,London,His Last Bow,5,4,1,0,0,1,0,1,0,0,0,0,0,0,0
1,London,How Watson Learned the Trick,3,5,0,0,0,0,1,0,1,0,0,1,1,1,0
2,Paris,A Moveable Feast,4,4,0,1,0,0,0,0,0,1,0,0,0,0,0
3,Sallisaw,The Grapes of Wrath,5,3,0,0,1,0,0,0,0,0,1,1,0,0,1


We can chain these two into one `Pipeline`.

In [None]:
# Each step appends its columns, so the second step still sees the original `title` column.
pipe = Pipeline(steps=[
    ('enc_city', enc_city),
    ('enc_title', enc_title),
])
pipe.fit_transform(X)

Unnamed: 0,city,title,expert_rating,user_rating,city_London,city_Paris,city_Sallisaw,bow,feast,grapes,his,how,last,learned,moveable,of,the,trick,watson,wrath
0,London,His Last Bow,5,4,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0
1,London,How Watson Learned the Trick,3,5,1,0,0,0,0,0,0,1,0,1,0,0,1,1,1,0
2,Paris,A Moveable Feast,4,4,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,Sallisaw,The Grapes of Wrath,5,3,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1


## Export -

In [None]:
#|hide
#|eval: false
# Calls nbdev's export on this notebook's project.
from nbdev import nbdev_export
nbdev_export()