In [1]:
import pandas as pd
from pathlib import Path
from typing import List
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from sklearn.base import BaseEstimator, TransformerMixin



In [2]:
PACKAGE_ROOT = Path().resolve().parents[0]
data = pd.read_csv(f'{PACKAGE_ROOT}/datasets/house_price_predict.csv', index_col=0)
data.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17384,1453602313,20141029T000000,2,1.5,1430,1650,3.0,0,0,3,7,1430,0,1999,0,98125,47.7222,-122.29,1430,1650
722,2225059214,20140808T000000,4,3.25,4670,51836,2.0,0,0,4,12,4670,0,1988,0,98005,47.635,-122.164,4230,41075
2680,2768000270,20140625T000000,2,0.75,1440,3700,1.0,0,0,3,7,1200,240,1914,0,98107,47.6707,-122.364,1440,4300
18754,6819100040,20140624T000000,2,1.0,1130,2640,1.0,0,0,4,8,1130,0,1927,0,98109,47.6438,-122.357,1680,3200
14554,4027700666,20150426T000000,4,2.5,3180,9603,2.0,0,2,3,9,3180,0,2002,0,98155,47.7717,-122.277,2440,15261


### Transformer

In [3]:
class DataTypeTransformer(BaseEstimator, TransformerMixin):
    """Data type transformer."""
    def __init__(self, variables: List[str], data_type: str):

        if not isinstance(variables, list):
            raise ValueError("variables should be a list")

        self.variables = variables
        self.data_type = data_type
    
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        # we need this step to fit the sklearn pipeline
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:

        # so that we do not over-write the original dataframe
        X = X.copy()

        for var in self.variables:

            if self.data_type == "categorical":
                X[var] = X[var].astype(str)
            elif self.data_type == "numerical":
                X[var] = X[var].astype(int)
            elif self.data_type == "date":
                X[var] = pd.to_datetime(X[var])
            
        return X

In [4]:
demo_pipe = Pipeline(
    [
        (
            "categorical",
            DataTypeTransformer(
                variables=['zipcode', 'waterfront', 'view', 'condition', 'grade'],
                data_type="categorical"
            )
        ),
        (
            "numerical",
            DataTypeTransformer(
                variables=['bedrooms','bathrooms','sqft_living','sqft_lot', 'floors',
                           'sqft_above','sqft_basement','sqft_living15'],
                data_type="numerical"
            )
        ),
        (
            "categorical_missing",
            CategoricalImputer(
                imputation_method="missing",
                #variables=config.data_config.categorical_variables
                variables=['zipcode', 'waterfront', 'view', 'condition', 'grade']
            )
        ),
        (
            "numerical_missing",
            MeanMedianImputer(
                imputation_method="mean",
                #variables=config.data_config.numerical_variables
                variables=['bedrooms','bathrooms','sqft_living','sqft_lot','floors',
                           'sqft_above', 'sqft_basement','sqft_living15']
            )
        )
    ]
)

In [6]:
demo_pipe.fit_transform(data)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17384,1453602313,20141029T000000,2,1,1430,1650,3,0,0,3,7,1430,0,1999,0,98125,47.7222,-122.290,1430,1650
722,2225059214,20140808T000000,4,3,4670,51836,2,0,0,4,12,4670,0,1988,0,98005,47.6350,-122.164,4230,41075
2680,2768000270,20140625T000000,2,0,1440,3700,1,0,0,3,7,1200,240,1914,0,98107,47.6707,-122.364,1440,4300
18754,6819100040,20140624T000000,2,1,1130,2640,1,0,0,4,8,1130,0,1927,0,98109,47.6438,-122.357,1680,3200
14554,4027700666,20150426T000000,4,2,3180,9603,2,0,2,3,9,3180,0,2002,0,98155,47.7717,-122.277,2440,15261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5427,3528000545,20140815T000000,4,3,3090,67518,2,0,0,3,10,3090,0,1988,0,98053,47.6674,-122.046,3200,65775
16547,526059259,20140819T000000,3,1,1260,8487,1,0,0,3,7,1260,0,1970,0,98011,47.7664,-122.201,1890,13051
4585,339600090,20140925T000000,3,2,1360,3718,2,0,0,3,7,1360,0,1987,0,98052,47.6827,-122.097,1090,3718
17762,7750500120,20141118T000000,3,1,950,4760,1,0,0,3,6,950,0,1929,0,98106,47.5236,-122.348,1080,4760


In [7]:
import pickle
filename = '../assets/pipeline.pkl'
pickle.dump(demo_pipe, open(filename, 'wb'))

### Reaproveitando o pipeline com pickle

In [8]:
pipe = pickle.load(open(filename, 'rb'))

In [9]:
pipe.fit_transform(data)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17384,1453602313,20141029T000000,2,1,1430,1650,3,0,0,3,7,1430,0,1999,0,98125,47.7222,-122.290,1430,1650
722,2225059214,20140808T000000,4,3,4670,51836,2,0,0,4,12,4670,0,1988,0,98005,47.6350,-122.164,4230,41075
2680,2768000270,20140625T000000,2,0,1440,3700,1,0,0,3,7,1200,240,1914,0,98107,47.6707,-122.364,1440,4300
18754,6819100040,20140624T000000,2,1,1130,2640,1,0,0,4,8,1130,0,1927,0,98109,47.6438,-122.357,1680,3200
14554,4027700666,20150426T000000,4,2,3180,9603,2,0,2,3,9,3180,0,2002,0,98155,47.7717,-122.277,2440,15261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5427,3528000545,20140815T000000,4,3,3090,67518,2,0,0,3,10,3090,0,1988,0,98053,47.6674,-122.046,3200,65775
16547,526059259,20140819T000000,3,1,1260,8487,1,0,0,3,7,1260,0,1970,0,98011,47.7664,-122.201,1890,13051
4585,339600090,20140925T000000,3,2,1360,3718,2,0,0,3,7,1360,0,1987,0,98052,47.6827,-122.097,1090,3718
17762,7750500120,20141118T000000,3,1,950,4760,1,0,0,3,6,950,0,1929,0,98106,47.5236,-122.348,1080,4760
