## Settings

In [12]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
%autoreload 2

In [14]:
# Shared project settings (dataset/model/preprocessor file names and versions).
import helpers.settings as sts
sts.print_settings(sts)  # print the active settings so the run documents its own configuration

[1m[91mDATASET_TRAIN_FILENAME : dataset_train.parquet
[1m[91mDATASET_VALIDATION_FILENAME : dataset_validation.parquet
[1m[91mETL_VERSION : 0.0.1
[1m[91mMODEL_FILENAME : model_0.0.1.pkl
[1m[91mMODEL_VERSION : 0.0.1
[1m[91mPREPROCESSOR_FILENAME : preprocessor_0.0.1.pkl
[1m[91mcolor : <class 'helpers.settings.color'>
[1m[91mprint_settings : <function print_settings at 0x7fd672aa8700>
[0m


## Imports

In [64]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from category_encoders.woe import WOEEncoder
import warnings
from datetime import datetime

## Load train dataset

In [16]:
# The file name comes from the settings module; the "data/" prefix is relative to the working directory.
df_train = pd.read_parquet(f"data/{sts.DATASET_TRAIN_FILENAME}")

In [17]:
# Positional split: all columns but the last go to X_train, the last column goes to y_train.
# NOTE(review): assumes the target is stored as the last column of the parquet file — confirm against the ETL step.
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

## Features

In [28]:
# Preview the first rows to see which columns are available and how they are encoded.
X_train.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21645,365243,1,0,0,0,,2.0
1,F,N,N,0,157500.0,Commercial associate,Higher education,Civil marriage,House / apartment,-10193,-105,1,1,1,0,Accountants,2.0
2,F,Y,Y,0,675000.0,Pensioner,Higher education,Married,House / apartment,-21721,365243,1,0,0,0,,2.0
3,F,Y,Y,2,112500.0,Working,Secondary / secondary special,Married,House / apartment,-9994,-644,1,1,0,0,Sales staff,4.0
4,F,N,Y,0,112500.0,Pensioner,Lower secondary,Married,House / apartment,-23754,365243,1,0,0,0,,2.0


In [29]:
# NOTE(review): not referenced by any visible cell — presumably columns to leave out of preprocessing; confirm.
excluded_features = []

In [30]:
# Features that would get their own treatment in the pipeline (none defined).
# NOTE(review): this list is not passed to the preprocessor in the visible cells — confirm that is intended.
special_features = []

In [46]:
# Columns routed through the numeric pipeline; their order fixes the order of the numeric output columns.
numeric_features = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "CNT_FAM_MEMBERS",
]

In [47]:
# Columns routed through the categorical pipeline; their order fixes the order of the encoded output columns.
categorical_features = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    # The next four flags are not stored with object dtype (the dtype check below prints False
    # for them) but are still handled as categories here.
    "FLAG_MOBIL",
    "FLAG_WORK_PHONE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
    "OCCUPATION_TYPE",
]

In [48]:
# Sanity check: print, for each declared numeric feature, whether X_train stores it with a
# non-object dtype. The column selection does not depend on the loop variable, so it is
# computed once instead of once per feature.
non_object_columns = X_train.select_dtypes(exclude=['object']).columns
for numeric_feature in numeric_features:
    print(f"{numeric_feature}: {numeric_feature in non_object_columns}")

CNT_CHILDREN: True
AMT_INCOME_TOTAL: True
DAYS_BIRTH: True
DAYS_EMPLOYED: True
CNT_FAM_MEMBERS: True


In [49]:
# Sanity check: print, for each declared categorical feature, whether X_train stores it with
# object dtype. A False only means the column has another dtype; nothing is enforced here.
# The column selection is loop-invariant, so it is computed once instead of once per feature.
object_columns = X_train.select_dtypes(include=['object']).columns
for categorical_feature in categorical_features:
    print(f"{categorical_feature}:{categorical_feature in object_columns}")

CODE_GENDER:True
FLAG_OWN_CAR:True
FLAG_OWN_REALTY:True
NAME_INCOME_TYPE:True
NAME_EDUCATION_TYPE:True
NAME_FAMILY_STATUS:True
NAME_HOUSING_TYPE:True
FLAG_MOBIL:False
FLAG_WORK_PHONE:False
FLAG_PHONE:False
FLAG_EMAIL:False
OCCUPATION_TYPE:True


## Additional definitions

In [51]:
def replace_values_in_string(text, args_dict):
    """Replace every occurrence of each key of ``args_dict`` found in ``text`` with its value.

    All keys are substituted in a single left-to-right pass and, where several keys match at
    the same position, the longest one wins. This keeps a short key such as ``"x1"`` from
    rewriting the beginning of a longer key such as ``"x10"``, and keeps text that was just
    inserted from being replaced again by another key.

    Args:
        text (str): String in which the replacements are made.
        args_dict (dict): Maps the substrings to look for (str) to their replacements; the
            replacements are converted with ``str`` before being inserted.
    Returns:
        str: ``text`` with the replacements applied. ``text`` is returned untouched when
            ``args_dict`` is empty.
    Raises:
        AttributeError: If ``text`` is not a string and ``args_dict`` is not empty.
    """
    import re  # local import so the helper does not depend on the notebook's import cell

    if not args_dict:
        return text
    if not isinstance(text, str):
        # Same failure mode as calling ``text.replace`` on a non-string object.
        # ModifiedColumnTransformer.get_all_column_names catches AttributeError to fall back
        # to the raw column names, so the exception type must not change.
        raise AttributeError(f"'{type(text).__name__}' object has no attribute 'replace'")

    # Longest keys first: regex alternation picks the first alternative that matches.
    keys_longest_first = sorted(args_dict, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys_longest_first))
    return pattern.sub(lambda match: str(args_dict[match.group(0)]), text)

In [54]:
class ModifiedColumnTransformer(ColumnTransformer):
    """ColumnTransformer that also records the names of the columns it produces.

    After fitting, ``final_features`` holds the output column names in transformer order, so
    the transformed array can be wrapped back into a labelled DataFrame.

    Args:
        transformers (list): List of transformers that are going to be set for the parent ColumnTransformer
        numeric_features (list): List of strings containing the standard numeric features contained in the
            initial dataset. ``None`` (the default) means no numeric features.
        categorical_features (list): List of strings containing the standard categorical features contained
            in the initial dataset. ``None`` (the default) means no categorical features.
        special_features (list): List of strings containing the special features contained in the
            initial dataset (could be numeric or categorical, the difference is that they get a different
            treatment than the rest in the pipeline). ``None`` (the default) means no special features.
        hard_mode (bool): Whether to restrict the input to the declared initial features during fit and
            transform or not
    Returns:
        None.
    Raises:
        None.
    """
    def __init__(self, transformers, numeric_features: list = None, categorical_features: list = None, special_features: list = None, hard_mode: bool = True):
        super().__init__(transformers=transformers)
        # ``None`` replaces the former ``[]`` defaults so that instances never share (and
        # accidentally mutate) one default list. Lists supplied by the caller are stored as-is.
        self.numeric_features = numeric_features if numeric_features is not None else []
        self.categorical_features = categorical_features if categorical_features is not None else []
        self.special_features = special_features if special_features is not None else []
        self.initial_features = self.numeric_features + self.categorical_features + self.special_features
        self.final_features = None  # filled in by fit / fit_transform
        self.hard_mode = hard_mode
        if len(self.initial_features) == 0:
            warnings.warn(f"{datetime.now()} INFO: No initial features were set, please set explicitly numeric_features, categorical_features, and/or special_features to avoid unexpected beahaviors. You can continue like this but some problems may appear when using the transformer.", stacklevel=2)
        warnings.warn(f"""{datetime.now()} INFO: Hard mode for the ModifiedColumnTransformer set to {self.hard_mode}: The initial features {'are' if self.hard_mode else 'are not'} going to be enforced during transformation and fit steps""", stacklevel=2)

    def fit(self, X, y=None, **kwargs):
        """Fit the transformers and record the resulting column names.

        Args:
            X (DataFrame): Input data; in hard mode only ``initial_features`` are used.
            y: Target passed through to the parent ``fit``.
            **kwargs: Accepted for call compatibility; not forwarded.
        Returns:
            ModifiedColumnTransformer: The fitted instance, so calls can be chained.
        """
        if self.hard_mode:
            super().fit(X[self.initial_features], y=y)
        else:
            super().fit(X, y=y)
            # Stored as a list to match the type built in __init__.
            self.initial_features = list(X.columns)
        self.final_features = ModifiedColumnTransformer.get_all_column_names(self)
        return self

    def transform(self, X, y=None):
        """Transform ``X``; in hard mode only ``initial_features`` are passed to the parent."""
        if self.hard_mode:
            return super().transform(X[self.initial_features])
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed data, recording the resulting column names.

        Args:
            X (DataFrame): Input data; in hard mode only ``initial_features`` are used.
            y: Target passed through to the parent ``fit_transform``. Optional, as in ``fit``.
        Returns:
            The output of the parent ``fit_transform``.
        """
        if self.hard_mode:
            result = super().fit_transform(X[self.initial_features], y=y)
        else:
            result = super().fit_transform(X, y=y)
            # Stored as a list to match the type built in __init__.
            self.initial_features = list(X.columns)
        self.final_features = ModifiedColumnTransformer.get_all_column_names(self)
        return result

    @staticmethod
    def get_all_column_names(column_transformer) -> list:
        """Extracts the name of the resulting columns of a ColumnTransformer after all the transformations
        Args:
            column_transformer (ColumnTransformer): ColumnTransformer fitted instance from which to extract
                the column names
        Returns:
            col_name (list): List containing the column names based on the order of the ColumnTransformer
                transformers
        Raises:
            None.
        """
        col_name = []
        for fitted_entry in column_transformer.transformers_:
            fitted, columns = fitted_entry[1], fitted_entry[2]
            if isinstance(fitted, str) and fitted == "drop":
                # Dropped columns (including the automatically added remainder entry) produce
                # no output, so they must not contribute any name.
                continue
            # For a pipeline, the last step decides the names of the output columns.
            transformer = fitted.steps[-1][1] if isinstance(fitted, Pipeline) else fitted
            try:
                # NOTE(review): newer scikit-learn releases expose get_feature_names_out instead
                # of get_feature_names — confirm against the pinned version.
                names = transformer.get_feature_names()
                # Generated names use positional prefixes (x0, x1, ...); map them back to the
                # real column names. This stays inside the try block on purpose: a reported
                # name without a ``replace`` method raises AttributeError and takes the same
                # fallback as a transformer without get_feature_names.
                category_dict = {f"x{position}": category for position, category in enumerate(columns)}
                names = [replace_values_in_string(name, category_dict) for name in names]
            except AttributeError:  # fall back to the raw column names
                names = columns
            # Column selectors of any other type contribute no names.
            if isinstance(names, np.ndarray):
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name

## Pipeline

In [68]:
# Numeric columns go through a constant imputer whose fill value is NaN, i.e. NaN is
# replaced with NaN. NOTE(review): presumably a placeholder step — confirm.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan)),
    ]
)

# Categorical columns go through the same imputer configuration and are then encoded
# with a WOEEncoder using its default settings.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan)),
        ("encoder", WOEEncoder()),
    ]
)

# The transformer order (categorical first, numeric second) determines the order of the
# output columns.
preprocessor = ModifiedColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("numeric", numeric_transformer, numeric_features),
    ],
    categorical_features=categorical_features,
    numeric_features=numeric_features,
)

  preprocessor = ModifiedColumnTransformer(


In [71]:
# Fit first: final_features is only populated once fit_transform has run.
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
pd.DataFrame(X_train_preprocessed, columns=preprocessor.final_features)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,CNT_FAM_MEMBERS
0,0.043373,-0.017715,0.248458,0.363272,0.026367,-0.013086,-0.041703,-0.002433,0.112936,-0.033699,-0.057213,0.000000,0.0,135000.0,-21645.0,365243.0,2.0
1,0.043373,-0.017715,-0.841560,-0.073486,-0.215596,-0.439822,-0.041703,-0.002433,-0.528348,0.076444,-0.057213,0.606423,0.0,157500.0,-10193.0,-105.0,2.0
2,0.043373,0.028455,0.248458,0.363272,-0.215596,-0.013086,-0.041703,-0.002433,0.112936,-0.033699,-0.057213,0.000000,0.0,675000.0,-21721.0,365243.0,2.0
3,0.043373,0.028455,0.248458,-0.027511,0.026367,-0.013086,-0.041703,-0.002433,-0.528348,-0.033699,-0.057213,-0.193711,2.0,112500.0,-9994.0,-644.0,4.0
4,0.043373,-0.017715,0.248458,0.363272,-0.341495,-0.013086,-0.041703,-0.002433,0.112936,-0.033699,-0.057213,0.000000,0.0,112500.0,-23754.0,365243.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30983,0.043373,-0.017715,0.248458,0.363272,0.026367,0.174109,-0.041703,-0.002433,0.112936,0.076444,0.447095,0.000000,0.0,112500.0,-23400.0,365243.0,1.0
30984,0.043373,-0.017715,0.248458,-0.073486,0.026367,-0.013086,-0.041703,-0.002433,0.112936,-0.033699,-0.057213,-0.431977,1.0,135000.0,-15532.0,-8256.0,3.0
30985,0.043373,-0.017715,0.248458,-0.027511,0.026367,-0.013086,-0.041703,-0.002433,-0.528348,0.076444,-0.057213,-0.305770,0.0,76500.0,-17782.0,-3291.0,2.0
30986,0.043373,-0.017715,0.248458,0.363272,0.026367,-0.439822,-0.041703,-0.002433,0.112936,0.076444,-0.057213,0.000000,0.0,157500.0,-21635.0,365243.0,2.0


## Dump preprocessor