## Settings

In [1]:
# Load IPython's autoreload extension so the %autoreload magic is available in this kernel.
%load_ext autoreload

In [2]:
# Mode 2: re-import modules before executing each cell, so edits to helper modules are picked up
# without restarting the kernel.
%autoreload 2

In [3]:
# Project settings module; the filenames and version strings used below are read from it as `sts.<NAME>`.
import helpers.settings as sts
# Echo the settings so the values in effect for this run are recorded in the notebook output.
sts.print_settings(sts)

[1m[91mBEST_ESTIMATOR_FILENAME : best_estimator_0.0.1.pkl
[1m[91mDATASET_TRAIN_FILENAME : dataset_train.parquet
[1m[91mDATASET_VALIDATION_FILENAME : dataset_validation.parquet
[1m[91mETL_VERSION : 0.0.1
[1m[91mMODEL_FILENAME : model.pkl
[1m[91mMODEL_VERSION : 0.0.1
[1m[91mPREPROCESSOR_FILENAME : preprocessor_0.0.1.pkl
[1m[91mTRAINED_BEST_ESTIMATOR_FILENAME : trained_best_estimator_0.0.1.pkl
[1m[91mcolor : <class 'helpers.settings.color'>
[1m[91mprint_settings : <function print_settings at 0x7fd9583aec10>
[0m


## Imports

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')  
import numpy as np
from category_encoders.woe import WOEEncoder
import warnings
from datetime import datetime
import pickle
from helpers.processing import ModifiedColumnTransformer

## Load train dataset

In [5]:
# Training data is stored as parquet under data/; the filename comes from the settings module.
train_dataset_path = f"data/{sts.DATASET_TRAIN_FILENAME}"
df_train = pd.read_parquet(train_dataset_path)

In [6]:
# Positional split: every column except the last goes to X_train, the last column becomes y_train.
# NOTE(review): this relies on column order in the parquet file — confirm the target is always last.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

## Features

In [7]:
# Preview the first rows of the feature frame to see which columns are available.
X_train.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21645,365243,1,0,0,0,,2.0
1,F,N,N,0,157500.0,Commercial associate,Higher education,Civil marriage,House / apartment,-10193,-105,1,1,1,0,Accountants,2.0
2,F,Y,Y,0,675000.0,Pensioner,Higher education,Married,House / apartment,-21721,365243,1,0,0,0,,2.0
3,F,Y,Y,2,112500.0,Working,Secondary / secondary special,Married,House / apartment,-9994,-644,1,1,0,0,Sales staff,4.0
4,F,N,Y,0,112500.0,Pensioner,Lower secondary,Married,House / apartment,-23754,365243,1,0,0,0,,2.0


In [8]:
# Currently empty.
# NOTE(review): not referenced by any other visible cell — presumably a slot for columns to drop; confirm.
excluded_features = []

In [9]:
# Currently empty.
# NOTE(review): not referenced by any other visible cell — intended meaning of "special" is not shown here.
special_features = []

In [10]:
# Columns handed to the numeric branch of the preprocessor.
# Order is kept as-is because it determines the column order passed to the transformer.
numeric_features = [
    "CNT_CHILDREN", "AMT_INCOME_TOTAL",
    "DAYS_BIRTH", "DAYS_EMPLOYED",
    "CNT_FAM_MEMBERS",
]

In [11]:
# Columns handed to the categorical branch of the preprocessor (order preserved).
# NOTE(review): OCCUPATION_TYPE shows up in X_train.head() but is in neither feature list nor
# in excluded_features — confirm it is meant to be left out.
categorical_features = [
    # String-valued columns.
    "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
    # 0/1 flags in the preview; these are not object dtype, so the object-dtype check on
    # categorical features reports False for them.
    "FLAG_MOBIL", "FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL",
]

In [12]:
# Sanity check: report, per declared numeric feature, whether its column has a non-object dtype.
non_object_columns = X_train.select_dtypes(exclude=['object']).columns
for numeric_feature in numeric_features:
    has_non_object_dtype = numeric_feature in non_object_columns
    print(f"{numeric_feature}: {has_non_object_dtype}")

CNT_CHILDREN: True
AMT_INCOME_TOTAL: True
DAYS_BIRTH: True
DAYS_EMPLOYED: True
CNT_FAM_MEMBERS: True


In [13]:
# Sanity check: report, per declared categorical feature, whether its column has object dtype.
# A False here means the column is stored with a non-object dtype; this cell only reports and does not fail.
object_columns = X_train.select_dtypes(include=['object']).columns
for categorical_feature in categorical_features:
    has_object_dtype = categorical_feature in object_columns
    print(f"{categorical_feature}:{has_object_dtype}")

CODE_GENDER:True
FLAG_OWN_CAR:True
FLAG_OWN_REALTY:True
NAME_INCOME_TYPE:True
NAME_EDUCATION_TYPE:True
NAME_FAMILY_STATUS:True
NAME_HOUSING_TYPE:True
FLAG_MOBIL:False
FLAG_WORK_PHONE:False
FLAG_PHONE:False
FLAG_EMAIL:False


## Pipeline

In [14]:
# NOTE(review): both imputers below use strategy="constant" with fill_value=np.nan, i.e. NaN is
# replaced by NaN, so as configured they appear to leave the data unchanged — confirm this is
# an intentional placeholder.

# Numeric branch: imputer step followed by RobustScaler.
numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan),
        ),
        ("scaler", RobustScaler()),
    ]
)

# Categorical branch: imputer step followed by the WOEEncoder from category_encoders.
categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=np.nan),
        ),
        ("encoder", WOEEncoder()),
    ]
)

# Route each feature list to its branch. The transformer order (categorical first, then numeric)
# is kept as in the original definition.
preprocessor = ModifiedColumnTransformer(
    categorical_features=categorical_features,
    numeric_features=numeric_features,
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("numeric", numeric_transformer, numeric_features),
    ],
)

  preprocessor = ModifiedColumnTransformer(


In [15]:
# Display the assembled preprocessor (set_config(display='diagram') was called in the imports cell).
preprocessor

## Dump preprocessor

In [16]:
# Serialize the preprocessor under artifacts/ using the versioned filename from settings.
# NOTE(review): no fit call appears in the visible cells, so the object looks unfitted at this
# point — confirm that fitting happens downstream.
preprocessor_path = f"artifacts/{sts.PREPROCESSOR_FILENAME}"
with open(preprocessor_path, "wb") as artifact_file:
    pickle.dump(preprocessor, artifact_file)