# Dynamic preprocessing models

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import pickle
from pandas.api.types import is_numeric_dtype
import os

## Create data

In [2]:
# raw - train
df_raw_train = pd.DataFrame(
    {
        'col1': [1,1,1,1,np.nan],
        'col2': [2,2,2,2,np.nan],
        'col3': [3,3,3,3,np.nan],
        'col4': ['four','Four','fo ur','four',np.nan],
        'col5': ['five','Five','fi ve','five',np.nan],
    }
)

# show
df_raw_train

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,,,,,


In [3]:
# raw - test
df_raw_test = pd.DataFrame(
     {
        'col1': [np.nan, np.nan],
        'col2': [np.nan, np.nan],
        'col3': [np.nan, np.nan],
        'col4': [np.nan, np.nan],
        'col5': [np.nan, np.nan],
    }
)

# show
df_raw_test

Unnamed: 0,col1,col2,col3,col4,col5
0,,,,,
1,,,,,


## Numeric Imputer (minimum)

### Find minimum of each numeric column

In [4]:
# loose code
dict_impute_min = {}
for col in df_raw_train.columns:
    # pick numeric columns by dtype instead of catching TypeError from np.min:
    # a text column with no missing values has a valid (string) minimum and
    # would otherwise be recorded here as if it were numeric
    if is_numeric_dtype(df_raw_train[col]):
        # get min (Series.min skips NaN by default) and assign to dict_impute_min
        dict_impute_min[col] = df_raw_train[col].min()

# show
pprint(dict_impute_min)

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [5]:
# function
def fit(X):
    """Find the minimum of every numeric column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data; may mix numeric and non-numeric columns.

    Returns
    -------
    dict
        Maps each numeric column name to that column's minimum
        (missing values are skipped by Series.min).
    """
    dict_impute_min = {}
    for col in X.columns:
        # pick numeric columns by dtype instead of catching TypeError from np.min:
        # a text column with no missing values has a valid (string) minimum and
        # would otherwise be recorded as if it were numeric
        if is_numeric_dtype(X[col]):
            # get min and assign to dict_impute_min
            dict_impute_min[col] = X[col].min()
    # return dict_impute_min
    return dict_impute_min

# use fit
dict_impute_min = fit(X=df_raw_train)

# show
pprint(dict_impute_min)

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


### Impute the minimum for each column

In [6]:
# create a copy of the data so we don't edit the originals
df_raw_train_copy = df_raw_train.copy()

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,,,,,


In [7]:
# loose code
for key, val in dict_impute_min.items():
    # impute
    df_raw_train_copy[key] = df_raw_train_copy[key].fillna(val, inplace=False)

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,1.0,2.0,3.0,,


In [8]:
# create a copy of the data so we don't edit the originals
df_raw_train_copy = df_raw_train.copy()

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,,,,,


In [9]:
# function
def transform(X, dict_impute_min):
    """Fill missing values in the columns named by dict_impute_min.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to impute. It is not modified; a filled copy is returned.
    dict_impute_min : dict
        Maps column name -> value used to fill that column's missing entries.

    Returns
    -------
    pandas.DataFrame
        Copy of X with the listed columns filled.

    Raises
    ------
    KeyError
        If a key of dict_impute_min is not a column of X.
    """
    # work on a copy so the caller's frame is never edited as a side effect
    X = X.copy()
    for key, val in dict_impute_min.items():
        # impute
        X[key] = X[key].fillna(val)
    # return X
    return X

In [10]:
# transform train
df_raw_train_copy = transform(
    X=df_raw_train_copy, 
    dict_impute_min=dict_impute_min,
)

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,1.0,2.0,3.0,,


### Create class

In [11]:
# create a copy of the data so we don't edit the originals
df_raw_train_copy = df_raw_train.copy()

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,,,,,


In [12]:
# class
class ImputerNumericMin:
    """Impute missing values in numeric columns with the column minimum learned in fit.

    After fit, ``dict_impute_min`` maps each numeric column name to its minimum.
    """
    # initialize
    def __init__(self):
        pass
    # fit
    def fit(self, X):
        """Learn the minimum of every numeric column of the DataFrame X; returns self."""
        # pick numeric columns by dtype instead of catching TypeError from np.min:
        # a text column with no missing values has a valid (string) minimum and
        # would otherwise be recorded as if it were numeric
        dict_impute_min = {
            col: X[col].min()
            for col in X.columns
            if is_numeric_dtype(X[col])
        }
        # save to object
        self.dict_impute_min = dict_impute_min
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a copy of X with the fitted columns' missing values filled.

        Raises KeyError if a fitted column is absent from X (this version is
        intentionally not tolerant of dropped columns).
        """
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        for key, val in self.dict_impute_min.items():
            # impute
            X[key] = X[key].fillna(val)
        # return X
        return X

In [13]:
# initialize class
cls_imputer_numeric_min = ImputerNumericMin()

# show
try:
    pprint(cls_imputer_numeric_min.dict_impute_min)
except AttributeError:
    print('dict_impute_min does not exist, please run fit and check again')

dict_impute_min does not exist, please run fit and check again


In [14]:
# fit train
cls_imputer_numeric_min.fit(X=df_raw_train_copy)

# show
try:
    pprint(cls_imputer_numeric_min.dict_impute_min)
except AttributeError:
    print('dict_impute_min does not exist, please run fit and check again')

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [15]:
# transform train
df_raw_train_copy = cls_imputer_numeric_min.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,1.0,2.0,3.0,,


### But what if the data changes? What if a column is dropped?

In [16]:
# list cols to keep (drop col3)
list_cols_keep = [
    'col1',
    'col2',
    'col4',
    'col5',
]

# subset
df_raw_train_copy = df_raw_train.copy()[list_cols_keep]

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col4,col5
0,1.0,2.0,four,five
1,1.0,2.0,Four,Five
2,1.0,2.0,fo ur,fi ve
3,1.0,2.0,four,five
4,,,,


In [17]:
# transform train
try:
    df_raw_train_copy = cls_imputer_numeric_min.transform(X=df_raw_train_copy)
except KeyError as e:
    print(f'Error! {e} not in data')

Error! 'col3' not in data


### Edit the transform function to be dynamic (i.e., will work even if columns are dropped)

In [18]:
# subset
df_raw_train_copy = df_raw_train.copy()[list_cols_keep]

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col4,col5
0,1.0,2.0,four,five
1,1.0,2.0,Four,Five
2,1.0,2.0,fo ur,fi ve
3,1.0,2.0,four,five
4,,,,


In [19]:
# class
class ImputerNumericMin:
    """Impute missing values in numeric columns with the column minimum learned in fit.

    After fit, ``dict_impute_min`` maps each numeric column name to its minimum.
    transform tolerates frames that no longer contain every fitted column.
    """
    # initialize
    def __init__(self):
        pass
    # fit
    def fit(self, X):
        """Learn the minimum of every numeric column of the DataFrame X; returns self."""
        # pick numeric columns by dtype instead of catching TypeError from np.min:
        # a text column with no missing values has a valid (string) minimum and
        # would otherwise be recorded as if it were numeric
        dict_impute_min = {
            col: X[col].min()
            for col in X.columns
            if is_numeric_dtype(X[col])
        }
        # save to object
        self.dict_impute_min = dict_impute_min
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a copy of X with missing values filled in the fitted columns it still has."""
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        # future proof: skip fitted columns that were dropped from X;
        # the dict saved on self is left untouched
        for key, val in self.dict_impute_min.items():
            if key in X.columns:
                # impute
                X[key] = X[key].fillna(val)
        # return X
        return X

In [20]:
# initialize class
cls_imputer_numeric_min = ImputerNumericMin()

# fit on full data
cls_imputer_numeric_min.fit(X=df_raw_train)

# show
try:
    pprint(cls_imputer_numeric_min.dict_impute_min)
except AttributeError:
    print('dict_impute_min does not exist, please run fit and check again')

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [21]:
# transform train
df_raw_train_copy = cls_imputer_numeric_min.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col4,col5
0,1.0,2.0,four,five
1,1.0,2.0,Four,Five
2,1.0,2.0,fo ur,fi ve
3,1.0,2.0,four,five
4,1.0,2.0,,


### But, what if the developers need some sort of logging so they can monitor the model in production?

In [22]:
# class
class ImputerNumericMin:
    """Impute missing values in numeric columns with the column minimum learned in fit.

    After fit, ``dict_impute_min`` maps each numeric column name to its minimum.
    transform tolerates frames that no longer contain every fitted column and
    prints ``str_message`` each time it runs.
    """
    # initialize
    def __init__(self, str_message='Numeric imputation (minimum)'):
        self.str_message = str_message
    # fit
    def fit(self, X):
        """Learn the minimum of every numeric column of the DataFrame X; returns self."""
        # pick numeric columns by dtype instead of catching TypeError from np.min:
        # a text column with no missing values has a valid (string) minimum and
        # would otherwise be recorded as if it were numeric
        dict_impute_min = {
            col: X[col].min()
            for col in X.columns
            if is_numeric_dtype(X[col])
        }
        # save to object
        self.dict_impute_min = dict_impute_min
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a copy of X with missing values filled in the fitted columns it still has."""
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        # future proof: skip fitted columns that were dropped from X;
        # the dict saved on self is left untouched
        for key, val in self.dict_impute_min.items():
            if key in X.columns:
                # impute
                X[key] = X[key].fillna(val)
        # print message
        print(self.str_message)
        # return X
        return X

In [23]:
# initialize class
cls_imputer_numeric_min = ImputerNumericMin(str_message='Numeric imputation (minimum)')

# fit on full data
cls_imputer_numeric_min.fit(X=df_raw_train)

# show
try:
    pprint(cls_imputer_numeric_min.dict_impute_min)
except AttributeError:
    print('dict_impute_min does not exist, please run fit and check again')

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [24]:
# transform train
df_raw_train_copy = cls_imputer_numeric_min.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Numeric imputation (minimum)


Unnamed: 0,col1,col2,col4,col5
0,1.0,2.0,four,five
1,1.0,2.0,Four,Five
2,1.0,2.0,fo ur,fi ve
3,1.0,2.0,four,five
4,1.0,2.0,,


## Non-numeric text cleaner and imputer

In [25]:
# class
class CleanAndImputeText:
    """Lower-case and remove spaces from every non-numeric column seen in fit.

    After fit, ``list_clean_and_impute`` holds the non-numeric column names.
    transform casts those columns to str before cleaning, so missing values
    come out as text rather than being filled with a learned value.
    """
    # initialize
    def __init__(self, str_message='Lower and strip text'):
        self.str_message = str_message
    # fit
    def fit(self, X):
        """Record the non-numeric columns of the DataFrame X; returns self."""
        # get non-numeric columns
        list_clean_and_impute = [col for col in X.columns if not is_numeric_dtype(X[col])]
        # save to object
        self.list_clean_and_impute = list_clean_and_impute
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a cleaned copy of X and print ``str_message``."""
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        # iterate (future-proof: skip fitted columns that were dropped from X)
        for col in self.list_clean_and_impute:
            if col in X.columns:
                # regex=False: the space is a literal, independent of the
                # pandas-version-specific default for `regex`
                X[col] = X[col].astype(str).str.lower().str.replace(' ', '', regex=False)
        # print message
        print(self.str_message)
        # return X
        return X

In [26]:
# initialize class
cls_clean_and_impute = CleanAndImputeText(str_message='Lower and strip text')

# fit to train
cls_clean_and_impute.fit(X=df_raw_train)

# print list
pprint(cls_clean_and_impute.list_clean_and_impute)

['col4', 'col5']


In [27]:
# transform train
df_raw_train_copy = cls_clean_and_impute.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Lower and strip text


Unnamed: 0,col1,col2,col4,col5
0,1.0,2.0,four,five
1,1.0,2.0,four,five
2,1.0,2.0,four,five
3,1.0,2.0,four,five
4,1.0,2.0,,


In [28]:
# check the last row of col4
if df_raw_train_copy['col4'].iloc[-1] == 'nan':
    print('Success')
else:
    print('Error')

Success


## Preprocessing model

In [29]:
# list cols to keep (drop col3 and col 4)
list_cols_keep = [
    'col1',
    'col2',
    'col5',
]

# subset
df_raw_train_copy = df_raw_train.copy()[list_cols_keep]

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,Five
2,1.0,2.0,fi ve
3,1.0,2.0,five
4,,,


In [30]:
# initialize class
cls_imputer_numeric_min = ImputerNumericMin(str_message='Numeric imputation (minimum)')

# fit on training
cls_imputer_numeric_min.fit(X=df_raw_train)

# print
pprint(cls_imputer_numeric_min.dict_impute_min)

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [31]:
# initialize class
cls_clean_and_impute = CleanAndImputeText(str_message='Lower and strip text')

# fit on training
cls_clean_and_impute.fit(X=df_raw_train)

# print
pprint(cls_clean_and_impute.list_clean_and_impute)

['col4', 'col5']


In [32]:
# put into list
list_transformers = [
    cls_imputer_numeric_min,
    cls_clean_and_impute,
]

In [33]:
# iterate and transform train
for transformer in list_transformers:
    df_raw_train_copy = transformer.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,five
2,1.0,2.0,five
3,1.0,2.0,five
4,1.0,2.0,


### Create class

In [34]:
# list cols to keep (drop col3 and col 4)
list_cols_keep = [
    'col1',
    'col2',
    'col5',
]

# subset
df_raw_train_copy = df_raw_train.copy()[list_cols_keep]

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,Five
2,1.0,2.0,fi ve
3,1.0,2.0,five
4,,,


In [35]:
# subset
df_raw_test_copy = df_raw_test.copy()[list_cols_keep]

# show
df_raw_test_copy

Unnamed: 0,col1,col2,col5
0,,,
1,,,


In [36]:
# class
class PreprocessingModel:
    """Chain of already-fitted transformers applied one after another."""
    # initialize
    def __init__(self, list_transformers):
        # each element must expose transform(X=...) and return the transformed data
        self.list_transformers = list_transformers
    # transform
    def transform(self, X):
        """Feed X through every transformer in order and return the final result."""
        result = X
        # the output of one step is the input of the next
        for step in self.list_transformers:
            result = step.transform(X=result)
        return result

In [37]:
# initialize
cls_preprocessing_model = PreprocessingModel(list_transformers=list_transformers)

In [38]:
# transform train
df_raw_train_copy = cls_preprocessing_model.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,five
2,1.0,2.0,five
3,1.0,2.0,five
4,1.0,2.0,


In [39]:
# transform test
df_raw_test_copy = cls_preprocessing_model.transform(X=df_raw_test_copy)

# show
df_raw_test_copy

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col5
0,1.0,2.0,
1,1.0,2.0,


## Pickle preprocessing model, load it, and use it

In [40]:
# list cols to keep (drop col3 and col 4)
list_cols_keep = [
    'col1',
    'col2',
    'col5',
]

# subset
df_raw_train_copy = df_raw_train.copy()[list_cols_keep]

# show
df_raw_train_copy

Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,Five
2,1.0,2.0,fi ve
3,1.0,2.0,five
4,,,


In [41]:
# show files
pprint(os.listdir())

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'dynamic_preprocessing_models.ipynb']


In [42]:
# filename
str_filename = './cls_model_preprocessing.pkl'

# pickle (the with-block closes the handle even if dump raises)
with open(str_filename, 'wb') as file_out:
    pickle.dump(cls_preprocessing_model, file_out)

# show files
pprint(os.listdir())

# load
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust
with open(str_filename, 'rb') as file_in:
    cls_preprocessing_model = pickle.load(file_in)

# rm (both handles are closed by now, so the file is no longer in use)
os.remove(str_filename)

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'cls_model_preprocessing.pkl',
 'dynamic_preprocessing_models.ipynb']


In [43]:
# transform
df_raw_train_copy = cls_preprocessing_model.transform(X=df_raw_train_copy)

# show
df_raw_train_copy

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col5
0,1.0,2.0,five
1,1.0,2.0,five
2,1.0,2.0,five
3,1.0,2.0,five
4,1.0,2.0,


## Make python library [preprocessing_toolbox_example](https://github.com/gopfsrisk/preprocessing_toolbox_example)

### Folder structure

In [44]:
# preprocessing_toolbox_example
#     |_____ .gitignore
#     |_____ setup.py
#     |_____ preprocessing_toolbox_example
#                 |_____ preprocessing.py
#                 |_____ __init__.py

### Create root folder

In [45]:
# dirname
str_dirname_root = 'preprocessing_toolbox_example'
# dirname
str_dirname_root_new = f'../{str_dirname_root}'

# mkdir
try:
    os.mkdir(str_dirname_root_new)
    print(f'Successfully created the root directory {str_dirname_root_new}')
except FileExistsError:
    print(f'The root directory {str_dirname_root_new} already exists')

Successfully created the root directory ../preprocessing_toolbox_example


In [46]:
# show objects in str_dirname_root_new
print(f'Items in {str_dirname_root_new}: {os.listdir(str_dirname_root_new)}')

Items in ../preprocessing_toolbox_example: []


### Write ```.gitignore``` to root directory

In [47]:
%%writefile ../preprocessing_toolbox_example/.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

Writing ../preprocessing_toolbox_example/.gitignore


### Write ```setup.py``` to root directory

In [48]:
%%writefile ../preprocessing_toolbox_example/setup.py
from setuptools import setup

setup(
    name='preprocessing_toolbox_example',
    version='0.0.1',
    description='Example',
    url='http://github.com/gopfsrisk/preprocessing_toolbox_example',
    author='Aaron England',
    author_email='aaron.england.dev@gmail.com',
    license='MIT',
    packages=['preprocessing_toolbox_example'],
    zip_safe=False,
)

Writing ../preprocessing_toolbox_example/setup.py


### Make internal folder

In [49]:
# dirname
str_dirname_new = f'../{str_dirname_root}/{str_dirname_root}'

# mkdir
try:
    os.mkdir(str_dirname_new)
    print(f'Successfully created the directory {str_dirname_new}')
except FileExistsError:
    print(f'The directory {str_dirname_new} already exists')

Successfully created the directory ../preprocessing_toolbox_example/preprocessing_toolbox_example


In [50]:
# show objects in str_dirname_root_new
print(f'Items in {str_dirname_root_new}: {os.listdir(str_dirname_root_new)}')

Items in ../preprocessing_toolbox_example: ['.gitignore', 'preprocessing_toolbox_example', 'setup.py']


In [51]:
# show objects in str_dirname_new
print(f'Items in {str_dirname_new}: {os.listdir(str_dirname_new)}')

Items in ../preprocessing_toolbox_example/preprocessing_toolbox_example: []


### Write ```preprocessing.py``` to internal folder

In [52]:
%%writefile ../preprocessing_toolbox_example/preprocessing_toolbox_example/preprocessing.py
import numpy as np
from pandas.api.types import is_numeric_dtype

# class
class ImputerNumericMin:
    """Impute missing values in numeric columns with the column minimum learned in fit.

    After fit, ``dict_impute_min`` maps each numeric column name to its minimum.
    transform tolerates frames that no longer contain every fitted column and
    prints ``str_message`` each time it runs.
    """
    # initialize
    def __init__(self, str_message='Numeric imputation (minimum)'):
        self.str_message = str_message
    # fit
    def fit(self, X):
        """Learn the minimum of every numeric column of the DataFrame X; returns self."""
        # pick numeric columns by dtype instead of catching TypeError from np.min:
        # a text column with no missing values has a valid (string) minimum and
        # would otherwise be recorded as if it were numeric
        dict_impute_min = {
            col: X[col].min()
            for col in X.columns
            if is_numeric_dtype(X[col])
        }
        # save to object
        self.dict_impute_min = dict_impute_min
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a copy of X with missing values filled in the fitted columns it still has."""
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        # future proof: skip fitted columns that were dropped from X;
        # the dict saved on self is left untouched
        for key, val in self.dict_impute_min.items():
            if key in X.columns:
                # impute
                X[key] = X[key].fillna(val)
        # print message
        print(self.str_message)
        # return X
        return X

# class
class CleanAndImputeText:
    """Lower-case and remove spaces from every non-numeric column seen in fit.

    After fit, ``list_clean_and_impute`` holds the non-numeric column names.
    transform casts those columns to str before cleaning, so missing values
    come out as text rather than being filled with a learned value.
    """
    # initialize
    def __init__(self, str_message='Lower and strip text'):
        self.str_message = str_message
    # fit
    def fit(self, X):
        """Record the non-numeric columns of the DataFrame X; returns self."""
        # get non-numeric columns
        list_clean_and_impute = [col for col in X.columns if not is_numeric_dtype(X[col])]
        # save to object
        self.list_clean_and_impute = list_clean_and_impute
        # return object
        return self
    # transform
    def transform(self, X):
        """Return a cleaned copy of X and print ``str_message``."""
        # work on a copy so the caller's frame is never edited as a side effect
        X = X.copy()
        # iterate (future-proof: skip fitted columns that were dropped from X)
        for col in self.list_clean_and_impute:
            if col in X.columns:
                # regex=False: the space is a literal, independent of the
                # pandas-version-specific default for `regex`
                X[col] = X[col].astype(str).str.lower().str.replace(' ', '', regex=False)
        # print message
        print(self.str_message)
        # return X
        return X

# class
class PreprocessingModel:
    """Chain of already-fitted transformers applied one after another."""
    # initialize
    def __init__(self, list_transformers):
        # each element must expose transform(X=...) and return the transformed data
        self.list_transformers = list_transformers
    # transform
    def transform(self, X):
        """Feed X through every transformer in order and return the final result."""
        result = X
        # the output of one step is the input of the next
        for step in self.list_transformers:
            result = step.transform(X=result)
        return result

Writing ../preprocessing_toolbox_example/preprocessing_toolbox_example/preprocessing.py


### Write ```__init__.py``` to internal folder

In [53]:
%%writefile ../preprocessing_toolbox_example/preprocessing_toolbox_example/__init__.py
# __init__

Writing ../preprocessing_toolbox_example/preprocessing_toolbox_example/__init__.py


In [54]:
# show objects in str_dirname_new
print(f'Items in {str_dirname_new}: {os.listdir(str_dirname_new)}')

Items in ../preprocessing_toolbox_example/preprocessing_toolbox_example: ['preprocessing.py', '__init__.py']


### Show directory structure

In [55]:
# spaces
str_spaces = '     '
str_hline = '_____'
str_vline = ' |'

# package name = last component of the root path (no hard-coded '../' slice)
str_package_name = os.path.basename(os.path.normpath(str_dirname_root_new))

# title
print(f'Directory structure for the package {str_package_name}:')
print('')

# print root
print(f'{str_package_name} (Root)')

# get list of items in str_dirname_root_new
list_items_root = os.listdir(str_dirname_root_new)
# ask the filesystem what is a directory; guessing from a '.' in the name
# misreads extension-less files (e.g. LICENSE) and dotted directories
list_dirs_root = [
    item for item in list_items_root
    if os.path.isdir(os.path.join(str_dirname_root_new, item))
]
# get files (everything that is not a directory)
list_files_root = [item for item in list_items_root if item not in list_dirs_root]
for file in list_files_root:
    print(f'{str_spaces}{str_vline}{str_hline} {file}')
# print dirs and their contents
for dir_ in list_dirs_root:
    print(f'{str_spaces}{str_vline}{str_hline} {dir_}')
    # get files
    list_files_dir = os.listdir(os.path.join(str_dirname_root_new, dir_))
    for file in list_files_dir:
        print(f'{str_spaces}{str_spaces}{str_spaces}{str_vline}{str_hline} {file}')

Directory structure for the package preprocessing_toolbox_example:

preprocessing_toolbox_example (Root)
      |_____ .gitignore
      |_____ setup.py
      |_____ preprocessing_toolbox_example
                |_____ preprocessing.py
                |_____ __init__.py


## Install package

In [56]:
! pip install git+https://github.com/gopfsrisk/preprocessing_toolbox_example@620bfb4d40dc661c2486760f159adb12abb3aa4d

Collecting git+https://github.com/gopfsrisk/preprocessing_toolbox_example@620bfb4d40dc661c2486760f159adb12abb3aa4d

  Running command git clone -q https://github.com/gopfsrisk/preprocessing_toolbox_example 'C:\Users\aengland\AppData\Local\Temp\pip-req-build-taksf5vj'
  Running command git rev-parse -q --verify 'sha^620bfb4d40dc661c2486760f159adb12abb3aa4d'
  Running command git fetch -q https://github.com/gopfsrisk/preprocessing_toolbox_example 620bfb4d40dc661c2486760f159adb12abb3aa4d



  Cloning https://github.com/gopfsrisk/preprocessing_toolbox_example (to revision 620bfb4d40dc661c2486760f159adb12abb3aa4d) to c:\users\aengland\appdata\local\temp\pip-req-build-taksf5vj
  Resolved https://github.com/gopfsrisk/preprocessing_toolbox_example to commit 620bfb4d40dc661c2486760f159adb12abb3aa4d


In [57]:
import preprocessing_toolbox_example.preprocessing as pre

In [58]:
# initialize class
cls_imputer_numeric_min = pre.ImputerNumericMin(str_message='Numeric imputation (minimum)')

# fit on train
cls_imputer_numeric_min.fit(X=df_raw_train)

# show
try:
    pprint(cls_imputer_numeric_min.dict_impute_min)
except AttributeError:
    print('dict_impute_min does not exist, please run fit and check again')

{'col1': 1.0, 'col2': 2.0, 'col3': 3.0}


In [59]:
# initialize class
cls_clean_and_impute = pre.CleanAndImputeText(str_message='Lower and strip text')

# fit on train
cls_clean_and_impute.fit(X=df_raw_train)

# show
try:
    pprint(cls_clean_and_impute.list_clean_and_impute)
except AttributeError:
    print('list_clean_and_impute does not exist, please run fit and check again')

['col4', 'col5']


In [60]:
# create list of transformers
list_transformers = [
    cls_imputer_numeric_min,
    cls_clean_and_impute,
]

# initialize class
cls_model_preprocessing = pre.PreprocessingModel(list_transformers=list_transformers)

In [61]:
# show train
df_raw_train

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,Four,Five
2,1.0,2.0,3.0,fo ur,fi ve
3,1.0,2.0,3.0,four,five
4,,,,,


In [62]:
# show test
df_raw_test

Unnamed: 0,col1,col2,col3,col4,col5
0,,,,,
1,,,,,


In [63]:
# transform train
df_raw_train = cls_model_preprocessing.transform(X=df_raw_train)

# show
df_raw_train

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,four,five
1,1.0,2.0,3.0,four,five
2,1.0,2.0,3.0,four,five
3,1.0,2.0,3.0,four,five
4,1.0,2.0,3.0,,


In [64]:
# transform test
df_raw_test = cls_model_preprocessing.transform(X=df_raw_test)

# show
df_raw_test

Numeric imputation (minimum)
Lower and strip text


Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,,
1,1.0,2.0,3.0,,
