## Purpose

Reusable steps in processing tabular data

In [10]:
import random

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
"""
Check if the current code is running in a notebook
From https://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook
"""
def isnotebook():
    """Report whether the code is running inside a Jupyter notebook kernel.

    Returns:
        bool: True only when the active IPython shell class is named
        'ZMQInteractiveShell' (Jupyter notebook or qtconsole). False for a
        terminal IPython session, any other shell type, or when
        ``get_ipython`` is not defined at all (plain Python interpreter).
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython only exists when IPython has injected it.
        return False
    return shell_name == 'ZMQInteractiveShell'

## Tabular Data

In [11]:
def get_categorical_columns(df, ignore_cols=None):
    """Return the names of the ``category`` / ``object`` columns of ``df``.

    Args:
        df: DataFrame to inspect.
        ignore_cols: Optional collection of column names to leave out of the
            result. ``None`` (the default) leaves nothing out.

    Returns:
        list: Names of the columns whose dtype is ``category`` or ``object``,
        excluding anything listed in ``ignore_cols``.
    """
    # A None sentinel replaces the former mutable ``[]`` default and lets
    # callers pass ``ignore_cols=None`` explicitly.
    ignored = () if ignore_cols is None else ignore_cols
    selected = df.select_dtypes(include=['category', 'object']).columns
    return [col for col in selected if col not in ignored]

def get_numeric_columns(df, ignore_cols=None):
    """Return the names of the columns of ``df`` that are not categorical.

    The selection is made by *excluding* the ``category`` and ``object``
    dtypes, so every other dtype present in ``df`` is returned, not only
    strictly numeric ones.

    Args:
        df: DataFrame to inspect.
        ignore_cols: Optional collection of column names to leave out of the
            result. ``None`` (the default) leaves nothing out.

    Returns:
        list: Names of the columns whose dtype is neither ``category`` nor
        ``object``, excluding anything listed in ``ignore_cols``.
    """
    # A None sentinel replaces the former mutable ``[]`` default and lets
    # callers pass ``ignore_cols=None`` explicitly.
    ignored = () if ignore_cols is None else ignore_cols
    selected = df.select_dtypes(exclude=['category', 'object']).columns
    return [col for col in selected if col not in ignored]


### Transformations
First, create a sample dataframe.

Then implement imputation and encoding for the categorical columns, and imputation and scaling for the continuous columns.

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline

# Toy frame: two numeric and two categorical columns, with gaps in n1, c1 and c2.
df = pd.DataFrame(dict(
    n1=[1, 2, 3, 4, 5, None],
    n2=[5, 3, 2, 6, 1, 4],
    c1=['a', 'a', 'b', 'a', np.nan, 'b'],
    c2=['x', np.nan, 'y', np.nan, np.nan, 'x'],
))

# Split the column names by dtype so each group can get its own transformations.
cat_cols, cont_cols = get_categorical_columns(df), get_numeric_columns(df)

# Let's define a custom transformation
class SortData(TransformerMixin, BaseEstimator):
    """Stateless transformer that sorts the rows of a DataFrame.

    Inheriting from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params`` / ``set_params`` so the step can be
    cloned and tuned like any other estimator.

    Args:
        cols: Column label, or list of labels, to sort by. Defaults to
            ``['lon', 'lat']``.
        ascending: Sort direction(s), forwarded unchanged to
            ``DataFrame.sort_values``. Defaults to ``[1, 1]``, which matches
            the two default columns; when passing a list it must have one
            entry per sort column.

    Note:
        Only ``X`` is reordered by ``transform``; a ``y`` passed alongside it
        is left untouched.
    """

    def __init__(self, cols=None, ascending=None):
        # Build the defaults per instance: list defaults in the signature
        # would be shared by every instance that relies on them.
        self.cols = ['lon', 'lat'] if cols is None else cols
        self.ascending = [1, 1] if ascending is None else ascending

    def fit(self, df, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` sorted by ``self.cols`` in the ``self.ascending`` order."""
        return X.sort_values(by=self.cols, ascending=self.ascending)

ColumnTransformer allows us to use different sets of transformations for categorical and continuous columns.
However, it returns a NumPy array. The steps below show how to get a dataframe back.

https://towardsdatascience.com/using-columntransformer-to-combine-data-processing-steps-af383f7d5260
https://stackoverflow.com/questions/68874492/preserve-column-order-after-applying-sklearn-compose-columntransformer

In [13]:
# Column groups; each group gets its own pipeline so that its steps run in a fixed sequence.
cat_cols = get_categorical_columns(df)
cont_cols = get_numeric_columns(df)

# Categorical columns: fill gaps with a constant label, then ordinal-encode.
cat_pipe = Pipeline(steps=[
    ('imp_cat', SimpleImputer(strategy='constant', fill_value='-999')),
    ('encode_cat', OrdinalEncoder()),
])

# Continuous columns: fill gaps with the column mean, then scale.
cont_pipe = Pipeline(steps=[
    ('imp_cont', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Apply each pipeline to its column group.
col_trans = ColumnTransformer(
    transformers=[('cats', cat_pipe, cat_cols), ('conts', cont_pipe, cont_cols)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Sort the rows first, then run the per-group transformations.
pipe = Pipeline(steps=[
    ('sort', SortData(cols=['n2'], ascending=[1])),
    ('column_trans', col_trans),
])

# Wrap the transformed values in a dataframe, taking the column names from
# the fitted ColumnTransformer (the last pipeline step).
pd.DataFrame(data=pipe.fit_transform(df), columns=pipe[-1].get_feature_names_out())

Unnamed: 0,c1,c2,n1,n2
0,0.0,0.0,1.333333,-1.0
1,2.0,2.0,0.0,-0.6
2,1.0,0.0,-0.666667,-0.2
3,2.0,1.0,0.0,0.2
4,1.0,1.0,-1.333333,0.6
5,1.0,0.0,0.666667,1.0


Note that `verbose_feature_names_out=False` is needed to keep the transformer-name prefixes (e.g. `cats__c1`) out of the output column names.

### Outliers
Clip outliers to the 1st and 99th percentiles (values are capped, not removed)

In [7]:
# 50 uniform samples in [0, 1) plus one obvious outlier (2.0). A seeded
# generator keeps the sample, and therefore the bounds, reproducible.
x = pd.Series(list(np.random.default_rng(42).random(50)) + [2.0])

# np.percentile returns the values in the order requested: 1st, then 99th.
# (The names were previously swapped, so `upper_bound` held the 1st percentile.)
lower_bound, upper_bound = np.percentile(x, [1, 99])

# Values outside [lower_bound, upper_bound] are pulled to the nearest bound.
x_clipped = np.clip(x, lower_bound, upper_bound)

### Scalers
https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html

Which scaler to choose? - https://docs.google.com/spreadsheets/d/1woVi7wq13628HJ-tN6ApaRGVZ85OdmHsDBKLAf5ylaQ/edit#gid=0