## Purpose
Reusable code snippets and utility functions.

In [8]:
import pandas as pd
import numpy as np
import random

In [9]:
"""
Check if the current code is running in a notebook
From https://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook
"""
def isnotebook():
    """Tell whether the code is running in a Jupyter notebook or qtconsole.

    Returns:
        True only when the active shell's class name is 'ZMQInteractiveShell'.
        False for a terminal IPython shell, for any other shell type, and
        when ``get_ipython`` is not defined at all.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is not defined: probably the standard Python interpreter.
        return False
    # Every shell other than the ZMQ one (terminal IPython, unknown) counts as "not a notebook".
    return shell_name == 'ZMQInteractiveShell'

## Tabular Data

In [10]:
import pandas as pd

def get_categorical_columns(df, ignore_cols=None):
    """Return the names of the columns whose dtype is 'category' or 'object'.

    Args:
        df: DataFrame to inspect.
        ignore_cols: Column name(s) to leave out of the result. May be a
            collection of names, a single name given as a string, or None
            (the default) to ignore nothing.

    Returns:
        A list of column names, in the order they appear in ``df``.
    """
    if ignore_cols is None:
        ignored = ()
    elif isinstance(ignore_cols, str):
        # A bare string would otherwise be matched by substring
        # ('c' in 'c1' is True), silently dropping unrelated columns.
        ignored = (ignore_cols,)
    else:
        ignored = ignore_cols
    candidates = df.select_dtypes(include=['category', 'object']).columns
    return [col for col in candidates if col not in ignored]

def get_numeric_columns(df, ignore_cols=None):
    """Return the names of the columns whose dtype is neither 'category' nor 'object'.

    Note that selection is by exclusion: any column that is not
    category/object is returned, not only strictly numeric ones.

    Args:
        df: DataFrame to inspect.
        ignore_cols: Column name(s) to leave out of the result. May be a
            collection of names, a single name given as a string, or None
            (the default) to ignore nothing.

    Returns:
        A list of column names, in the order they appear in ``df``.
    """
    if ignore_cols is None:
        ignored = ()
    elif isinstance(ignore_cols, str):
        # A bare string would otherwise be matched by substring
        # ('n' in 'n1' is True), silently dropping unrelated columns.
        ignored = (ignore_cols,)
    else:
        ignored = ignore_cols
    candidates = df.select_dtypes(exclude=['category', 'object']).columns
    return [col for col in candidates if col not in ignored]


### Transformations
First create a sample dataframe 

Then we impute missing values in the categorical columns, and impute and scale the numeric columns.

In [41]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import Pipeline

# Toy frame: one numeric column (n1) and two string columns (c1, c2),
# each containing at least one missing value.
df = pd.DataFrame(dict(
    n1=[1, 2, 3, 4, 5, None],
    c1=['a', 'a', 'b', 'a', np.nan, 'b'],
    c2=['x', np.nan, 'y', np.nan, np.nan, 'x'],
))

# Partition the column names by dtype.
cont_cols = get_numeric_columns(df)
cat_cols = get_categorical_columns(df)

Reference (using ColumnTransformer to combine data processing steps): https://towardsdatascience.com/using-columntransformer-to-combine-data-processing-steps-af383f7d5260

In [42]:
# One pipeline per column group, so the steps inside each group run in a fixed order.
# Categorical columns: replace missing values with the constant string '-999'.
cat_pipe = Pipeline(steps=[
    ('imp_cat', SimpleImputer(strategy='constant', fill_value='-999')),
])
# Numeric columns: fill missing values with the column mean, then apply RobustScaler.
cont_pipe = Pipeline(steps=[
    ('imp_cont', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Route each column group through its own pipeline.
col_trans = ColumnTransformer(
    transformers=[('cats', cat_pipe, cat_cols), ('conts', cont_pipe, cont_cols)],
    remainder='passthrough',
)

col_trans.fit_transform(df)

array([['a', 'x', -1.3333333333333333],
       ['a', '-999', -0.6666666666666666],
       ['b', 'y', 0.0],
       ['a', '-999', 0.6666666666666666],
       ['-999', '-999', 1.3333333333333333],
       ['b', 'x', 0.0]], dtype=object)

### Outliers
Limit the influence of outliers by clipping values to the 1st and 99th percentiles.

In [43]:
# Synthetic sample: 50 uniform draws in [0, 1) plus one obvious outlier (2.0).
# A seeded generator keeps the sample identical across kernel restarts.
x = pd.Series(list(np.random.default_rng(42).random(50)) + [2.0])
# np.percentile returns values in the order requested: 1st percentile, then 99th.
lower_bound, upper_bound = np.percentile(x, [1, 99])
# Clip rather than drop: values outside the bounds are replaced by the nearest bound.
x_clipped = x.clip(lower=lower_bound, upper=upper_bound)

### Scalers
https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html

Which scaler to choose? - https://docs.google.com/spreadsheets/d/1woVi7wq13628HJ-tN6ApaRGVZ85OdmHsDBKLAf5ylaQ/edit#gid=0