<a href="https://colab.research.google.com/github/adamlutzz/CheatSheets/blob/master/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quick and easy preprocessing for a baseline model

## Imports

In [0]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## Data

In [0]:
# Toy feature table: each column contains exactly one missing value so that
# both the numeric and the categorical imputers have something to fill in.
columns = ["name", "age"]
data = [
    ["Alvin", 23],
    [np.nan, 21],
    ["Riken", np.nan],
]
df = pd.DataFrame(data, columns=columns)

# *** Note: remove the target column before this step ***

## Split numeric and categorical

In [0]:
# Partition the column labels by dtype: everything pandas treats as a number
# goes to `numeric`, every remaining column goes to `categorical`.
numeric = df.select_dtypes(include=["number"]).columns
categorical = df.select_dtypes(exclude=["number"]).columns

In [0]:
# Display the column labels that were detected as numeric.
numeric

Index(['age'], dtype='object')

## Create Categorical and Numeric preprocessing pipelines

In [0]:
# Categorical columns: fill each missing entry with the column's most frequent value.
c_steps = [("c_imputer", SimpleImputer(strategy="most_frequent"))]
c_pipeline = Pipeline(steps=c_steps)

# Numeric columns: SimpleImputer with its default strategy.
n_steps = [("n_imputer", SimpleImputer())]
n_pipeline = Pipeline(steps=n_steps)

## Fit pipelines to corresponding datatypes

In [0]:
# Fit each pipeline on its own group of columns and write the imputed values
# back into the same columns of `df`.
numeric_imputed = n_pipeline.fit_transform(df[numeric])
df[numeric] = numeric_imputed

categorical_imputed = c_pipeline.fit_transform(df[categorical])
df[categorical] = categorical_imputed

# Use the first function below if one-hot encoding is not a step in your categorical preprocessing; otherwise use the second

## 0 for only numeric features, 1 for only categorical, 2 for both

In [0]:
def simple_preprocess(df, switch):
  """Impute missing values and ordinal-encode categorical features.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature table to preprocess.
  switch : int
      0 -- treat every column as numeric: run ``SimpleImputer()`` on ``df.values``.
      1 -- treat every column as categorical: most-frequent imputation
           followed by ``OrdinalEncoder`` on ``df.values``.
      2 -- mixed table: columns are split by dtype and each group is run
           through its own pipeline.

  Returns
  -------
  For 0 and 1, whatever the pipeline's ``fit_transform`` returns for
  ``df.values``. For 2, a new DataFrame with the same columns and index as
  ``df`` whose values have been replaced by the pipeline outputs; the
  caller's ``df`` is left untouched. For any other value, "Invalid Switch"
  is printed and ``df`` is returned unchanged.
  """
  if switch == 0:
    n_pipeline = Pipeline([('n_imputer', SimpleImputer())])
    return n_pipeline.fit_transform(df.values)

  if switch == 1:
    c_pipeline = Pipeline([('c_imputer', SimpleImputer(strategy="most_frequent")),
                           ('ordinal', OrdinalEncoder())])
    return c_pipeline.fit_transform(df.values)

  if switch == 2:
    numeric = df.select_dtypes(include="number").columns
    categorical = df.select_dtypes(exclude="number").columns

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    processed = df.copy()

    # Skip a group that has no columns: fitting on a zero-column frame fails.
    if len(numeric):
      n_pipeline = Pipeline([('n_imputer', SimpleImputer())])
      processed[numeric] = n_pipeline.fit_transform(df[numeric])

    if len(categorical):
      c_pipeline = Pipeline([('c_imputer', SimpleImputer(strategy="most_frequent")),
                             ('ordinal', OrdinalEncoder())])
      processed[categorical] = c_pipeline.fit_transform(df[categorical])

    return processed

  # Unknown switch: report it and hand the input back unchanged.
  print("Invalid Switch")
  return df

## 0 for only numeric features, 1 for only categorical, 2 for both


In [0]:
def simple_preprocess(df, switch):
  """Impute missing values and one-hot encode categorical features.

  NOTE(review): this notebook defines another ``simple_preprocess`` (the
  ordinal-encoding variant) in an earlier cell; running both cells leaves
  only the most recently executed definition bound to the name.

  Parameters
  ----------
  df : pandas.DataFrame
      Feature table to preprocess.
  switch : int
      0 -- treat every column as numeric: run ``SimpleImputer()`` on ``df.values``.
      1 -- treat every column as categorical: most-frequent imputation
           followed by ``OneHotEncoder`` on ``df.values``.
      2 -- mixed table: columns are split by dtype, each group is run
           through its own pipeline, and the results are joined column-wise
           (numeric columns first).

  Returns
  -------
  For 0, whatever the imputer pipeline's ``fit_transform`` returns for
  ``df.values``. For 1 and 2, a new DataFrame indexed like ``df`` whose
  categorical columns are replaced by the encoder's generated one-hot
  columns. For any other value, "Invalid Switch" is printed and ``df`` is
  returned unchanged.
  """
  if switch == 0:
    n_pipeline = Pipeline([('n_imputer', SimpleImputer())])
    # fit_transform (not fit): return the imputed data, not the fitted pipeline.
    return n_pipeline.fit_transform(df.values)

  if switch == 1:
    c_pipeline = Pipeline([('c_imputer', SimpleImputer(strategy="most_frequent")),
                           ('onehot', OneHotEncoder())])
    categorical_data = c_pipeline.fit_transform(df.values).toarray()
    return pd.DataFrame(data=categorical_data,
                        columns=_onehot_feature_names(c_pipeline['onehot']),
                        index=df.index)

  if switch == 2:
    numeric = df.select_dtypes(include="number")
    categorical = df.select_dtypes(exclude="number")
    pieces = []

    # Skip a group that has no columns: fitting on a zero-column frame fails.
    if numeric.shape[1]:
      n_pipeline = Pipeline([('n_imputer', SimpleImputer())])
      numeric_data = n_pipeline.fit_transform(numeric)
      pieces.append(pd.DataFrame(data=numeric_data,
                                 columns=numeric.columns,
                                 index=df.index))

    if categorical.shape[1]:
      c_pipeline = Pipeline([('c_imputer', SimpleImputer(strategy="most_frequent")),
                             ('onehot', OneHotEncoder())])
      categorical_data = c_pipeline.fit_transform(categorical).toarray()
      pieces.append(pd.DataFrame(data=categorical_data,
                                 columns=_onehot_feature_names(c_pipeline['onehot']),
                                 index=df.index))

    # Every piece shares df.index, so the column-wise concat aligns row for row.
    return pd.concat(pieces, axis=1) if pieces else df.copy()

  # Unknown switch: report it and hand the input back unchanged.
  print("Invalid Switch")
  return df


def _onehot_feature_names(encoder):
  """Return the fitted encoder's output column names on old and new scikit-learn."""
  # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
  # and fall back only on releases that predate get_feature_names_out().
  if hasattr(encoder, "get_feature_names_out"):
    return encoder.get_feature_names_out()
  return encoder.get_feature_names()

In [0]:
# Mixed table (numeric + categorical columns), hence switch 2; the returned
# frame is the cell's last expression so it is rendered as the output.
simple_preprocess(df, switch=2)

Unnamed: 0,age,x0_Alvin,x0_Riken
0,23.0,1.0,0.0
1,21.0,1.0,0.0
2,22.0,0.0,1.0
