# **`Data Preparation`**


1. Impute missing values using different strategies
2. Encode categorical columns
3. Feature engineer - Custom sklearn Transformer
4. Set up a Data Preprocessing Pipeline 

## Autoreload Module

In [1]:
%load_ext autoreload
%autoreload 2

## Change Project Directory


In [2]:
import os
os.chdir('/content/drive/MyDrive/projects/data-science-explorations/mlops/mpg-pred-end-to-end-ml/notebooks')
os.listdir('../')

['README.md',
 'data',
 'notebooks',
 'src',
 'venv-hpx360-win',
 'requirements.txt',
 'scripts',
 'reports',
 '.gitignore',
 'src.egg-info',
 'setup.py']

In [3]:
!pip install -e ../.

Obtaining file:///content/drive/My%20Drive/projects/data-science-explorations/mlops/mpg-pred-end-to-end-ml
Installing collected packages: src
  Found existing installation: src 0.0.1
    Can't uninstall 'src'. No files were found to uninstall.
  Running setup.py develop for src
Successfully installed src


## Some Imports

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



## Load Data

In [5]:
from src.project import load_data
df = load_data()
df.head()

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


## Make a copy of the data

* We are doing this to demonstrate preprocessing of data, before we finally make the preprocessing pipeline

In [6]:
df_prep = df.copy()

## Split into train and test sets

In [7]:
RANDOM_STATE = 42
TEST_SIZE = 0.2

TARGET_VARIABLE = 'mpg'

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

In [9]:
splitter = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)
splitter

StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2,
            train_size=None)

In [10]:
for idx_train, idx_test in splitter.split(df, df['cylinder']): # splitting so that both groups have same distribution of cylinders
    dfX_train, dfy_train = df.loc[idx_train].drop([TARGET_VARIABLE], axis=1), df.loc[idx_train, [TARGET_VARIABLE]]
    dfX_test, dfy_test = df.loc[idx_test].drop([TARGET_VARIABLE], axis=1), df.loc[idx_test, [TARGET_VARIABLE]]


In [11]:
print(f'Train: {len(dfX_train)}, Test: {len(dfX_test)}')

Train: 318, Test: 80


## Transformer to map `Origin` to Country

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
class ValueMapper(BaseEstimator, TransformerMixin):
    
    def __init__(self, mapper: dict):
        self.mapper = mapper

    def fit(self, X, y=None):
        # TODO: Get column wise default mappings
        return self

    def transform(self, X):
        X = X.apply(lambda col: col.map(self.mapper)).to_numpy()
        return X

In [13]:
origin_mapper = ValueMapper({1: "India", 2: "USA", 3: "Germany"})
transformed = origin_mapper.fit_transform(df_prep[['origin']])
transformed

array([['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['Germany'],
       ['India'],
       ['India'],
       ['India'],
       ['Germany'],
       ['USA'],
       ['USA'],
       ['USA'],
       ['USA'],
       ['USA'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['Germany'],
       ['India'],
       ['Germany'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['India'],
       ['USA'],
       ['USA'],
       ['USA'],
       ['Germany'],
       ['Germany'],
       ['USA']

In [14]:
df_prep['origin'] = transformed
df_prep

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,India
1,15.0,8,350.0,165.0,3693.0,11.5,70,India
2,18.0,8,318.0,150.0,3436.0,11.0,70,India
3,16.0,8,304.0,150.0,3433.0,12.0,70,India
4,17.0,8,302.0,140.0,3449.0,10.5,70,India
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,India
394,44.0,4,97.0,52.0,2130.0,24.6,82,USA
395,32.0,4,135.0,84.0,2295.0,11.6,82,India
396,28.0,4,120.0,79.0,2625.0,18.6,82,India


## Encoding the `origin` column

* Since this is a nominal variable, we will one-hot-encode it

In [15]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
transformed = ohe.fit_transform(df_prep[['origin']])
transformed

<398x3 sparse matrix of type '<class 'numpy.float64'>'
	with 398 stored elements in Compressed Sparse Row format>

> **Note**: A sparse matrix is returned. This is, in many occassions, quite useful, especially for text data where the categories are the unique tokens in the vocabulary, which is quite high


In [16]:
ohe.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

In [17]:
ohe.get_feature_names(['origin'])

array(['origin_Germany', 'origin_India', 'origin_USA'], dtype=object)

In [18]:
transformed = pd.DataFrame(transformed.toarray().astype(int), columns=ohe.get_feature_names(['origin']))
transformed

Unnamed: 0,origin_Germany,origin_India,origin_USA
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
393,0,1,0
394,0,0,1
395,0,1,0
396,0,1,0


In [19]:
df_prep = pd.concat([df_prep.drop(['origin'], axis=1), transformed], axis=1)
df_prep.head()

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin_Germany,origin_India,origin_USA
0,18.0,8,307.0,130.0,3504.0,12.0,70,0,1,0
1,15.0,8,350.0,165.0,3693.0,11.5,70,0,1,0
2,18.0,8,318.0,150.0,3436.0,11.0,70,0,1,0
3,16.0,8,304.0,150.0,3433.0,12.0,70,0,1,0
4,17.0,8,302.0,140.0,3449.0,10.5,70,0,1,0


## Imputing missing values

In [20]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=RANDOM_STATE)

cols = ['horsepower']
transformed = imputer.fit_transform(df_prep)
transformed

array([[ 18.,   8., 307., ...,   0.,   1.,   0.],
       [ 15.,   8., 350., ...,   0.,   1.,   0.],
       [ 18.,   8., 318., ...,   0.,   1.,   0.],
       ...,
       [ 32.,   4., 135., ...,   0.,   1.,   0.],
       [ 28.,   4., 120., ...,   0.,   1.,   0.],
       [ 31.,   4., 119., ...,   0.,   1.,   0.]])

In [21]:
df_prep = pd.DataFrame(transformed, columns=df_prep.columns)
df_prep.head()

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin_Germany,origin_India,origin_USA
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,0.0,1.0,0.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,0.0,1.0,0.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,0.0,1.0,0.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,0.0,1.0,0.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,0.0,1.0,0.0


## Feature Engineering new attributes



In [22]:
class CustomAttrAdder(BaseEstimator, TransformerMixin):

    INDICES = {
        'cylinder': 1,
        'displacement': 2,
        'horsepower': 3,
        'weight': 4,
        'acceleration': 5
    }

    def __init__(self, 
                 on_pairs=(('displacement', 'horsepower'),
                           ('weight', 'cylinder'),
                           ('acceleration', 'horsepower'),
                           ('acceleration', 'cylinder'))):
    
        self.on_pairs = on_pairs
        self.feature_names = []
        for col1, col2 in self.on_pairs:
            self.feature_names.append(f'{col1}_on_{col2}')

    def fit(self, X, y=None):
        
        return self

    def transform(self, X):
        transformed = []

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        
        for col1, col2 in self.on_pairs:
            t = X[:, self.INDICES[col1]]/X[:, self.INDICES[col2]]
            
            transformed.append(t)

        return np.concatenate([X, np.array(transformed).T], axis=1)


    



In [23]:
attr_adder = CustomAttrAdder()
transformed = attr_adder.fit_transform(df_prep.to_numpy())

In [24]:
attr_adder.feature_names

['displacement_on_horsepower',
 'weight_on_cylinder',
 'acceleration_on_horsepower',
 'acceleration_on_cylinder']

In [25]:
df_prep = pd.DataFrame(transformed, columns=list(df_prep.columns) + attr_adder.feature_names)
df_prep.head()

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin_Germany,origin_India,origin_USA,displacement_on_horsepower,weight_on_cylinder,acceleration_on_horsepower,acceleration_on_cylinder
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,0.0,1.0,0.0,2.361538,438.0,0.092308,1.5
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,0.0,1.0,0.0,2.121212,461.625,0.069697,1.4375
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,0.0,1.0,0.0,2.12,429.5,0.073333,1.375
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,0.0,1.0,0.0,2.026667,429.125,0.08,1.5
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,0.0,1.0,0.0,2.157143,431.125,0.075,1.3125


Alright now, we have created our preprocessed dataframe! But how do we preprocess another dataframe by applying the same steps in the data pipeline?

> sklearn `Pipeline`


## Preprocessing data `Pipeline`

In [26]:
df.head(1)

Unnamed: 0,mpg,cylinder,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

In [28]:
cat_encoder = ColumnTransformer([
    ('enc', OneHotEncoder(), ['origin'])
], remainder='drop')


col_dropper = ColumnTransformer([
    ('drop_cols', 'drop', ['origin'])
], remainder='passthrough')

imputer = Pipeline([
    ('drop_cols', col_dropper),
    ('impute', IterativeImputer(random_state=RANDOM_STATE))]
    )

feature_union = FeatureUnion([
    ('imputer', imputer),
    ('cat_encoder', cat_encoder)
    ])


pipe = Pipeline([
    ('feature_union', feature_union),
    ('attr_adder', CustomAttrAdder()),
    ('scale', StandardScaler())
])

### Fit on the training set and transform it

In [29]:
pipe.fit_transform(dfX_train)

array([[-0.85657842, -1.07804475, -1.1470902 , ...,  1.94940972,
         1.20141767,  1.48364431],
       [-0.85657842, -1.1174582 , -0.98553382, ...,  1.51737377,
         1.20854596,  1.66279955],
       [-0.85657842, -0.3587492 , -0.31238226, ..., -0.277037  ,
         0.52956686,  0.03141453],
       ...,
       [-0.85657842, -0.56566984, -0.52779076, ..., -0.3920346 ,
         1.04516581,  0.35591681],
       [-0.85657842, -0.78244384, -0.23160407, ...,  0.3745968 ,
         0.24522837,  0.40378517],
       [ 0.32260746, -0.45728283,  0.44154749, ..., -0.25129842,
         0.00521573,  0.14711459]])

### Transform the traintest data point/set

In [30]:
pipe.transform(dfX_test)

array([[ 0.32260746,  0.56746699, -0.09697376, ..., -0.68557005,
        -0.69746569, -0.8799507 ],
       [ 0.32260746,  0.56746699, -0.42008651, ..., -0.71831987,
        -0.44252938, -0.8958253 ],
       [-0.85657842, -1.05833802, -1.23302957, ...,  1.5337121 ,
         2.0570856 ,  1.68052993],
       ...,
       [-0.85657842, -0.80215056, -0.63549501, ...,  0.38425751,
         1.04529585,  0.69835701],
       [ 1.50179333,  1.09954863,  1.24932936, ..., -1.17961837,
        -1.02861306, -1.11472777],
       [-0.85657842, -0.94009766, -0.68934713, ...,  0.58120662,
         0.97184264,  1.09570135]])