## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

In [13]:
data = pd.read_csv('train.csv', usecols = ['Sex', 'Age', 'Fare', 'Embarked', 'Survived'])
data.head()

Unnamed: 0,Survived,Sex,Age,Fare,Embarked
0,0,male,22.0,7.25,S
1,1,female,38.0,71.2833,C
2,1,female,26.0,7.925,S
3,1,female,35.0,53.1,S
4,0,male,35.0,8.05,S


## Create the pipelines for Numerical data
- Steps
    1. Missing value 
    2. Scaling

### Import the libraries for the steps

In [4]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

## Specify the steps

In [6]:
steps = [('imputer', sim_imp), ('Scaler', ss)]

## Create a pipeline

In [7]:
from sklearn.pipeline import Pipeline
pipe_num = Pipeline(steps)

## Creating pipeline for Categorical data
- Steps
    1. Missing Value Treatment
    2. One Hot Encoding

## Import the libraries for steps

In [8]:
from sklearn.impute import SimpleImputer
imp_cat = SimpleImputer(strategy = 'most_frequent')

In [9]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

## specify the steps

In [10]:
steps = [('Cat_imputer',imp_cat), ("Encoder",ohe)]

## Create a pipeline

In [11]:
from sklearn.pipeline import Pipeline
pipe_cat = Pipeline(steps)

# Column Transformation

## Step 1: Specify your num and cat columns

In [14]:
num_features = ['Age', 'Fare']
cat_features = ['Sex', 'Embarked']
all_features = num_features + cat_features
all_features

['Age', 'Fare', 'Sex', 'Embarked']

## Step 2 : perform Column Transformation

In [15]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([('num_transform', pipe_num, num_features), 
                   ('cat_transform', pipe_cat, cat_features)])

## Step 3 : Fit the full_pipeline on the data

In [16]:
full_pipeline.fit_transform(data)

array([[-0.56573646, -0.50244517,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.66386103,  0.78684529,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.48885426,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.1046374 , -0.17626324,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.04438104,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.49237783,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [18]:
data_test = pd.read_csv('test.csv', usecols = ['Sex', 'Age', 'Fare', 'Embarked'])
data_test.head()

Unnamed: 0,Sex,Age,Fare,Embarked
0,male,34.5,7.8292,Q
1,female,47.0,7.0,S
2,male,62.0,9.6875,Q
3,male,27.0,8.6625,S
4,female,22.0,12.2875,S


In [19]:
full_pipeline.fit_transform(data_test)

array([[ 0.38623105, -0.49741333,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.37137004, -0.51227801,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 2.55353683, -0.46410047,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.70147553, -0.50779638,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.20485235, -0.49345515,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.20485235, -0.23695704,  0.        , ...,  1.        ,
         0.        ,  0.        ]])

- Conclusion :
1. Using pipelines and Column Transformers we automated all the preprocessing steps.
2. Using this automated steps entire data preprocessing is done at once