# Re-create your own _One Hot Encoder_ 

In [1]:
import pandas as pd
import seaborn as sns

## (1) The Titanic Dataset

In [2]:
# Loading 100% of the dataset. 
# Choose 0.5 to load only 50% of the rows randomly

data = sns.load_dataset('titanic').sample(frac = 1) 
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
528,0,3,male,39.0,0,0,7.925,S,Third,man,True,,Southampton,no,True
367,1,3,female,,0,0,7.2292,C,Third,woman,False,,Cherbourg,yes,True
480,0,3,male,9.0,5,2,46.9,S,Third,child,False,,Southampton,no,False
335,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
350,0,3,male,23.0,0,0,9.225,S,Third,man,True,,Southampton,no,True


In [3]:
from sklearn.model_selection import train_test_split

X = data.drop(columns = ['survived', 'alive', 'who', 'adult_male'])
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [4]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
716,1,female,38.0,0,0,227.5250,C,First,C,Cherbourg,True
573,3,female,,0,0,7.7500,Q,Third,,Queenstown,True
696,3,male,44.0,0,0,8.0500,S,Third,,Southampton,True
300,3,female,,0,0,7.7500,Q,Third,,Queenstown,True
877,3,male,19.0,0,0,7.8958,S,Third,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...
335,3,male,,0,0,7.8958,S,Third,,Southampton,True
554,3,female,22.0,0,0,7.7750,S,Third,,Southampton,True
847,3,male,35.0,0,0,7.8958,C,Third,,Cherbourg,True
51,3,male,21.0,0,0,7.8000,S,Third,,Southampton,True


## (2) A first pipeline

❓ Create a basic Pipeline which ***encodes categorical features*** and ***scales numerical features*** ❓

💡 Use [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) and [`make_column_transformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html)

In [5]:
num_features = ['age','fare','sibsp','parch']
cat_features = ['pclass','sex','embarked','class','embark_town','alone']

In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features),
#     remainder='passthrough'
)

pipe = make_pipeline(preproc_basic, SVC())

pipe

In [36]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'fare', 'sibsp', 'parch']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['pclass', 'sex', 'embarked', 'class',
                                     'embark_town', 'alone'])])),
  ('svc', SVC())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer()),
                                                  ('standardscaler',
                      

In [23]:
from sklearn import set_config; set_config(display='diagram')
pipe

In [37]:
pipe.fit(X_train, y_train)
pipe.predict(X_test)
pipe.score(X_test, y_test)

0.8246268656716418

In [38]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.8235483870967742

In [13]:
SimpleImputer.get_feature_names_out = (lambda self, names = None: self.feature_names_in_)

preproc_basic.get_feature_names_out()

array(['pipeline__age', 'pipeline__fare', 'pipeline__sibsp',
       'pipeline__parch', 'onehotencoder__pclass_1',
       'onehotencoder__pclass_2', 'onehotencoder__pclass_3',
       'onehotencoder__sex_female', 'onehotencoder__sex_male',
       'onehotencoder__embarked_C', 'onehotencoder__embarked_Q',
       'onehotencoder__embarked_S', 'onehotencoder__embarked_nan',
       'onehotencoder__class_First', 'onehotencoder__class_Second',
       'onehotencoder__class_Third',
       'onehotencoder__embark_town_Cherbourg',
       'onehotencoder__embark_town_Queenstown',
       'onehotencoder__embark_town_Southampton',
       'onehotencoder__embark_town_nan', 'onehotencoder__alone_False',
       'onehotencoder__alone_True'], dtype=object)

<details>
    <summary>👩🏻‍🏫 <i>Pipeline</i> vs. <i>make_pipeline</i></summary>

* When you create a Pipeline with `Pipeline()`, you have to:
    - specify all the ***sequential steps of the pipeline*** in a list
    - each step is a tuple with:
        - "name_of_the_step"
        - official Scikit-Learn name of the step
    
```python
Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
```
  
* When you create a Pipeline with `make_pipeline()`,
    - you don't have give a name to each step
    - you can simply chain all the steps together using their official Scikit-Learn name
    - the names of the steps are automatically induced by `make_pipeline`
    
```python
make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
```
    
</details>

<details>
    <summary>👩🏻‍🏫 <i>ColumnTransformer</i> vs. <i>make_column_transformer</i></summary>

* When you create a ColumnTransformer with `ColumnTransformer()`, you have to:
    - specify all the ***parallel steps of the columns' transformer*** in a list
    - each step is a tuple with:
        - "name_of_the_transformer"
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)
])
```
  
* When you create a ColumnTransformer with `make_column_transformer()`,
    - you don't have give a name to each parallel step
    - each step is a tuple with:
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features)
)
```
    
</details>

❓ Chain this preprocessing pipeline with a classifier and optimize it ❓

In [14]:
pd.DataFrame(preproc_basic.fit_transform(X_train), columns=preproc_basic.get_feature_names_out())

Unnamed: 0,pipeline__age,pipeline__fare,pipeline__sibsp,pipeline__parch,onehotencoder__pclass_1,onehotencoder__pclass_2,onehotencoder__pclass_3,onehotencoder__sex_female,onehotencoder__sex_male,onehotencoder__embarked_C,...,onehotencoder__embarked_nan,onehotencoder__class_First,onehotencoder__class_Second,onehotencoder__class_Third,onehotencoder__embark_town_Cherbourg,onehotencoder__embark_town_Queenstown,onehotencoder__embark_town_Southampton,onehotencoder__embark_town_nan,onehotencoder__alone_False,onehotencoder__alone_True
0,0.665633,3.734148,-0.461146,-0.476122,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.000000,-0.489619,-0.461146,-0.476122,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.139094,-0.483853,-0.461146,-0.476122,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.000000,-0.489619,-0.461146,-0.476122,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.833659,-0.486816,-0.461146,-0.476122,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0.000000,-0.486816,-0.461146,-0.476122,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
619,-0.596929,-0.489138,-0.461146,-0.476122,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
620,0.428903,-0.486816,-0.461146,-0.476122,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
621,-0.675839,-0.488658,-0.461146,-0.476122,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


❓ What are the best params and the best score ❓

In [39]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'fare', 'sibsp', 'parch']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['pclass', 'sex', 'embarked', 'class',
                                     'embark_town', 'alone'])])),
  ('svc', SVC())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer()),
                                                  ('standardscaler',
                      

In [40]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the Pipeline, as far back as you want
        'svc__C': [0.1,1, 10, 100],
        'svc__gamma': [1,0.1,0.01,0.001],
        'svc__kernel': ['rbf', 'poly', 'sigmoid']
    },
    cv=5,
    scoring="accuracy")

grid_search.fit(X_train, y_train)

grid_search.best_params_

{'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'poly'}

In [42]:
# pipe_tuned = grid_search.best_estimator_
pipe_tuned.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'fare', 'sibsp', 'parch']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['pclass', 'sex', 'embarked', 'class',
                                     'embark_town', 'alone'])])),
  ('svc', SVC(C=1, gamma=0.1, kernel='poly'))],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer()),
                                                  ('standardsc

In [43]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
cross_val_score(pipe_tuned, X_train, y_train, cv=5, scoring='accuracy').mean()

0.8251225806451613

In [44]:
pipe_tuned.fit(X_train, y_train)

In [45]:
pipe_tuned.predict(X_test)
pipe_tuned.score(X_test, y_test)

0.8395522388059702

## (3) How could we design a Custom Encoder to keep track of the columns' names?

In [None]:
# By default, OneHotEncoder works with Numpy and loses track of columns' names...
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

In [None]:
# ... however, we can access the one-hot-encoded names as follows
ohe.get_feature_names_out()

❓ Try to create your own OneHotEncoder so that it preserves the columns names ❓

In [123]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OneHotEncoder
import numpy as np

class MyCustomEncoder(OneHotEncoder):
    # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
    # TransformerMixin creates the fit_transform() method from fit() and transform()
    
    def transform(self, X, y=None):
        return pd.DataFrame(X).column_names

In [127]:
mce = MyCustomEncoder()
mce.fit(X_train)
mce.get_feature_names_out()
# mce.transform(X_train)

array(['pclass_1', 'pclass_2', 'pclass_3', 'sex_female', 'sex_male',
       'age_0.42', 'age_0.67', 'age_0.75', 'age_0.92', 'age_1.0',
       'age_2.0', 'age_3.0', 'age_4.0', 'age_5.0', 'age_6.0', 'age_7.0',
       'age_8.0', 'age_9.0', 'age_10.0', 'age_11.0', 'age_12.0',
       'age_13.0', 'age_14.0', 'age_15.0', 'age_16.0', 'age_17.0',
       'age_18.0', 'age_19.0', 'age_20.0', 'age_20.5', 'age_21.0',
       'age_22.0', 'age_23.0', 'age_23.5', 'age_24.0', 'age_24.5',
       'age_25.0', 'age_26.0', 'age_27.0', 'age_28.0', 'age_28.5',
       'age_29.0', 'age_30.0', 'age_30.5', 'age_31.0', 'age_32.0',
       'age_32.5', 'age_33.0', 'age_34.0', 'age_34.5', 'age_35.0',
       'age_36.0', 'age_37.0', 'age_38.0', 'age_39.0', 'age_40.0',
       'age_40.5', 'age_41.0', 'age_42.0', 'age_43.0', 'age_44.0',
       'age_45.0', 'age_45.5', 'age_46.0', 'age_47.0', 'age_48.0',
       'age_49.0', 'age_50.0', 'age_51.0', 'age_52.0', 'age_54.0',
       'age_55.0', 'age_55.5', 'age_56.0', 'age_57.0', 'a

🏁 If you want to build a very advanced pipeline, feel free to explore the Optional Challenge dealing the `cars dataset` !

💾 Don't forget to git add/commit/push your notebook.

👏 Congratulations, you are now a master at Pipeline and ColumnTransformer.