# Re-create your own _One Hot Encoder_ 

In [1]:
import pandas as pd
import seaborn as sns

In [48]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [19]:
from sklearn import set_config; set_config(display = "diagram")

## (1) The Titanic Dataset

In [2]:
# Loading 100% of the dataset. 
# Choose 0.5 to load only 50% of the rows randomly

data = sns.load_dataset('titanic').sample(frac = 1) 
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
186,1,3,female,,1,0,15.5,Q,Third,woman,False,,Queenstown,yes,False
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
880,1,2,female,25.0,0,1,26.0,S,Second,woman,False,,Southampton,yes,False
388,0,3,male,,0,0,7.7292,Q,Third,man,True,,Queenstown,no,True
517,0,3,male,,0,0,24.15,Q,Third,man,True,,Queenstown,no,True


In [3]:
from sklearn.model_selection import train_test_split

X = data.drop(columns = ['survived', 'alive', 'who', 'adult_male'])
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [4]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
624,3,male,21.0,0,0,16.1000,S,Third,,Southampton,True
485,3,female,,3,1,25.4667,S,Third,,Southampton,False
366,1,female,60.0,1,0,75.2500,C,First,D,Cherbourg,False
206,3,male,32.0,1,0,15.8500,S,Third,,Southampton,False
588,3,male,22.0,0,0,8.0500,S,Third,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...
767,3,female,30.5,0,0,7.7500,Q,Third,,Queenstown,True
639,3,male,,1,0,16.1000,S,Third,,Southampton,False
578,3,female,,1,0,14.4583,C,Third,,Cherbourg,False
669,1,female,,1,0,52.0000,S,First,C,Southampton,False


## (2) A first pipeline

‚ùì Create a basic Pipeline which ***encodes categorical features*** and ***scales numerical features*** ‚ùì

üí° Use [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) and [`make_column_transformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html)

In [12]:
X_train.isna().sum()

pclass           0
sex              0
age            125
sibsp            0
parch            0
fare             0
embarked         2
class            0
deck           479
embark_town      2
alone            0
dtype: int64

In [27]:
num_features = ['age','fare','sibsp','parch']
cat_features = ['pclass','sex','embarked','class','embark_town','alone']

In [22]:
num_pipeline = Pipeline(
    [
        #(id , transformer object )
        ('imputer', SimpleImputer(strategy='median') ),
        ('scaler', RobustScaler() )
    ]
)
num_pipeline  

In [33]:
num_pipeline.fit(X_train[num_features])
num_pipeline.transform(X_train[num_features])

array([[-5.38461538e-01,  7.03083512e-02,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  4.71451820e-01,  3.00000000e+00,
         1.00000000e+00],
       [ 2.46153846e+00,  2.60349893e+00,  1.00000000e+00,
         0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  1.60778158e+00,  1.00000000e+00,
         0.00000000e+00],
       [-1.03846154e+00, -1.75588865e-04,  1.00000000e+00,
         0.00000000e+00]])

In [35]:
cat_pipeline = Pipeline(
    [
        #(id , transformer object )
        ('imputer', SimpleImputer(strategy='most_frequent') ),
        ('encoder', OneHotEncoder(sparse=False) )
    ]
)
cat_pipeline

In [39]:
cat_pipeline.fit(X_train[cat_features])
cat_pipeline.transform(X_train[cat_features])

array([[0., 0., 1., ..., 1., 0., 1.],
       [0., 0., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [40]:
preprocessing = ColumnTransformer(
    [
        #(name, transformer obect, columns)
        ('num_pipeline', num_pipeline, num_features ),
        ('cat_pipeline', cat_pipeline, cat_features )
    ]
)
preprocessing

In [41]:
preprocessing.fit(X_train)
preprocessing.transform(X_train)

array([[-5.38461538e-01,  7.03083512e-02,  0.00000000e+00, ...,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 0.00000000e+00,  4.71451820e-01,  3.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.46153846e+00,  2.60349893e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.60778158e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.03846154e+00, -1.75588865e-04,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

In [30]:
pipeline = Pipeline(
 [ 
     ('preprocessing', preprocessing),
     ('classifier', LogisticRegression())
 ]
)
pipeline

In [44]:
pipeline.fit(X_train, y_train)
pipeline.predict(X_train)

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,

<details>
    <summary>üë©üèª‚Äçüè´ <i>Pipeline</i> vs. <i>make_pipeline</i></summary>

* When you create a Pipeline with `Pipeline()`, you have to:
    - specify all the ***sequential steps of the pipeline*** in a list
    - each step is a tuple with:
        - "name_of_the_step"
        - official Scikit-Learn name of the step
    
```python
Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
```
  
* When you create a Pipeline with `make_pipeline()`,
    - you don't have give a name to each step
    - you can simply chain all the steps together using their official Scikit-Learn name
    - the names of the steps are automatically induced by `make_pipeline`
    
```python
make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
```
    
</details>

<details>
    <summary>üë©üèª‚Äçüè´ <i>ColumnTransformer</i> vs. <i>make_column_transformer</i></summary>

* When you create a ColumnTransformer with `ColumnTransformer()`, you have to:
    - specify all the ***parallel steps of the columns' transformer*** in a list
    - each step is a tuple with:
        - "name_of_the_transformer"
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)
])
```
  
* When you create a ColumnTransformer with `make_column_transformer()`,
    - you don't have give a name to each parallel step
    - each step is a tuple with:
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features)
)
```
    
</details>

‚ùì Chain this preprocessing pipeline with a classifier and optimize it ‚ùì

In [47]:
# YOUR CODE HERE

# train set
print(f"train sccuracy: {pipeline.score(X_train, y_train)}")

# test set
print(f"test sccuracy: {pipeline.score(X_test, y_test)}")



train sccuracy: 0.8089887640449438
test sccuracy: 0.7947761194029851


‚ùì What are the best params and the best score ‚ùì

In [50]:
cross_val_score(pipeline, X, y, ).mean()

0.8013746783001696

In [None]:
# YOUR CODE HERE

## (3) How could we design a Custom Encoder to keep track of the columns' names?

In [51]:
# By default, OneHotEncoder works with Numpy and loses track of columns' names...
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [52]:
# ... however, we can access the one-hot-encoded names as follows
ohe.get_feature_names_out()

array(['sex_female', 'sex_male'], dtype=object)

‚ùì Try to create your own OneHotEncoder so that it preserves the columns names ‚ùì

In [54]:
df = pd.DataFrame(ohe.fit_transform(X_train[['sex']]), 
                  columns=ohe.get_feature_names_out())
df

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
618,1.0,0.0
619,0.0,1.0
620,1.0,0.0
621,1.0,0.0


In [57]:
# YOUR CODE HERE
class CustomOneHotEncoder(OneHotEncoder):
    
    def transform(self, X):
        df = pd.DataFrame(  super().transform(X), 
                           columns=super().get_feature_names_out())
        return df


In [58]:
ohe = CustomOneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
618,1.0,0.0
619,0.0,1.0
620,1.0,0.0
621,1.0,0.0


In [62]:
cat_pipeline_2 = Pipeline(
    [
        #(id , transformer object )
        ('imputer', SimpleImputer(strategy='most_frequent') ),
        ('encoder', CustomOneHotEncoder(sparse=False) )
    ]
)
cat_pipeline_2

In [65]:
cat_pipeline_2.fit(X_train[cat_features])
cat_pipeline_2.transform(X_train[cat_features])

Unnamed: 0,x0_1,x0_2,x0_3,x1_female,x1_male,x2_C,x2_Q,x2_S,x3_First,x3_Second,x3_Third,x4_Cherbourg,x4_Queenstown,x4_Southampton,x5_False,x5_True
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
619,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
620,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
621,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [63]:
preprocessing = ColumnTransformer(
    [
        #(name, transformer obect, columns)
        ('num_pipeline', num_pipeline, num_features ),
        ('cat_pipeline', cat_pipeline_2, cat_features )
    ]
)
preprocessing

In [64]:
preprocessing.fit_transform(X_train)

array([[-5.38461538e-01,  7.03083512e-02,  0.00000000e+00, ...,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 0.00000000e+00,  4.71451820e-01,  3.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.46153846e+00,  2.60349893e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.60778158e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.03846154e+00, -1.75588865e-04,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

üèÅ If you want to build a very advanced pipeline, feel free to explore the Optional Challenge dealing the `cars dataset` !

üíæ Don't forget to git add/commit/push your notebook.

üëè Congratulations, you are now a master at Pipeline and ColumnTransformer.