In [62]:
import sklearn
import sklearn.preprocessing as preprocessing
import sklearn.impute as impute
import sklearn.compose as compose
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Global scikit-learn option: have transformers return pandas DataFrames
# (with column names) from transform()/fit_transform() instead of bare arrays.
sklearn.set_config(transform_output="pandas")

In [30]:
# Preprocessing [DataFrame]: one-hot encode a categorical column with pandas.
# Note: a missing value (np.nan) does not get an indicator column of its own;
# that row simply ends up with 0 in every generated "Sex_*" column.
names = ["Ramy", "Samy", "Fadia", "Ameer"]
sexes = ["Male", "Male", "Female", np.nan]
test_data = pd.DataFrame({"Name": names, "Sex": sexes})

# Only "Sex" is expanded into indicator columns; "Name" is passed through as-is.
pd.get_dummies(test_data, columns=["Sex"])

Unnamed: 0,Name,Sex_Female,Sex_Male
0,Ramy,0,1
1,Samy,0,1
2,Fadia,1,0
3,Ameer,0,0


In [22]:
# Pre-processing [SkLearn]: OrdinalEncoder maps each category to an index (0, 1, 2, ...)
colors = ["A", "A", "B", "C"]
test_data = pd.DataFrame({"Color": colors})

# fit() learns the categories of each column and returns the encoder itself,
# so construction and fitting can be chained.
encoder = preprocessing.OrdinalEncoder().fit(test_data)
test_data_tr = encoder.transform(test_data)

# categories_ holds one array per input column; an encoded value is the
# position of the original category inside that array.
print(f"Categories: \n\t {encoder.categories_}")
print(f"Transformed: \n {test_data_tr}")

Categories: 
	 [array(['A', 'B', 'C'], dtype=object)]
Transformed: 
    Color
0    0.0
1    0.0
2    1.0
3    2.0


In [43]:
# Pre-processing [SkLearn-Impute]: imputers fill in missing values using one of
# the strategies "constant", "mean", "median" or "most_frequent".
test_data = pd.DataFrame({
    "Age" : [10, 20, 20, 40, 50, 60, np.nan]
})

# No fill_value is passed here: SimpleImputer only uses fill_value when
# strategy="constant"; with any other strategy it is silently ignored.
# With "most_frequent" the missing entry is replaced by the column's mode (20).
imputer = impute.SimpleImputer(strategy="most_frequent")
imputer.fit(test_data)
test_data_tr = imputer.transform(test_data)

# statistics_ holds the learned fill value for each column.
print(f"Statistics: \n\t {imputer.statistics_}")
print(f"Transformed: \n {test_data_tr}")

Statistics: 
	 [20.]
Transformed: 
     Age
0  10.0
1  20.0
2  20.0
3  40.0
4  50.0
5  60.0
6  20.0


In [67]:
# Sklearn-Pipeline: apply an encoder and an imputer to different columns via a
# ColumnTransformer, wrapped in a single-step Pipeline.

test_data = pd.DataFrame({
    "Title": ["Developer", "Developer", "QA", "BA", np.nan],
    "Age": [10, 20, 20, np.nan, np.nan],
})

# Missing titles are encoded as -1 rather than being left as NaN.
title_encoder = preprocessing.OrdinalEncoder(encoded_missing_value=-1)
# Missing ages are replaced by the mean of the observed ages.
age_imputer = impute.SimpleImputer(strategy="mean")

# Each entry is (step name, transformer, columns it applies to); the step name
# becomes the prefix of the output column ("encoder__Title", "imputer__Age").
transformers = [
    ("encoder", title_encoder, ["Title"]),
    ("imputer", age_imputer, ["Age"]),
]
column_transformer = compose.ColumnTransformer(transformers=transformers)

pipeline = Pipeline([("transformers", column_transformer)])
pipeline.fit_transform(test_data)

Unnamed: 0,encoder__Title,imputer__Age
0,1.0,10.0
1,1.0,20.0
2,2.0,20.0
3,0.0,16.666667
4,-1.0,16.666667
