In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
import pandas as pd # to use .mean()

# we will define 3 methods for the class: __init__, fit and transform
# fit_transform() is inherited from TransformerMixin; BaseEstimator supplies get_params()/set_params()

# define your transformer name and as an argument inherit the base classes
class MyCustomTransformerForMeanImputation(BaseEstimator, TransformerMixin):
  """Replace missing values in the selected columns with the column mean.

  The means are learned in ``fit`` and re-used in ``transform``, so the same
  values can be applied to the train set, the test set or real-time data.

  Parameters
  ----------
  variables : str or list of str
      Column name(s) to impute. A single name is wrapped in a list.

  Attributes
  ----------
  imputer_dict_ : dict
      Set by ``fit``; maps each column in ``variables`` to its learned mean.
  """

  #### here you define the variables you need to parse when you initialize the class
  def __init__(self, variables):
    # we make sure the variable will be a list, even if only one element
    self.variables = variables if isinstance(variables, list) else [variables]

  #### here is where the learning happens. We perform the operation we are interested in,
  #### in this case, calculate the mean
  def fit(self, X, y=None):
    """Compute and store the mean of every column listed in ``variables``.

    ``y`` is accepted only for scikit-learn API compatibility and is ignored.
    Returns ``self`` so calls can be chained.
    """
    # keep the mean of each variable in a dictionary keyed by column name
    self.imputer_dict_ = {feature: X[feature].mean() for feature in self.variables}
    return self

  #### here you transform the variables based on what you learned in the .fit()
  #### You can transform into the train set, test set or real-time data
  def transform(self, X):
    """Return a copy of ``X`` with NaN in each learned column filled by its mean.

    The input frame is left untouched.
    """
    # work on a copy: a transformer should not modify the caller's data
    X = X.copy()

    # assign the filled column back instead of `X[feature].fillna(..., inplace=True)`;
    # the chained in-place form acts on a temporary and is a no-op under pandas copy-on-write
    for feature in self.variables:
      X[feature] = X[feature].fillna(self.imputer_dict_[feature])

    return X

In [4]:
# The comments relate to the new concepts for this exercise

class ConvertUpperCase(BaseEstimator, TransformerMixin):
  """Upper-case the string values of the selected text columns.

  Parameters
  ----------
  variables : str or list of str
      Column name(s) to convert. A single name is wrapped in a list.
  """

  def __init__(self, variables):
    # always store a list, even if only one column name is given
    self.variables = variables if isinstance(variables, list) else [variables]

  # we don't need to learn anything here, we just return self
  # we need to do that anyway to be compatible with scikit-learn format
  def fit(self, X, y=None):
    """Nothing to learn; return ``self`` unchanged."""
    return self

  # here we convert the variables using a method called .upper()
  # We loop over all the variables, check if the column holds text, then map a function...
  # ...that applies .upper() to every string value
  def transform(self, X):
    """Return a copy of ``X`` with the text columns in ``variables`` upper-cased.

    Values that are not ``str`` (for example NaN) are passed through unchanged.
    A column that is neither object nor string dtype is left as is and a
    warning is printed. The input frame is left untouched.
    """
    # work on a copy: a transformer should not modify the caller's data
    X = X.copy()

    for feature in self.variables:
      column = X[feature]
      # accept classic object columns as well as pandas string dtypes
      if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # guard with isinstance so missing values do not raise AttributeError
        X[feature] = column.map(
            lambda value: value.upper() if isinstance(value, str) else value
        )
      else:
        print(f"Warning: {feature} data type should be object to use ConvertUpperCase()")

    return X

In [5]:
# load the penguins example dataset shipped with seaborn
df = sns.load_dataset("penguins")

# number of missing values in each column
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# preview the first rows of the raw data
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [7]:
from feature_engine.imputation import CategoricalImputer

# Three steps, applied in order:
#   1. fill NaN in the numeric measurements with the column mean
#   2. fill NaN in `sex` with the label 'Missing'
#   3. upper-case the values of `sex`
pipeline = Pipeline([
      ( 'custom_transf', MyCustomTransformerForMeanImputation(variables=['bill_length_mm',
                                                                         'bill_depth_mm',
                                                                         'flipper_length_mm',
                                                                         'body_mass_g'] )),

      ( 'categorical_imputer', CategoricalImputer(imputation_method='missing',
                                                  fill_value='Missing',
                                                  variables=['sex']) ),

      ('upper_case' , ConvertUpperCase(variables=['sex'])),
])

# fit on a copy so the raw `df` is never modified by a pipeline step
# and the outputs of the earlier cells stay valid on re-run
df_transformed = pipeline.fit_transform(df.copy())
df_transformed.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MISSING
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [8]:
# count the missing values left in each column of the pipeline output
df_transformed.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [9]:
# column means of the four numeric measurements (pandas .mean() skips NaN by default)
df[['bill_length_mm' , 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']].mean()

bill_length_mm         43.921930
bill_depth_mm          17.151170
flipper_length_mm     200.915205
body_mass_g          4201.754386
dtype: float64

In [10]:
# means stored by the fitted 'custom_transf' step, keyed by column name
pipeline['custom_transf'].imputer_dict_

{'bill_length_mm': 43.9219298245614,
 'bill_depth_mm': 17.151169590643278,
 'flipper_length_mm': 200.91520467836258,
 'body_mass_g': 4201.754385964912}