### Creating Preprocessing classes for model pipeline

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

#### TypeSelector for processing columns of a certain type

In [2]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype
        Passed, wrapped in a one-element list, as the ``include`` argument
        of ``DataFrame.select_dtypes``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the subset of ``X``'s columns matching ``self.dtype``.

        ``X`` must be a ``pandas.DataFrame`` (checked with ``assert``).
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)

#### FeatureLogTransform for applying log(1 + x) to features that are heavily skewed with outliers

In [3]:
class FeatureLogTransform(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``np.log1p`` to a whole DataFrame."""

    def __init__(self):
        """There is nothing to configure."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def get_features(self, X):
        """Return the column index of ``X``."""
        return X.columns

    def transform(self, X):
        """Return ``log1p`` of every value in ``X``.

        ``X`` must be a ``pandas.DataFrame`` (checked with ``assert``).
        """
        assert isinstance(X, pd.DataFrame)
        logged = np.log1p(X)
        return logged

#### ColumnSelector for the initial selection of columns for processing

In [4]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns
        Column label or list of labels, used as ``X[columns]``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        ``X`` must be a ``pandas.DataFrame`` (checked with ``assert``).

        Raises
        ------
        KeyError
            If the lookup fails; the message lists the requested labels
            that are absent from ``X``, in the order they were requested.
        """
        assert isinstance(X, pd.DataFrame)
        try:
            return X[self.columns]
        except KeyError as err:
            missing = self._missing_columns(X)
            raise KeyError(
                "The DataFrame does not include the columns: %s" % missing
            ) from err

    def _missing_columns(self, X):
        """Return the requested labels not present in ``X``, de-duplicated, in request order."""
        requested = self.columns
        # A str/bytes/tuple or non-iterable value is a single column label;
        # iterating it would break a string into individual characters.
        if isinstance(requested, (str, bytes, tuple)) or not hasattr(requested, "__iter__"):
            requested = [requested]
        available = set(X.columns)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(col for col in requested if col not in available))

#### Column dropper for dropping any required columns

In [68]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns
        Column label or list of labels, passed to ``X.drop(..., axis=1)``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without ``self.columns``.

        ``X`` must be a ``pandas.DataFrame`` (checked with ``assert``).

        Raises
        ------
        KeyError
            If the drop fails; the message lists the requested labels
            that are absent from ``X``, in the order they were requested.
        """
        assert isinstance(X, pd.DataFrame)
        try:
            return X.drop(self.columns, axis=1)
        except KeyError as err:
            missing = self._missing_columns(X)
            raise KeyError(
                "The DataFrame does not include the columns: %s" % missing
            ) from err

    def _missing_columns(self, X):
        """Return the requested labels not present in ``X``, de-duplicated, in request order."""
        requested = self.columns
        # A str/bytes/tuple or non-iterable value is a single column label;
        # iterating it would break a string into individual characters.
        if isinstance(requested, (str, bytes, tuple)) or not hasattr(requested, "__iter__"):
            requested = [requested]
        available = set(X.columns)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(col for col in requested if col not in available))

#### Column aggregator (Specific for compressing transportation figures)

**Example**
>`walk_to_work` & `walk_from_work` variables ---> `walk_work` variable (figures are summed)
>`cardriver_to_friends_nhb` & `cardriver_from_friends_nhb` ---> `cardriver_friends_nhb` (figures are summed)

In [5]:
class TransportAggregate(BaseEstimator, TransformerMixin):
    """Collapse paired transport columns of a DataFrame into summed columns.

    The transformation works purely on column position and on the
    ``_``-separated tokens of the column names:

    1. Columns are taken two at a time, in order, and each pair is summed
       into a new column named ``<first token>_<last token>`` of the pair's
       first column (``<first token>_<second-to-last token>_nhb`` when the
       last token is ``nhb``). The paired source columns are then dropped.
    2. Each remaining column is compared with the column 8 positions after
       it; when both names share the same first token, the later column is
       added into the earlier one and then dropped.
    """

    def __init__(self):
        """There is nothing to configure."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an aggregated copy of ``X``; ``X`` itself is not modified.

        ``X`` must be a ``pandas.DataFrame`` (checked with ``assert``).
        """
        assert isinstance(X, pd.DataFrame)
        frame = X.copy()

        # Pass 1: sum consecutive column pairs. The column count is read once
        # up front, so the columns appended inside the loop are not revisited.
        paired_sources = []
        for start in range(0, frame.shape[1], 2):
            pair = frame.iloc[:, start:start + 2]
            paired_sources.extend(pair.columns)
            tokens = pair.columns[0].split("_")
            if tokens[-1] == 'nhb':
                suffix = f'{tokens[-2]}_{tokens[-1]}'
            else:
                suffix = tokens[-1]
            frame[f'{tokens[0]}_{suffix}'] = pair.sum(axis=1)
        frame = frame.drop(paired_sources, axis=1)

        # Pass 2: fold the column 8 positions later into the current one when
        # both start with the same token.
        # NOTE(review): the offset 8 presumably reflects the column layout of
        # the expected input -- confirm against the data.
        labels = list(frame.columns)
        folded = []
        for left, right in zip(labels, labels[8:]):
            if left.split("_")[0] != right.split("_")[0]:
                continue
            frame[left] = frame[left] + frame[right]
            folded.append(right)

        return frame.drop(folded, axis=1)