## Import Libraries

In [2]:
import pandas as pd
import numpy as np 


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin

In [5]:
## Custom Transformer for Feature Engineering

class DateTimeFeatrueExtractor(BaseEstimator,TransformerMixin):
    """Derive trip features from the pickup/dropoff timestamp columns.

    ``transform`` requires the columns ``tpep_pickup_datetime`` and
    ``tpep_dropoff_datetime``. Both are parsed with ``pd.to_datetime`` and
    replaced by three derived columns:

    * ``trip_duration`` -- dropoff minus pickup, in minutes.
    * ``pickup_hour``   -- hour of the pickup timestamp.
    * ``pickup_day``    -- ``dt.dayofweek`` of the pickup timestamp.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns added.

        The two raw timestamp columns are dropped from the result. A
        ``KeyError`` is raised if either of them is missing from ``X``.
        """
        X = X.copy()

        pickup = pd.to_datetime(X['tpep_pickup_datetime'])
        dropoff = pd.to_datetime(X['tpep_dropoff_datetime'])

        # Duration is end minus start; the reverse order yields negative
        # minutes for every ordinary trip.
        X['trip_duration'] = (dropoff - pickup).dt.total_seconds() / 60
        X['pickup_hour'] = pickup.dt.hour
        X['pickup_day'] = pickup.dt.dayofweek

        return X.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])


## Custom Transformer for Outlier Capping (Numerical)

class OutlierCapper(BaseEstimator,TransformerMixin):
    """Clip every column to IQR-based bounds learned during ``fit``.

    For each column the bounds are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 0.25/0.75 quantiles of
    the fitting data. They are stored in ``lower_bounds_`` and
    ``upper_bounds_``, keyed by column label.

    Input may be a pandas DataFrame or any 2-D array-like. Array input is
    needed because earlier pipeline steps such as ``SimpleImputer`` hand
    over a NumPy array by default; its columns are keyed by position
    (0, 1, ...). ``transform`` returns a DataFrame for DataFrame input and a
    NumPy array otherwise.
    """

    def fit(self,X,y=None):
        """Learn the per-column lower/upper clipping bounds from ``X``."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        self.lower_bounds_ = {}
        self.upper_bounds_ = {}

        for col in frame.columns:
            q1 = frame[col].quantile(0.25)
            q3 = frame[col].quantile(0.75)
            iqr = q3 - q1

            self.lower_bounds_[col] = q1 - 1.5 * iqr
            self.upper_bounds_[col] = q3 + 1.5 * iqr

        return self

    def transform(self,X):
        """Return a clipped copy of ``X``; the input is never modified.

        Raises ``KeyError`` if ``X`` has a column that was not seen in
        ``fit``.
        """
        is_frame = isinstance(X, pd.DataFrame)
        # Always work on a copy so the caller's frame/array stays untouched.
        frame = (X if is_frame else pd.DataFrame(X)).copy()

        for col in frame.columns:
            frame[col] = np.clip(frame[col],
                                 self.lower_bounds_[col],
                                 self.upper_bounds_[col])

        # Hand back the same kind of container that was passed in.
        return frame if is_frame else frame.to_numpy()


## Function to Create Full Preprocessing Pipeline
def create_preproecssor(df):
    """Build the unfitted feature-engineering + preprocessing pipeline.

    Column roles are inferred from ``df``: ``int64``/``float64`` columns are
    treated as numerical and ``object`` columns as categorical. The target
    column ``total_amount`` is never used as a feature.

    The column lists describe the frame *after* the feature-engineering
    step, because that is what the ``ColumnTransformer`` receives. The raw
    timestamp columns are therefore excluded, and the engineered columns
    (``trip_duration``, ``pickup_hour``, ``pickup_day``) are added to the
    numerical branch.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample of the raw data, used only to read column names and dtypes.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'feature_engineering'`` then ``'preprocessing'``.
    """
    target_col = 'total_amount'
    # Consumed (and dropped) by the feature-engineering step.
    datetime_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
    # Produced by the feature-engineering step.
    engineered_cols = ['trip_duration', 'pickup_hour', 'pickup_day']

    excluded = {target_col, *datetime_cols}

    numerical_cols = [
        col
        for col in df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if col not in excluded
    ]
    categorical_cols = [
        col
        for col in df.select_dtypes(include=['object']).columns.tolist()
        if col not in excluded
    ]

    # Engineered features only exist after step one, so they can never be
    # discovered from the raw frame's dtypes; register them explicitly.
    numerical_cols.extend(
        col for col in engineered_cols if col not in numerical_cols
    )

    ## Numerical Pipeline
    numerical_pipeline = Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                                        ('OutlierCapper',OutlierCapper()),
                                        ('scaler',StandardScaler())])

    ## categorical Pipeline
    categorical_pipeline = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                                          ('encoder',OneHotEncoder(handle_unknown='ignore')),
                                          ])

    ## Column Transformer
    preprocessor = ColumnTransformer(transformers=[('num',numerical_pipeline,numerical_cols),
                                                  ('cat',categorical_pipeline,categorical_cols)])

    ## Full Pipeline
    full_pipeline = Pipeline(steps=[('feature_engineering',DateTimeFeatrueExtractor()),
                                   ('preprocessing',preprocessor)])
    return full_pipeline