In [1]:
import os 
%pwd

'd:\\ML_Deployment\\Automadata\\research'

In [2]:
# Move from research/ up to the project root so relative paths resolve there.
# Guarded on the directory name so that re-running this cell (or starting the
# kernel from the project root) does not climb one level too far.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [3]:
%pwd

'd:\\ML_Deployment\\Automadata'

In [4]:
import numpy as np

In [5]:
from dataclasses import dataclass
from pathlib import Path
from src.logging import logger
import pandas as pd
from src.constants import *
from src.utils.common import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder

In [6]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory that receives the train.npy / test.npy artifacts.
    root_dir: Path
    # CSV file read as the stage input.
    data_path: Path
    # Directory into which "preprocessor.joblib" is written.
    preprocessor_obj_file_path: str
    # Name of the column separated from the features as the target.
    target_column: str

In [7]:
class ConfigManager:
    """Reads the config, params and schema YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Load in a fixed order: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create its output directory.

        Returns:
            A DataTransformationConfig populated from the ``data_transformation``
            section of the config and the ``TARGET_COLUMN`` entry of the schema.
        """
        stage_section = self.config.data_transformation
        target_entry = self.schema.TARGET_COLUMN

        create_directories([stage_section.root_dir])

        return DataTransformationConfig(
            root_dir=stage_section.root_dir,
            data_path=stage_section.data_path,
            preprocessor_obj_file_path=stage_section.preprocessor_obj_file_path,
            target_column=target_entry.name,
        )

In [9]:
class DataTransformation:
    """Feature engineering, train/test split and preprocessing for the trip data.

    The stage reads the CSV named in the config, derives calendar and
    time-of-day features, fits a scikit-learn preprocessor on the training
    split and stores the preprocessor plus the transformed arrays on disk.
    """

    # Timestamp layout of the pickup/dropoff columns, e.g. "03/25/2017 08:55:43 AM".
    _DATETIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'

    # Raw columns that are removed once the engineered features exist.
    _DROP_COLUMNS = ['tpep_pickup_datetime', 'tpep_dropoff_datetime',
                     'payment_type', 'trip_distance', 'store_and_fwd_flag',
                     'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                     'improvement_surcharge', 'total_amount', 'tip_percent']

    # Identifier-like columns that are treated as categories, not numbers.
    _COLUMNS_TO_STR = ['RatecodeID', 'VendorID', 'DOLocationID', 'PULocationID']

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Settings providing ``data_path``, ``root_dir``,
                ``preprocessor_obj_file_path`` and ``target_column``.
        """
        self.config = config

    def generate_features(self, input_data: pd.DataFrame) -> pd.DataFrame:
        """Derive calendar and time-of-day features from the pickup timestamp.

        Args:
            input_data: Raw trip records. Must contain the pickup/dropoff
                timestamp columns, the ID columns converted to strings and
                every column listed for removal.

        Returns:
            A new DataFrame with ``month``, ``day``, ``am_rush``, ``day_time``,
            ``pm_rush`` and ``night_time`` added, the ID columns cast to
            strings and the redundant raw columns dropped. ``input_data``
            itself is not modified.

        Raises:
            ValueError: If a timestamp does not match the expected format.
            KeyError: If a required column is missing.
        """
        # Work on a copy so the caller's frame is left untouched.
        features = input_data.copy()

        # Convert pickup and dropoff cols to datetime
        for col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
            features[col] = pd.to_datetime(features[col], format=self._DATETIME_FORMAT)

        pickup = features['tpep_pickup_datetime']
        # Abbreviated month name and weekday name, both lower-cased.
        features['month'] = pickup.dt.strftime('%b').str.lower()
        features['day'] = pickup.dt.day_name().str.lower()

        # Every time-of-day flag is derived from the pickup hour itself.
        # Deriving them from an already-binarised column would make
        # day_time/pm_rush constant 0 and night_time constant 1.
        hour = pickup.dt.hour
        features['am_rush'] = ((hour >= 6) & (hour < 10)).astype(int)
        features['day_time'] = ((hour >= 10) & (hour < 16)).astype(int)
        features['pm_rush'] = ((hour >= 16) & (hour < 20)).astype(int)
        features['night_time'] = ((hour >= 20) | (hour < 6)).astype(int)

        # Convert categorical ID features to string.
        # NOTE: astype('str') turns missing values into the literal 'nan'.
        for col in self._COLUMNS_TO_STR:
            features[col] = features[col].astype('str')

        # Drop redundant columns
        return features.drop(columns=self._DROP_COLUMNS)

    def get_data_transformer_object(self):
        """Build the unfitted preprocessing ColumnTransformer.

        Numerical columns are median-imputed and standardised. Categorical
        columns are imputed with the most frequent value, one-hot encoded
        (unknown categories ignored) and scaled without centering.

        Returns:
            An unfitted ``ColumnTransformer`` combining both pipelines.
        """
        numerical_columns = ['passenger_count', 'mean_duration', 'mean_distance', 'predicted_fare',
                             'am_rush', 'day_time', 'pm_rush', 'night_time']
        categorical_columns = ['VendorID', 'RatecodeID', 'day',
                               'month', 'PULocationID', 'DOLocationID']

        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', categories='auto')),
                # with_mean=False because the one-hot output may be sparse.
                ('scaler', StandardScaler(with_mean=False)),
            ]
        )

        return ColumnTransformer(
            [
                ('num_pipeline', num_pipeline, numerical_columns),
                ('cat_pipeline', cat_pipeline, categorical_columns),
            ]
        )

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense NumPy array, whether it is sparse or dense."""
        # ColumnTransformer may return either a sparse matrix or an ndarray,
        # so only call .toarray() when it exists.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    def data_splitting_preprocessing(self, test_size: float = 0.25, random_state: int = 0) -> None:
        """Split the data, fit the preprocessor and persist all artifacts.

        Reads the CSV at ``config.data_path``, engineers features, drops rows
        with missing values, splits into train/test, fits the preprocessor on
        the training features only and writes ``preprocessor.joblib``,
        ``train.npy`` and ``test.npy``. In the saved arrays the target is the
        last column.

        Args:
            test_size: Fraction of rows assigned to the test split.
            random_state: Seed passed to ``train_test_split``.
        """
        data = pd.read_csv(self.config.data_path)
        data_clean = self.generate_features(data).dropna()

        # split train and test data
        train, test = train_test_split(data_clean, test_size=test_size, random_state=random_state)

        target = self.config.target_column
        train_x = train.drop(columns=[target])
        train_y = train[[target]]
        test_x = test.drop(columns=[target])
        test_y = test[[target]]

        # Fit on the training split only to avoid leaking test statistics.
        preprocessor_obj = self.get_data_transformer_object()
        X_train_arr = self._to_dense(preprocessor_obj.fit_transform(train_x))
        X_test_arr = self._to_dense(preprocessor_obj.transform(test_x))

        # Append the target as the final column of each array.
        train_arr = np.c_[X_train_arr, train_y.to_numpy()]
        test_arr = np.c_[X_test_arr, test_y.to_numpy()]

        logger.info(f"save preprocessor object to {self.config.preprocessor_obj_file_path}")
        save_bin(object=preprocessor_obj, path =os.path.join(self.config.preprocessor_obj_file_path, "preprocessor.joblib"))

        # Save the NumPy arrays to .npy files
        np.save(os.path.join(self.config.root_dir, "train.npy"), train_arr)
        np.save(os.path.join(self.config.root_dir, "test.npy"), test_arr)
        logger.info("Preprocessed train and test data are saved ")
        logger.info(f"train data shape : {train_arr.shape}")
        logger.info(f"test data shape : {test_arr.shape}")
        

    
    
        

In [10]:
# Run the data-transformation stage end to end. Exceptions are left to
# propagate unchanged: wrapping the calls in try/except only to re-raise
# handled nothing and added a frame to the traceback.
config = ConfigManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.data_splitting_preprocessing()

[2024-01-13 20:43:06,361: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-13 20:43:06,364: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-13 20:43:06,367: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-01-13 20:43:06,367: INFO: common: created directory at: artifacts]
[2024-01-13 20:43:06,367: INFO: common: created directory at: artifacts/data_transformation]
[2024-01-13 20:43:06,659: INFO: 793293800: save preprocessor object to artifacts/data_transformation]
[2024-01-13 20:43:06,664: INFO: common: binary file saved at: artifacts/data_transformation\preprocessor.joblib]
[2024-01-13 20:43:06,664: INFO: 793293800: Preprocessed train and test data are saved ]
[2024-01-13 20:43:06,690: INFO: 793293800: train data shape : (10875, 342)]
[2024-01-13 20:43:06,691: INFO: 793293800: test data shape : (3626, 342)]
