In [7]:
!pwd
import os
os.chdir('../')
!pwd

/Users/Bingumalla Likith/Desktop/MLOPS/Project-2/research
/Users/Bingumalla Likith/Desktop/MLOPS/Project-2


### Modular Programming

In [264]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pickle

In [265]:
import pandas as pd 
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [266]:
from src.data_science.constants import *
from src.data_science import logger
from src.data_science.utils.common import read_yaml, create_directories

In [267]:
class ConfigurationManager:
    def __init__(self,
                config_path = CONFIG_FILE_PATH,
                params_path = PARAMS_FILE_PATH,
                schema_path = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        self.schema = read_yaml(schema_path)
        
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])

        data_validation_config = DataTransformationConfig(**config)
        return data_validation_config

In [268]:
from sklearn.model_selection import train_test_split

In [269]:
class DataTransformation:
    def __init__(self, config:DataTransformationConfig):
        self.config = config
    
    def train_test_splitting(self, data):
        train, test = train_test_split(data, test_size=0.2)
        train.save(os.path.join(self.config.root_dir, 'train.npy'))
        test.save(os.path.join(self.config.root_dir, 'test.npy'))

        logger.info("Splitted data into train and test sets")
        logger.info(f"Train data shape -> {train.shape}")
        logger.info(f"Test data shape -> {test.shape}")

        print(train.shape , test.shape)
    
    def transformation(self):
        data = pd.read_csv(self.config.data_path)
        numerical_columns = data.select_dtypes(include='float64').columns.to_list()
        categorical_columns = data.select_dtypes(include='object').columns.to_list()

        numerical_columns.remove('Trip_Price') ## Target variable
        #Based on observation, Passenger count has discrete values that can be encoded as a one-hot-vector
        numerical_columns.remove('Passenger_Count')
        categorical_columns.append('Passenger_Count')

        data.dropna(subset=['Trip_Price'], inplace=True) # Remove rows with null output
        categorical_pipe = Pipeline([
            ('imputer' , SimpleImputer(strategy='most_frequent')),
            ('encoder' , OneHotEncoder(sparse_output=False))
        ])

        numerical_pipe = Pipeline([
            ('imputer' , SimpleImputer(strategy='mean')),
            ('scaler' , StandardScaler())
        ])

        pipeline = ColumnTransformer([
            ('column-pipeline' , categorical_pipe , categorical_columns),
            ('numerical-pipeline' , numerical_pipe , numerical_columns)
        ])

        pipeline.fit_transform(data)
        self.train_test_split(data)

        with open(os.path.join(self.config.root_dir, 'pipeline.pkl'), 'wb') as file:
            pickle.dump(pipeline, file)

        logger.info(f'Saved the pipeline at {self.config.root_dir} ✅')

In [270]:
try: 
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)

    data_transformation.transformation()
except Exception as e:
    raise e

[2025-01-05 15:51:36,334 : INFO : common : Yaml file : config/config.yaml loaded successfully !!]
[2025-01-05 15:51:36,339 : INFO : common : Yaml file : params.yaml loaded successfully !!]
[2025-01-05 15:51:36,343 : INFO : common : Yaml file : schema.yaml loaded successfully !!]
[2025-01-05 15:51:36,344 : INFO : common : Created directory at : artifacts]


[2025-01-05 15:51:36,344 : INFO : common : Created directory at : artifacts/data_transformation]


TypeError: 'Index' object is not callable

### Research 

In [1]:
import pandas as pd

In [8]:
path = 'artifacts/data_ingestion/taxi_trip_pricing.csv'
data = pd.read_csv(path)

In [19]:
data.info()
numerical_data = data.select_dtypes(include = "float64")
categorical_data = data.select_dtypes(include = "object")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [21]:
numerical_data.head()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,3.56,0.8,0.32,53.82,36.2624
1,47.59,1.0,,0.62,0.43,40.57,
2,36.87,1.0,2.7,1.21,0.15,37.27,52.9032
3,30.33,4.0,3.48,0.51,0.15,116.81,36.4698
4,,3.0,2.93,0.63,0.32,22.64,15.618


In [22]:
numerical_data.describe()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
count,950.0,950.0,950.0,950.0,950.0,950.0,951.0
mean,27.070547,2.476842,3.502989,1.233316,0.292916,62.118116,56.874773
std,19.9053,1.102249,0.870162,0.429816,0.115592,32.154406,40.469791
min,1.23,1.0,2.01,0.5,0.1,5.01,6.1269
25%,12.6325,1.25,2.73,0.86,0.19,35.8825,33.74265
50%,25.83,2.0,3.52,1.22,0.29,61.86,50.0745
75%,38.405,3.0,4.26,1.61,0.39,89.055,69.09935
max,146.067047,4.0,5.0,2.0,0.5,119.84,332.043689


In [26]:
for col in categorical_data.columns:
    print(col , end=" -> ")
    print(categorical_data[col].unique())

Time_of_Day->['Morning' 'Afternoon' 'Evening' 'Night' nan]
Day_of_Week->['Weekday' 'Weekend' nan]
Traffic_Conditions->['Low' 'High' 'Medium' nan]
Weather->['Clear' nan 'Rain' 'Snow']
