In [119]:
import os

In [120]:
%pwd

'C:\\Users\\anand\\Desktop\\reume_projet\\AutoPrice-AI'

In [121]:
# Switch to the project root so the relative paths used by the project
# (params.yaml, config/config.yaml, artifacts/...) resolve correctly.
# The root is discovered by walking up from the current directory until the
# folder containing params.yaml is found, instead of hard-coding an absolute
# path that only exists on one machine. Re-running the cell is harmless: once
# the CWD is the project root, the first candidate already matches.
from pathlib import Path

_cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "params.yaml").is_file()
    ),
    None,
)
if PROJECT_ROOT is None:
    raise FileNotFoundError(
        f"Could not find params.yaml in {_cwd} or any parent directory; "
        "start the notebook from inside the project folder"
    )
os.chdir(PROJECT_ROOT)

In [122]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the config, schema and params YAML files.
    """
    # Directory for this stage's outputs; train.csv and test.csv are written here.
    root_dir: str
    # Path of the input CSV that is loaded with pandas.read_csv.
    data_path: str
    # Name of the target column, taken from schema.TARGET_COLUMN.name.
    target_column: str
    # Columns intended for target encoding; ["model"] when params does not set it.
    target_encode_cols: list
    # test_size passed to train_test_split; 0.25 when params does not set it.
    test_size: float
    # random_state passed to train_test_split; 42 when params does not set it.
    random_state: int
    

In [123]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories
from mlProject import logger

In [124]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.

        Raises:
            Exception: Whatever ``read_yaml`` / ``create_directories`` raised;
                the error is logged and then propagated unchanged.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        try:
            # Populate self.config, self.params and self.schema, in that order.
            for attribute, location in yaml_sources:
                setattr(self, attribute, read_yaml(location))
            create_directories([self.config.artifacts_root])
            logger.info("Configuration loaded successfully")
        except Exception as err:
            logger.error(f"Config loading failed: {str(err)}")
            raise

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings for the data-transformation stage.

        Returns:
            A ``DataTransformationConfig`` whose paths come from the
            ``data_transformation`` config section, whose target column comes
            from ``TARGET_COLUMN.name`` in the schema, and whose remaining
            fields come from params (falling back to built-in defaults).

        Raises:
            ValueError: If the config lacks ``data_transformation`` or the
                schema lacks ``TARGET_COLUMN``.
            Exception: Any other failure is logged and propagated unchanged.
        """
        try:
            # (object to inspect, required attribute, error text if absent)
            required_sections = (
                (self.config, 'data_transformation', "Missing 'data_transformation' in config"),
                (self.schema, 'TARGET_COLUMN', "Missing 'TARGET_COLUMN' in schema"),
            )
            for holder, section_name, complaint in required_sections:
                if not hasattr(holder, section_name):
                    raise ValueError(complaint)

            stage = self.config.data_transformation
            target = self.schema.TARGET_COLUMN

            # Optional params and the value used when params does not define them.
            param_defaults = (
                ('target_encode_cols', ["model"]),
                ('test_size', 0.25),
                ('random_state', 42),
            )
            tunables = {
                name: getattr(self.params, name, fallback)
                for name, fallback in param_defaults
            }

            create_directories([stage.root_dir])

            return DataTransformationConfig(
                root_dir=stage.root_dir,
                data_path=stage.data_path,
                target_column=target.name,
                **tunables
            )
        except Exception as err:
            logger.error(f"Failed to create data transformation config: {str(err)}")
            raise

In [125]:
import os

import category_encoders as ce
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [126]:
class DataTransformation:
    """Split the raw dataset and encode its features for model training.

    Every encoder is fitted on the training split only and then applied to the
    test split, so no information from the test rows leaks into the features.
    """

    def __init__(self, config):
        """Store the stage settings and the fixed category-to-number mappings.

        Args:
            config: Object exposing ``data_path``, ``root_dir``, ``test_size``,
                ``random_state``, ``target_column`` and ``target_encode_cols``
                (for example a ``DataTransformationConfig``).
        """
        self.config = config
        self.ownership_map = {
            'First Owner': 1,
            'Second Owner': 2,
            'Third Owner': 3,
            'Fourth Owner': 4
        }
        self.transmission_map = {
            'Manual': 0,
            'Automatic': 1
        }

    def _preprocess_data(self, data):
        """Return a copy of ``data`` with ownership/transmission mapped to numbers.

        Args:
            data: DataFrame containing ``ownership`` and ``transmission`` columns.

        Returns:
            A new DataFrame; the input frame is left untouched. Values that are
            not keys of the mapping tables become NaN (a warning is logged).
        """
        # Work on a copy: the frames handed in are subsets produced by
        # train_test_split, and assigning into them would mutate the caller's
        # data and may trigger pandas' SettingWithCopyWarning.
        data = data.copy()
        column_mappings = (
            ('ownership', self.ownership_map),
            ('transmission', self.transmission_map),
        )
        for column, mapping in column_mappings:
            mapped = data[column].map(mapping)
            # Series.map yields NaN for unknown categories; surface that
            # instead of silently writing NaNs into the output files.
            unmapped = mapped.isna() & data[column].notna()
            if unmapped.any():
                logger.warning(
                    f"{int(unmapped.sum())} value(s) in '{column}' have no "
                    f"mapping and were set to NaN"
                )
            data[column] = mapped
        return data

    def train_test_splitting(self):
        """Run the full transformation pipeline and persist the two splits.

        Steps: load the CSV at ``config.data_path``, split it into train/test,
        map ownership/transmission to numbers, target-encode
        ``config.target_encode_cols`` against ``config.target_column``,
        one-hot encode ``make``/``fuel``/``city``, and write ``train.csv`` and
        ``test.csv`` into ``config.root_dir``.

        Returns:
            Tuple ``(train_path, test_path)`` of the written CSV files.

        Raises:
            ValueError: If a required column is absent from the input data.
            Exception: Any other failure is logged and propagated unchanged.
        """
        try:
            # Load data
            data = pd.read_csv(self.config.data_path)
            logger.info(f"Original data shape: {data.shape}")

            # Column roles come from the config rather than being hard-coded,
            # so params/schema changes actually take effect.
            target_column = self.config.target_column
            target_encode_cols = list(self.config.target_encode_cols)
            ohe_columns = ['make', 'fuel', 'city']

            # Fail early with a readable message instead of a bare KeyError
            # somewhere in the middle of the pipeline.
            required = [
                target_column, 'ownership', 'transmission',
                *target_encode_cols, *ohe_columns,
            ]
            missing = [col for col in required if col not in data.columns]
            if missing:
                raise ValueError(
                    f"Columns missing from {self.config.data_path}: {missing}"
                )

            # Split before fitting any encoder to prevent leakage
            train, test = train_test_split(
                data,
                test_size=self.config.test_size,
                random_state=self.config.random_state
            )
            logger.info(f"Initial split - Train: {train.shape}, Test: {test.shape}")

            # Apply base preprocessing (returns independent copies)
            train = self._preprocess_data(train)
            test = self._preprocess_data(test)

            # Target encoding, fitted on the training split only. Each encoded
            # column is added as "<col>_encoded" (e.g. "model_encoded").
            if target_encode_cols:
                target_encoder = ce.TargetEncoder(cols=target_encode_cols, smoothing=5.0)
                target_encoder.fit(train[target_encode_cols], train[target_column])
                train_encoded = target_encoder.transform(train[target_encode_cols])
                test_encoded = target_encoder.transform(test[target_encode_cols])
                for col in target_encode_cols:
                    train[f"{col}_encoded"] = train_encoded[col]
                    test[f"{col}_encoded"] = test_encoded[col]

            # One-hot encoding for the remaining categorical features
            ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            ohe.fit(train[ohe_columns])
            ohe_feature_names = ohe.get_feature_names_out(ohe_columns)

            def _one_hot_frame(frame):
                # Keep the original index so pd.concat aligns rows correctly.
                return pd.DataFrame(
                    ohe.transform(frame[ohe_columns]),
                    columns=ohe_feature_names,
                    index=frame.index
                )

            # Replace the raw categorical columns with their encoded versions
            raw_categoricals = target_encode_cols + ohe_columns
            train = pd.concat(
                [train.drop(columns=raw_categoricals), _one_hot_frame(train)],
                axis=1
            )
            test = pd.concat(
                [test.drop(columns=raw_categoricals), _one_hot_frame(test)],
                axis=1
            )

            # Save processed data
            os.makedirs(self.config.root_dir, exist_ok=True)
            train_path = os.path.join(self.config.root_dir, "train.csv")
            test_path = os.path.join(self.config.root_dir, "test.csv")
            train.to_csv(train_path, index=False)
            test.to_csv(test_path, index=False)

            logger.info(f"Final train shape: {train.shape}, test shape: {test.shape}")
            logger.info(f"Processed data saved to {self.config.root_dir}")
            return train_path, test_path

        except Exception as e:
            logger.error(f"Data transformation failed: {str(e)}")
            raise


In [127]:
# Run the data-transformation stage end to end: load the YAML configuration,
# build the stage settings, then split/encode the dataset and write the
# train/test CSV files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train_path, test_path = data_transformation.train_test_splitting()
    logger.info("Pipeline completed successfully")
except Exception:
    # logger.exception records the full traceback; the bare `raise` then
    # propagates the original exception without adding an extra frame.
    logger.exception("Pipeline failed")
    raise

[2025-06-16 02:28:50,940: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-16 02:28:50,943: INFO: common: yaml file: params.yaml loaded successfully]
[2025-06-16 02:28:50,946: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-06-16 02:28:50,948: INFO: common: created directory at: artifacts]
[2025-06-16 02:28:50,949: INFO: 2903215021: Configuration loaded successfully]
[2025-06-16 02:28:50,950: INFO: common: created directory at: artifacts/data_transformation]
[2025-06-16 02:28:50,963: INFO: 2318304100: Original data shape: (2252, 11)]
[2025-06-16 02:28:50,966: INFO: 2318304100: Initial split - Train: (1689, 11), Test: (563, 11)]
[2025-06-16 02:28:51,039: INFO: 2318304100: Final train shape: (1689, 36), test shape: (563, 36)]
[2025-06-16 02:28:51,040: INFO: 2318304100: Processed data saved to artifacts/data_transformation]
[2025-06-16 02:28:51,041: INFO: 2719237422: Pipeline completed successfully]


[2025-06-16 02:27:36,508: INFO: 482361250: Starting model training pipeline]
[2025-06-16 02:27:36,510: ERROR: 482361250: Data loading failed: [Errno 2] No such file or directory: '/artifacts/data_transformation/train.csv']
[2025-06-16 02:27:36,511: ERROR: 482361250: Model training failed]
Traceback (most recent call last):
  File "C:\Users\anand\AppData\Local\Temp\ipykernel_19900\482361250.py", line 59, in train
    X_train, X_test, y_train, y_test = self._load_data()
                                       ^^^^^^^^^^^^^^^^^
  File "C:\Users\anand\AppData\Local\Temp\ipykernel_19900\482361250.py", line 22, in _load_data
    train_data = pd.read_csv("/artifacts/data_transformation/train.csv")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anand\Desktop\reume_projet\AutoPrice-AI\mlproj\Lib\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File

FileNotFoundError: [Errno 2] No such file or directory: '/artifacts/data_transformation/train.csv'