In [1]:
import os

In [2]:
# IPython automagic (`%pwd`): display the kernel's current working directory.
pwd

'e:\\gemstone\\NoteBooks'

In [3]:
# Work from the project root so that the `src` package imported below and any
# relative paths resolve. The guard makes the cell idempotent: once `src` is
# visible in the working directory, re-running the cell no longer climbs up
# another level.
if not os.path.isdir('src'):
    os.chdir('../')

In [4]:
# Confirm the working directory after the chdir above.
pwd

'e:\\gemstone'

In [5]:
import pandas as pd
import numpy as np
from src.utils.utils import read_yaml,create_dir,save_obj
from src.constant.ymal_path import *
from dataclasses import dataclass

In [16]:
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    NOTE(review): the fields are annotated as ``Path`` but dataclasses do not
    enforce annotations; values are stored exactly as passed in (they may be
    plain strings when they come straight from YAML - confirm).
    """

    # Working directory of this stage.
    dir: Path
    # CSV file holding the training split.
    train_data: Path
    # CSV file holding the test split.
    test_data: Path
    # Destination for the serialized, fitted preprocessor object.
    preproecss_obj: Path
    # Name of the column to predict.
    target_col: str
    # Destination for the transformed training array.
    train_arr: Path
    # Destination for the transformed test array.
    test_arr: Path

In [17]:
class ConfigManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filr_path=Config_ymal_file_path,
                 prams_file_path=Param_ymal_file_path,
                 scheema_file_path=Schema_ymal_file_path):
        """Read the config, params and schema YAML files.

        Args:
            config_filr_path: location of the main config YAML.
            prams_file_path: location of the params YAML.
            scheema_file_path: location of the schema YAML.

        The ``artifacts_root`` entry of the main config is handed to
        ``create_dir`` once all three files have been read.
        """
        self.config = read_yaml(config_filr_path)
        self.params = read_yaml(prams_file_path)
        self.schema = read_yaml(scheema_file_path)

        create_dir([self.config.artifacts_root])

    def get_data_transformation_config(self):
        """Build the ``DataTransformationConfig`` for the transformation stage.

        Paths come from the ``data_transformation`` section of the main config;
        the target column name comes from ``TARGET_COLUMN.name`` in the schema.
        The stage directory is passed to ``create_dir`` before the config
        object is constructed.

        Returns:
            DataTransformationConfig: populated stage configuration.
        """
        stage_cfg = self.config.data_transformation
        target_schema = self.schema.TARGET_COLUMN

        create_dir([stage_cfg.dir])

        return DataTransformationConfig(
            dir=stage_cfg.dir,
            train_data=stage_cfg.train_data,
            test_data=stage_cfg.test_data,
            preproecss_obj=stage_cfg.preproecss_obj,
            target_col=target_schema.name,
            train_arr=stage_cfg.train_arr,
            test_arr=stage_cfg.test_arr,
        )

In [8]:
from src.logging.logger import logging
from src.exception.exception import CustomException

In [9]:
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import sys


In [24]:
class DataTransformation:
    """Builds the preprocessing pipeline and applies it to the train/test CSVs.

    All file locations and the target column name are taken from the
    ``DataTransformationConfig`` supplied at construction time.
    """

    def __init__(self,config:DataTransformationConfig) -> None:
        self.config=config

    def get_data_transformation_obj(self):
        """Create the (unfitted) preprocessing ``ColumnTransformer``.

        Numeric columns: median imputation, then standard scaling.
        Categorical columns: most-frequent imputation, then ordinal encoding
        using the explicit category orderings below, then standard scaling.

        Returns:
            ColumnTransformer: unfitted preprocessor covering the numeric and
            categorical column lists. No ``remainder`` is configured, so the
            transformer's default handling applies to any other column.

        Raises:
            CustomException: wraps any error raised while building the object.
        """
        try:
            num_cols=['carat', 'depth', 'table', 'x', 'y', 'z']
            catagorical_cols=['cut', 'color', 'clarity']

            # Explicit category orderings; the position of a label in its list
            # is the integer the encoder assigns to it.
            color_labels=['D', 'E', 'F', 'G', 'H', 'I', 'J']
            cut_labels=['Fair', 'Good', 'Very Good','Premium','Ideal']
            clarity_labels=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

            logging.info('Pipeline creation started')

            num_cols_pipline=Pipeline(
                steps=[
                    ('IMPUTE',SimpleImputer(strategy='median')),
                    ('SCALING',StandardScaler())
                ]
            )

            # Imputation runs BEFORE encoding: the encoder is given explicit
            # category lists that contain no missing-value entry, so missing
            # categorical values have to be filled in first. Step names are
            # kept unchanged so lookups by name keep working.
            catagorical_cols_pipline=Pipeline(
                steps=[
                    ('IMPUTE',SimpleImputer(strategy='most_frequent')),
                    ('ODINAL ENCODING',OrdinalEncoder(categories=[cut_labels,color_labels,clarity_labels])),
                    ('SCALING',StandardScaler())
                ]
            )

            logging.info('pipline created')
            preprocesser_obj=ColumnTransformer(
                [
                    ('num_cols',num_cols_pipline,num_cols),
                    ('categorical_cols',catagorical_cols_pipline,catagorical_cols)
                ]
            )
            logging.info(f'preprocesser object competed {preprocesser_obj}')
            return preprocesser_obj

        except CustomException:
            # Already wrapped; re-raise untouched instead of wrapping twice.
            raise
        except Exception as e:
            # Catch every failure (not only CustomException) so the error is
            # actually logged and wrapped.
            logging.info(f'Error cooured {str(e)}')
            raise CustomException(sys,e)

    @staticmethod
    def _split_features_target(frame,target_col):
        """Split one dataset into ``(features, target)``.

        The target column and the ``'Unnamed: 0'`` column are removed from the
        features. ``'Unnamed: 0'`` is dropped only if present, so CSVs without
        it are accepted; a missing target column still raises ``KeyError``
        when the target series is selected.
        """
        features=frame.drop(columns=[target_col,'Unnamed: 0'],errors='ignore')
        target=frame[target_col]
        return features,target

    def initiating_data_transformation(self):
        """Fit the preprocessor on the train split and transform both splits.

        Reads the train/test CSVs, separates features from the target, fits the
        preprocessor on the training features only, transforms both feature
        sets, appends the target as the last column of each array, and then
        saves both arrays and the fitted preprocessor to the configured paths.

        Returns:
            tuple: ``(train_arr, test_arr)`` - transformed features with the
            target appended as the final column.

        Raises:
            CustomException: wraps any error raised during the process.
        """
        try:
            train_data=pd.read_csv(self.config.train_data)
            test_data=pd.read_csv(self.config.test_data)

            logging.info('data read completed')

            Target_col=self.config.target_col

            logging.info('spliting data x_train,y_tran,x_test,y_test')
            # x_train / y_train
            input_feature_train_data,target_feature_train_data=self._split_features_target(train_data,Target_col)
            print(input_feature_train_data.head())

            # x_test / y_test
            input_feature_test_data,target_feature_test_data=self._split_features_target(test_data,Target_col)

            preprocesser=self.get_data_transformation_obj()

            # Fit on train only; the test split is transformed with the
            # statistics learned from the training data.
            transform_input_feature_train_data=preprocesser.fit_transform(input_feature_train_data)
            transform_input_feature_test_data=preprocesser.transform(input_feature_test_data)

            # Column-stack so the target becomes the last column of each array.
            train_arr=np.c_[transform_input_feature_train_data,np.array(target_feature_train_data)]
            test_arr=np.c_[transform_input_feature_test_data,np.array(target_feature_test_data)]

            np.save(self.config.train_arr,train_arr)
            np.save(self.config.test_arr,test_arr)

            save_obj(
                file_path=self.config.preproecss_obj,
                obj=preprocesser
            )

            return(
                train_arr,
                test_arr
            )

        except CustomException:
            # Raised by get_data_transformation_obj; already wrapped.
            raise
        except Exception as e:
            # Catch every failure (not only CustomException) so the error is
            # actually logged and wrapped.
            logging.info(f'Error cooured {str(e)}')
            raise CustomException(sys,e)

In [25]:
# Drive the data-transformation stage end to end: load the configuration,
# build the stage object, then run the transformation.
try:
    config = ConfigManager()
    data_tranformation_config = config.get_data_transformation_config()
    data_tranformation = DataTransformation(config=data_tranformation_config)
    data_tranformation.initiating_data_transformation()
except Exception as e:
    # Log the failure, then surface it as the project's exception type.
    logging.info(f'Error cooured {str(e)}')
    raise CustomException(sys, e)

       id  carat      cut color clarity  depth  table     x     y     z
0  149864   0.90  Premium     I     SI1   63.0   59.0  6.17  6.12  3.90
1   28636   1.55  Premium     H     SI1   62.0   58.0  7.37  7.42  4.59
2   53148   0.32    Ideal     D     SI1   63.0   55.0  4.38  4.35  2.75
3    6925   0.35    Ideal     F     SI1   62.3   57.0  4.53  4.48  2.81
4   68453   0.77    Ideal     D     SI1   62.2   56.0  5.85  5.88  3.64
