In [3]:
import os

In [4]:
os.chdir('../')

In [5]:
%pwd

'd:\\MLOPS\\End-To-End-MlOps-Project'

In [6]:
from dataclasses import dataclass
from pathlib import Path

In [7]:
@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Instances are built by ``ConfigManager.get_transformation_config`` from
    the ``Data_Transformation`` section of the config YAML together with the
    schema's ``TARGET_COLUMN`` entry.
    """

    # Stage output directory; created before the config object is built.
    dir: Path
    # Location handed to ``pd.read_csv`` as the input dataset.
    unzip_dir: Path
    # ``np.save`` destination for the transformed training array.
    train_arr: Path
    # ``np.save`` destination for the transformed test array.
    test_arr: Path
    # Name of the label column separated from the feature columns.
    target_col: str
    # ``save_obj`` destination for the fitted preprocessing object.
    preprocess_obj: Path

In [8]:
from src.ds import logging
from src.ds.utils.utils import read_yaml,create_dir
from src.ds.constants.yaml_path import *

In [9]:
class ConfigManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config, params and schema YAML files are read with
    ``read_yaml`` and the directory named by ``config.artifacts_root`` is
    created with ``create_dir``.
    """

    def __init__(self, config_file_path=Config_file_path, prams_file_path=Param_file_path, schema_file_path=Schema_file_path) -> None:
        """Read the three YAML files and create the artifacts root.

        Args:
            config_file_path: Path of the main config YAML.
            prams_file_path: Path of the params YAML (parameter name kept
                as-is for backward compatibility with keyword callers).
            schema_file_path: Path of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(prams_file_path)
        self.schema = read_yaml(schema_file_path)

        create_dir([self.config.artifacts_root])

    def get_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation config and create its output directory.

        Returns:
            A ``DataTransformationConfig`` filled from the ``Data_Transformation``
            section of the config and the schema's ``TARGET_COLUMN``.

        Raises:
            Exception: Any error raised while reading the config section or
                creating the directory is logged and re-raised unchanged.
        """
        try:
            section = self.config.Data_Transformation
            create_dir([section.dir])
            return DataTransformationConfig(
                dir=section.dir,
                unzip_dir=section.unzip_dir,
                train_arr=section.train_arr,
                test_arr=section.test_arr,
                target_col=self.schema.TARGET_COLUMN,
                preprocess_obj=section.preprocess_obj,
            )
        except Exception as e:
            # Log at ERROR level with the traceback (an INFO record is easy to
            # filter out), then re-raise with a bare `raise` so the original
            # traceback is preserved.
            logging.exception(f'Error in Transformation config: {str(e)}')
            raise

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
from src.ds.utils.utils import save_obj

In [12]:

class DataTransformation:
    """Build, fit and persist the preprocessing pipeline for the dataset.

    The stage reads the CSV named by ``config.unzip_dir``, splits it into
    train/test partitions, fits a ``ColumnTransformer`` on the training
    partition, and saves the transformed arrays and the fitted transformer
    to the paths given in the config.
    """

    # Numeric feature columns: mean-imputed, then scaled with RobustScaler.
    NUMERIC_COLUMNS = [
        'ssc_percentage', 'hsc_percentage', 'degree_percentage',
        'emp_test_percentage', 'mba_percent',
    ]

    # Categorical feature column -> allowed levels, in one-hot output order.
    # A single ordered mapping keeps each column next to its category list so
    # the two can never drift out of alignment.
    CATEGORY_LEVELS = {
        'gender': ['M', 'F'],
        'ssc_board': ['Central', 'Others'],
        'hsc_board': ['Others', 'Central'],
        'hsc_subject': ['Commerce', 'Science', 'Arts'],
        'undergrad_degree': ['Comm&Mgmt', 'Sci&Tech', 'Others'],
        'work_experience': ['No', 'Yes'],
        'specialisation': ['Mkt&Fin', 'Mkt&HR'],
    }

    def __init__(self, config: DataTransformationConfig) -> None:
        """Store the stage configuration.

        Args:
            config: Paths and target-column name used by this stage.
        """
        self.config = config

    def get_preprocess_obj(self):
        """Create the (unfitted) preprocessing ``ColumnTransformer``.

        Numeric columns are mean-imputed and robust-scaled; categorical
        columns are imputed with the most frequent value and one-hot encoded
        against fixed category lists, ignoring unknown levels.

        Returns:
            The unfitted ``ColumnTransformer``.

        Raises:
            Exception: Any construction error is logged and re-raised.
        """
        try:
            numeric_columns = list(self.NUMERIC_COLUMNS)
            categorical_columns = list(self.CATEGORY_LEVELS)
            # Same order as `categorical_columns` because both come from the
            # same dict (insertion-ordered).
            category_levels = [
                list(self.CATEGORY_LEVELS[column]) for column in categorical_columns
            ]

            num_col_pipline = Pipeline(
                steps=[
                    ('Impute', SimpleImputer(strategy='mean')),
                    ('scaling', RobustScaler()),
                ]
            )

            cate_cols_pipline = Pipeline(
                steps=[
                    ('Impute', SimpleImputer(strategy='most_frequent')),
                    ('encoding', OneHotEncoder(
                        categories=category_levels,
                        handle_unknown='ignore',
                    )),
                ]
            )

            preprocess = ColumnTransformer([
                ('num_col_pipline', num_col_pipline, numeric_columns),
                ('cate_cols_pipline', cate_cols_pipline, categorical_columns),
            ])

            logging.info('Data transformation preprocess obj created')

            return preprocess
        except Exception as e:
            # ERROR level + traceback; bare `raise` keeps the original traceback.
            logging.exception(f'Error in Transformation preprocess obj: {str(e)}')
            raise

    def transforming_data(self, train_size=0.26, random_state=50):
        """Split, preprocess and persist the dataset.

        Args:
            train_size: Fraction (or absolute count) of rows placed in the
                training partition, forwarded to ``train_test_split``.
                NOTE(review): the default of 0.26 trains on roughly a quarter
                of the data; it is kept for backward compatibility, but
                confirm it was not meant to be the test fraction.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            A ``(train_arr, test_arr)`` tuple; each array is the transformed
            feature matrix with the target values appended as the last column.

        Raises:
            Exception: Any failure while reading, transforming or saving is
                logged and re-raised.
        """
        try:
            data = pd.read_csv(self.config.unzip_dir)

            logging.info('data retrive successfully')

            features = data.drop(self.config.target_col, axis=1)
            target = data[self.config.target_col]

            x_train, x_test, y_train, y_test = train_test_split(
                features, target, train_size=train_size, random_state=random_state
            )

            preprocess_obj = self.get_preprocess_obj()

            # Fit on the training partition only, then apply the fitted
            # transformer to the test partition.
            x_train = preprocess_obj.fit_transform(x_train)
            x_test = preprocess_obj.transform(x_test)

            logging.info('data preprocess completed')

            # NOTE(review): if the target column is non-numeric, np.c_ will
            # coerce the combined array to a common dtype — confirm the
            # target is numeric or encode it before stacking.
            train_arr = np.c_[x_train, np.array(y_train)]
            test_arr = np.c_[x_test, np.array(y_test)]

            np.save(self.config.train_arr, train_arr)
            np.save(self.config.test_arr, test_arr)
            logging.info('preprocess data save completed')

            save_obj(
                file_path=self.config.preprocess_obj,
                obj=preprocess_obj,
            )
            logging.info('preprocess object save completed')

            return (train_arr, test_arr)
        except Exception as e:
            # ERROR level + traceback; bare `raise` keeps the original traceback.
            logging.exception(f'Error in Data Transformation : {str(e)}')
            raise
          

In [13]:
try:
    config = ConfigManager()
    data_transformation_config = config.get_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # This only builds the unfitted preprocessor and discards it.
    # NOTE(review): `transforming_data()` is the method that reads the data
    # and writes the arrays/preprocessor — confirm which call is intended.
    data_transformation.get_preprocess_obj()
except Exception:
    # Log with the traceback and re-raise, so a failed stage is visible
    # instead of being reduced to an uninformative INFO line.
    logging.exception('Error while running the data transformation stage')
    raise

[2024-11-06 13:41:29,811: INFO :utils : Yaml file config\config.yaml created]
[2024-11-06 13:41:29,816: INFO :utils : Yaml file params.yaml created]
[2024-11-06 13:41:29,825: INFO :utils : Yaml file schema.yaml created]
[2024-11-06 13:41:29,828: INFO :utils : created directory at: artifacts]
[2024-11-06 13:41:29,831: INFO :utils : created directory at: artifacts/data_transformation]
[2024-11-06 13:41:29,833: INFO :3194053651 : Data transformation preprocess obj created]
