In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ayush\\OneDrive - Sujal Dhungana\\MBA Admission Classification Project\\notebooks'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebooks/ folder, so re-running this cell does not climb one directory
# higher each time and break the relative paths used further down.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\ayush\\OneDrive - Sujal Dhungana\\MBA Admission Classification Project'

In [5]:
import pandas as pd
import numpy as np
import sys
from src import *
from src.logger import logging
from src.exception import CustomException
from dataclasses import dataclass
import warnings
from pathlib import Path
from src.utils.common import read_yaml_file, create_directory, save_object, load_object, save_transformed_data

warnings.filterwarnings("ignore")

In [6]:
@dataclass
class DataTransformationConfig:
    '''
    Output locations used by the data transformation stage.

    Instances are built by ConfigurationManager.get_data_transformation_config
    from the `data_transformation` section of the configuration file.
    NOTE(review): the values come straight from the parsed YAML, so they may be
    plain strings rather than Path objects at runtime - confirm before relying
    on Path-only methods.
    '''
    # Destination passed to save_object for the preprocessor.
    preprocessor_obj_path: Path
    # Destination passed to save_transformed_data for the training array.
    train_arr: Path
    # Destination passed to save_transformed_data for the testing array.
    test_arr: Path

In [7]:
class ConfigurationManager:
    '''
    Loads the project configuration file and hands out per-stage config objects.

    Creating an instance reads the configuration file and makes sure the
    top-level artifacts directory exists.
    '''

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        '''
        Args:
            config_file_path: location of the YAML configuration file.
            params_file_path: location of the parameters file. It is not parsed
                by this stage; the path is only stored on the instance.

        Raises:
            CustomException: if reading the configuration or creating the
                artifacts directory fails.
        '''
        try:
            self.config = read_yaml_file(config_file_path)
            # Keep the parameters location available instead of silently
            # discarding the argument; nothing in this stage reads the file.
            self.params_file_path = params_file_path

            # Only the configuration file is read here, so the log says so.
            logging.info("Configuration file has been read successfully")

            logging.info("Creating directories to store artifacts")
            create_directory([self.config.artifacts_directory])
            logging.info("Directories have been created successfully")
        except Exception as e:
            raise CustomException(e, sys)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        '''
        Build the configuration object for the data transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataTransformationConfig populated from the `data_transformation`
            section of the configuration file.

        Raises:
            CustomException: if the section is missing a key or the directory
                cannot be created.
        '''
        try:
            logging.info("Getting data transformation config")
            config = self.config.data_transformation

            logging.info("Creating directories to store transformed data")
            create_directory([config.root_dir])

            logging.info("Directories have been created successfully to store transformed data")

            logging.info("Returning data transformation config")
            data_transformation_config = DataTransformationConfig(
                preprocessor_obj_path = config.preprocessor_obj_path,
                train_arr = config.train_arr_path,
                test_arr = config.test_arr_path
            )

            return data_transformation_config

        except Exception as e:
            raise CustomException(e, sys)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [9]:
class DataTransformation:
    '''
    Turns the train/test CSV splits into model-ready NumPy arrays.

    Features are imputed, scaled and one-hot encoded by a ColumnTransformer;
    the target column is label-encoded. Both the preprocessor and the
    resulting arrays are written to the paths given in the configuration.
    '''

    # Column holding the class label.
    TARGET_COLUMN = 'admission'
    # Identifier column that is excluded from the model inputs.
    ID_COLUMN = 'application_id'
    # Label assigned to rows whose target value is missing.
    MISSING_TARGET_LABEL = 'Rejected'

    def __init__(self,
                 config: DataTransformationConfig):
        '''
        Args:
            config: paths for the preprocessor object and the transformed arrays.
        '''
        self.config = config

    def create_preprocessor(self):
        '''
        This function creates a preprocessor object and saves it to the path specified in the configuration file

        The object saved here is NOT fitted yet; initiate_data_transformation
        overwrites the same file with the fitted preprocessor.

        Returns:
        preprocessor: ColumnTransformer object (unfitted)

        Raises:
        CustomException: if building or saving the preprocessor fails
        '''
        try:
            logging.info("Specifying the numerical and categorical features")
            numerical_features = ['gpa', 'gmat', 'work_exp']
            categorical_features = [
                    'gender',
                    'international',
                    'major',
                    'race',
                    'work_industry'
            ]

            logging.info("Creating a numerical pipeline")
            # Mean-impute missing values, then standardise.
            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='mean')),
                    ('scaler', StandardScaler())
                ]
            )

            logging.info("Successfully created numerical pipeline")

            logging.info("Creating a categorical pipeline")
            # Mode-impute missing values, then one-hot encode into a dense array.
            # The first level of every feature is dropped and categories unseen
            # during fit are ignored at transform time.
            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
                ]
            )

            logging.info("Successfully created categorical pipeline")

            logging.info("Creating a column transformer")
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', num_pipeline, numerical_features),
                    ('cat', cat_pipeline, categorical_features)
                ]
            )

            logging.info("Successfully created column transformer")

            logging.info("Saving the preprocessor object to the specified path")
            save_object(
                object= preprocessor,
                object_path = self.config.preprocessor_obj_path
                )
            logging.info("Preprocessor object has been saved successfully")

            logging.info("Returning the preprocessor object")
            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    def _split_features_and_target(self, data):
        '''
        Split a raw frame into (input features, 1-D target Series).

        Missing target values are set to 'Rejected'. pd.read_csv parses empty
        fields as NaN, so NaN is filled first; literal empty strings are still
        replaced in case the frame was built some other way. The caller's frame
        is not modified.
        '''
        target = (
            data[self.TARGET_COLUMN]
            .fillna(self.MISSING_TARGET_LABEL)
            .replace('', self.MISSING_TARGET_LABEL)
        )
        features = data.drop(columns=[self.TARGET_COLUMN, self.ID_COLUMN])
        return features, target

    def initiate_data_transformation(self, training_data_path, testing_data_path):
        '''
        This function transforms the training and testing data using the preprocessor object

        The preprocessor and the target encoder are fitted on the training split
        only and then applied to both splits. The fitted preprocessor and both
        arrays are saved to the paths given in the configuration.

        Args:
        training_data_path: path of the training CSV
        testing_data_path: path of the testing CSV

        Returns:
        (train_arr, test_arr): NumPy arrays whose last column is the
        label-encoded target and whose other columns are the transformed features

        Raises:
        CustomException: if reading, transforming or saving fails
        '''
        try:
            logging.info("Reading the training and testing data")
            train_data = pd.read_csv(training_data_path)
            test_data = pd.read_csv(testing_data_path)

            logging.info("Data has been read successfully")

            logging.info("Replacing empty strings with 'Rejected' in the admission column")
            logging.info("Splitting the data into input features and target feature")
            train_input_features, train_target_feature = self._split_features_and_target(train_data)
            test_input_features, test_target_feature = self._split_features_and_target(test_data)

            logging.info("Initializing the preprocessor object")
            preprocessor = self.create_preprocessor()

            logging.info("Preprocessor object has been initialized successfully")

            logging.info("Transforming the training data")
            transformed_train_data = preprocessor.fit_transform(train_input_features)
            logging.info("Training data has been transformed successfully")

            # Persist the fitted transformer; the copy written by
            # create_preprocessor is unfitted and cannot transform new data.
            logging.info("Saving the fitted preprocessor object to the specified path")
            save_object(
                object = preprocessor,
                object_path = self.config.preprocessor_obj_path
            )
            logging.info("Fitted preprocessor object has been saved successfully")

            logging.info("Transforming the testing data")
            transformed_test_data = preprocessor.transform(test_input_features)
            logging.info("Testing data has been transformed successfully")

            logging.info("Initializing the target encoder")
            target_encoder = LabelEncoder()

            logging.info("Fitting the target encoder on the training target feature")
            target_encoder.fit(train_target_feature)

            logging.info("Transforming the training target feature")
            transformed_train_target = target_encoder.transform(train_target_feature)

            # The test labels must go through the same encoder; appending the
            # raw strings would turn the whole test array into a string array
            # with labels that do not match the training encoding.
            logging.info("Transforming the testing target feature")
            transformed_test_target = target_encoder.transform(test_target_feature)

            logging.info("Combining the transformed training data and target feature")
            train_arr = np.c_[
                transformed_train_data,
                transformed_train_target
            ]

            logging.info("Combining the transformed testing data and target feature")
            test_arr = np.c_[
                transformed_test_data,
                transformed_test_target
            ]

            logging.info("Saving the transformed trained data")
            save_transformed_data(
                data = train_arr,
                path = self.config.train_arr
            )

            logging.info("Succesfully saved transformed train data")

            logging.info("Saving the transformed test data")
            save_transformed_data(
                data = test_arr,
                path = self.config.test_arr
            )

            logging.info("Succesfully saved transformed test data")

            logging.info("Returning the transformed data")

            return train_arr, test_arr

        except Exception as e:
            raise CustomException(e, sys)

In [10]:
if __name__ == '__main__':
    # CSV splits consumed by the transformation stage (relative to the
    # current working directory).
    train_csv = 'artifacts/data_ingestion/train_data.csv'
    test_csv = 'artifacts/data_ingestion/test_data.csv'

    try:
        # Build the stage configuration, then run the transformation end to end.
        config_manager = ConfigurationManager()
        data_transformation_config = config_manager.get_data_transformation_config()

        data_transformation = DataTransformation(data_transformation_config)
        data_transformation.initiate_data_transformation(
            training_data_path=train_csv,
            testing_data_path=test_csv,
        )
    except Exception as err:
        # Log the failure before surfacing it as the project's exception type.
        logging.error(err)
        raise CustomException(err, sys)