In [3]:
import os
%pwd

'f:\\work env\\End-to-End-MLOps-with-MLflow\\research'

In [4]:
# Step up from the notebook folder to its parent so that the relative
# config/artifact paths used below resolve. The guard keeps this cell
# idempotent: re-running it does not climb one more directory level each time.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings consumed by the data-transformation stage."""

    # Directory the stage writes its output files into.
    root_dir: Path
    # Location of the input data file that gets transformed.
    data_path: Path
    # Parsed schema contents; the target column's name is looked up in here.
    schema: dict

In [6]:
from MLOpsProject.constants import *
from MLOpsProject.utils.common import read_yaml, create_directories

In [7]:
# Configuration manager: reads the config, params and schema YAML files and creates the artifact directories they name
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the config plus the schema.
        """
        stage_settings = self.config.data_transformation

        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
            schema=self.schema,
        )

In [13]:
import os
from MLOpsProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [32]:
class DataTransformation:
    """Cleaning, encoding, scaling and splitting steps for the raw dataset.

    The working DataFrame lives in ``self.data``; every step below mutates or
    replaces it, so the order in which the methods are called matters.
    """

    def __init__(self, config):
        """Store the stage settings and load the input CSV.

        Args:
            config: Settings object exposing ``data_path``, ``root_dir`` and
                ``schema`` (with ``schema.TARGET_COLUMN.name``).
        """
        self.config = config
        # The input file is parsed with ';' as the field separator.
        self.data = pd.read_csv(self.config.data_path, sep=";")

    def categorical_columns(self):
        """Return the names of the columns whose dtype is ``object``."""
        return [col for col in self.data.columns if self.data[col].dtype == "object"]

    def handle_missing_values(self):
        """Fill gaps: most frequent value for object columns, mean for numeric ones."""
        categorical_columns = self.categorical_columns()

        # The imputer cannot be fitted on an empty column selection, so it is
        # only run when at least one object-dtype column exists.
        if categorical_columns:
            imputer = SimpleImputer(strategy='most_frequent')
            self.data[categorical_columns] = imputer.fit_transform(self.data[categorical_columns])

        # numeric_only=True restricts the means to numeric columns. Without it
        # pandas warns (older versions) or raises (newer versions) because the
        # frame still contains object columns at this point.
        self.data.fillna(self.data.mean(numeric_only=True), inplace=True)

    def handle_imbalanced_data(self):
        """Oversample with SMOTE and replace ``self.data`` with the result.

        NOTE(review): SMOTE presumably needs all-numeric features, so this is
        expected to run after ``encoding()`` - confirm.
        """
        target = self.config.schema.TARGET_COLUMN.name
        X = self.data.drop(target, axis=1)
        y = self.data[target]

        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        # Re-assemble features and target into a single frame.
        resampled_df = pd.concat([
            pd.DataFrame(X_resampled, columns=X.columns),
            pd.DataFrame(y_resampled, columns=[target])
        ], axis=1)

        self.data = resampled_df

    def handle_duplicates(self):
        """Drop fully duplicated rows (the index is not reset)."""
        self.data.drop_duplicates(inplace=True)

    def handling_outliers(self):
        """Placeholder: outlier handling is not implemented yet."""
        pass  # TODO: Add outlier handling code later

    def binning(self):
        """Replace each object-dtype column with 5 equal-width bin codes.

        NOTE(review): ``pd.cut`` bins numeric values, so calling this while
        the object columns still hold strings is expected to fail. Confirm
        which columns were meant to be binned.
        """
        categorical_columns = self.categorical_columns()
        for i in categorical_columns:
            self.data[i] = pd.cut(self.data[i], bins=5, labels=False)

    def encoding(self):
        """Label-encode every object-dtype column in place.

        The single encoder is re-fitted per column, so after the loop it only
        remembers the classes of the last column processed.
        """
        categorical_columns = self.categorical_columns()
        label = LabelEncoder()
        for i in categorical_columns:
            self.data[i] = label.fit_transform(self.data[i])

    def feature_scaling(self):
        """Standardise every column except the target.

        NOTE(review): the scaler is fitted on all rows currently in
        ``self.data``. If this runs before ``train_test_split``, the held-out
        rows also influence the scaling statistics.
        """
        scaler = StandardScaler()
        feature_columns = self.data.columns.drop(self.config.schema.TARGET_COLUMN.name)
        self.data[feature_columns] = scaler.fit_transform(self.data[feature_columns])

    def train_test_split(self):
        """Split 80/20 (seed 42) and write ``train.csv`` / ``test.csv`` to ``root_dir``."""
        # Inside a method the bare name resolves to the imported sklearn
        # function, not to this method.
        train, test = train_test_split(self.data, test_size=0.2, random_state=42)

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info("Splitted data into train and test set")
        logger.info(f"Train shape: {train.shape}, Test shape: {test.shape}")


In [33]:
try:
    # Load the YAML settings and build the config for this stage.
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()

    # The constructor reads the input CSV.
    data_transformation = DataTransformation(data_transformation_config)

    # Clean first, then encode the object columns so that scaling only sees
    # numeric data, and finally split and write the result.
    data_transformation.handle_duplicates()
    data_transformation.handle_missing_values()
    data_transformation.encoding()
    data_transformation.feature_scaling()
    # SMOTE resampling is currently switched off:
    # data_transformation.handle_imbalanced_data()
    data_transformation.train_test_split()

except Exception:
    # Log with the traceback, then let the original exception propagate.
    logger.exception("An exception occurred during data transformation.")
    raise


[2023-09-28 16:26:10,733: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-28 16:26:10,747: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-28 16:26:10,749: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-09-28 16:26:10,750: INFO: common: directory is created at artifacts]
[2023-09-28 16:26:10,751: INFO: common: directory is created at artifacts/data_transformation]


  self.data.fillna(self.data.mean(), inplace=True)


[2023-09-28 16:26:13,629: INFO: 3148803184: Splitted data into train and test set]
[2023-09-28 16:26:13,630: INFO: 3148803184: Train shape: (36168, 17), Test shape: (9043, 17)]
