In [12]:
%pwd

'd:\\Advanced Project\\time-estimation\\notebooks'

In [13]:
import os
os.chdir("../")

In [14]:
from dataclasses import dataclass
from pathlib import Path

In [None]:
@dataclass(frozen=True)
class DataPreProcessingEntity:
    root_dir: Path
    img_dir: Path
    label_csv: Path
    train_dir: Path
    test_dir: Path
    val_dir: Path
    data_mean: float
    data_std: float
    img_height: int
    img_width: int

In [16]:
from timeEstimator.constant import *
from timeEstimator.Utils.common import read_yaml, create_directory

In [17]:
class ConfigurationManager:
    def __init__(self, params=PARAMS_FILE_PATH, config=CONFIG_FILE_PATH):
        self.params = read_yaml(params)
        self.config = read_yaml(config)

        create_directory([self.config.root_dir])
    
    def data_preprocessing_config(self):
        config = self.config.data_preprocessed
        params = self.params.data_preprocessed

        create_directory([
            config.root_dir,
            config.train_dir,
            config.test_dir,
            config.val_dir
        ])

        config = DataPreProcessingEntity(
            root_dir = Path(config.root_dir),
            img_dir = Path(config.img_dir),
            label_csv = Path(config.label_csv),
            train_dir = Path(config.train_dir),
            test_dir = Path(config.test_dir),
            val_dir = Path(config.val_dir),
            data_mean = params.data_mean,
            data_std = params.data_std,
            img_height = params.img_height,
            img_width = params.img_width
        )

        return config

In [35]:
import cv2
import pandas as pd
import numpy as np
from timeEstimator.logging import logger
from sklearn.model_selection import train_test_split
from albumentations import Compose, Resize, Normalize
from timeEstimator.Exception.exception import CustomException

In [None]:
class DataPreProcessing:
    def __init__(self, config : DataPreProcessingEntity):
        self.config = config

    def read_img(self, img_dir):
        image = cv2.imread(img_dir, cv2.IMREAD_GRAYSCALE)
        image = np.repeat(image, 3,axis=-1)

        return image


    def preprocess(self, x, root_dir):

        aug = Compose([
            Resize(height=self.config.img_height, width=self.config.img_width, always_apply=True),
            Normalize(mean=self.config.data_mean, std=self.config.data_std)
        ])
        img_name = x['img_dir']
        img_dir = os.path.join(self.config.img_dir, img_name)
        img_load = self.read_img(img_dir)
        augmented_img = aug(image = img_load)
        img_uint8 = (augmented_img * 255).astype(np.uint8)


        img_name = img_name.split(".")[0]
        save_dir = os.path.join(root_dir, f"{img_name}.png")


        cv2.imwrite(save_dir, img_uint8)

        return save_dir


    def load_data(self):
        try:
            df = pd.read_csv(self.config.label_csv)
            logger.info(f"Label CSV File Loaded: {self.config.label_csv}")
            
            train, test = train_test_split(df, test_size=0.15, random_state=42)
            train, val = train_test_split(train, test_size=0.15, random_state=42)

            logger.info(f"Data Splitted: Train ({len(train)}), Test ({len(test)}), Validation ({len(val)})")

            train["new_img_dir"] = train.apply(lambda x: self.preprocess(x, self.config.train_dir),axis=1)
            test["new_img_dir"] = test.apply(lambda x: self.preprocess(x, self.config.test_dir), axis=1)
            val["new_img_dir"] = val.apply(lambda x: self.preprocess(x, self.config.val_dir), axis=1)

            logger.info(f"Data Pre-Processing Completed")

            train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
            test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)
            val.to_csv(os.path.join(self.config.root_dir, "val.csv"), index=False)

            logger.info(f"Train, Test and Validation Pre-processed Data saved at: {self.config.root_dir} ")
        except Exception as e:
            raise CustomException(str(e))

        

In [55]:
config = ConfigurationManager()
preprocess_config = config.data_preprocessing_config()
data_preprocessing = DataPreProcessing(preprocess_config)
data_preprocessing.load_data()

[2025-02-26 01:27:42,259]: INFO: common : Read YAML File: params.yaml
[2025-02-26 01:27:42,261]: INFO: common : Read YAML File: config\config.yaml
[2025-02-26 01:27:42,263]: INFO: common : Directory has been Created: artifacts
[2025-02-26 01:27:42,264]: INFO: common : Directory has been Created: artifacts\data_preprocessed
[2025-02-26 01:27:42,265]: INFO: common : Directory has been Created: artifacts\data_preprocessed\train
[2025-02-26 01:27:42,266]: INFO: common : Directory has been Created: artifacts\data_preprocessed\test
[2025-02-26 01:27:42,267]: INFO: common : Directory has been Created: artifacts\data_preprocessed\val
[2025-02-26 01:27:42,295]: INFO: 34648359 : Label CSV File Loaded: artifacts\data_ingestion\label.csv
[2025-02-26 01:27:42,312]: INFO: 34648359 : Data Splitted: Train (31212), Test (6480), Validatio (5508)


  Resize(height=self.config.img_height, width=self.config.img_width, always_apply=True),
  Resize(height=self.config.img_height, width=self.config.img_width, always_apply=True),
  Resize(height=self.config.img_height, width=self.config.img_width, always_apply=True),


[2025-02-26 01:36:13,846]: INFO: 34648359 : Data Pre-Processing Completed
[2025-02-26 01:36:13,952]: INFO: 34648359 : Train, Test and Validation Pre-processed Data saved at: artifacts\data_preprocessed 
