In [1]:
import os
# Show the kernel's current working directory; the relative paths used by the
# cells below resolve against it.
%pwd

'/home/armando-albornoz/Desktop/ml/MLOPS_course/project1/datascienceendtoend1/research'

In [2]:
os.chdir("../")
%pwd

'/home/armando-albornoz/Desktop/ml/MLOPS_course/project1/datascienceendtoend1'

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """
    Configuration class for data transformation operations.

    Attributes:
        root_dir: Directory where transformation artifacts will be stored
        data_path: Path to the validated data file
        test_size: Proportion of data to use for testing (default: 0.2)
        random_state: Random seed for reproducibility (default: 42)
    """
    root_dir: Path
    data_path: Path
    # The defaults below are the ones the docstring has always advertised;
    # declaring them keeps explicit callers working while letting the two
    # split parameters be omitted.
    test_size: float = 0.2
    random_state: int = 42


In [4]:
from src.datascience.constants import * 
from src.datascience.utils.common import read_yaml, create_directories
from src.datascience import logger
from src.datascience.config.configuration import DataIngestionConfig

class ConfigurationManager:
    """
    Loads the project's YAML settings and builds per-stage config objects.

    The config, params and schema files are read once when the manager is
    constructed; each ``get_*_config`` method then turns one section of the
    main config into the matching configuration object.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath (Path): Path to the main configuration file
            params_filepath (Path): Path to the parameters file
            schema_filepath (Path): Path to the schema file

        Raises:
            Exception: Any error raised while reading the files or creating
                the directory is logged and then re-raised unchanged.
        """
        try:
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)
            self.schema = read_yaml(schema_filepath)

            # Every stage writes below this directory, so create it up front.
            artifacts_root = self.config.artifacts_root
            create_directories([artifacts_root])
            logger.info("ConfigurationManager initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing ConfigurationManager: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data-ingestion configuration.

        Reads the ``data_ingestion`` section of the main config and creates
        that stage's root directory before returning.

        Returns:
            DataIngestionConfig: Settings populated from the config section.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the data-transformation configuration.

        Reads the ``data_transformation`` section of the main config and
        creates that stage's root directory before returning.

        Returns:
            DataTransformationConfig: Settings populated from the config section.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            test_size=section.test_size,
            random_state=section.random_state,
        )

[2025-08-02 15:47:06,288: INFO: __init__: Logger initialized for the datascience package.]


In [5]:
import os
from src.datascience import logger
from sklearn.model_selection import train_test_split
import pandas as pd

class DataTransformation:
    """
    Component for data transformation operations including train-test split.
    """

    def __init__(self, config: DataTransformationConfig):
        """
        Store the configuration used by the transformation steps.

        Args:
            config: Supplies ``data_path``, ``root_dir``, ``test_size`` and
                ``random_state`` for the split.
        """
        self.config = config

    def train_test_split_(self):
        """
        Split the data into training and testing sets and save both as CSV.

        Reads the CSV at ``config.data_path``, splits it with
        ``train_test_split`` using ``config.test_size`` and
        ``config.random_state``, and writes ``train.csv`` and ``test.csv``
        (without the index column) into ``config.root_dir``. The output
        directory is created if it does not exist yet.

        Raises:
            FileNotFoundError: If the data file doesn't exist
            Exception: If there is any other error while loading, splitting
                or saving; it is logged and re-raised unchanged
        """
        try:
            # Load data
            logger.info(f"Loading data from: {self.config.data_path}")
            data = pd.read_csv(self.config.data_path)
            logger.info(f"Data loaded successfully. Original shape: {data.shape}")

            # Split the data into train and test
            logger.info("Splitting the data")
            train, test = train_test_split(
                data,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )
            logger.info("Splitted data into training and test sets")

            # Do not rely on some other component having created the output
            # directory. An empty root_dir means "current directory", which
            # needs no creation.
            output_dir = self.config.root_dir
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            train.to_csv(os.path.join(output_dir, "train.csv"), index=False)
            test.to_csv(os.path.join(output_dir, "test.csv"), index=False)

            # Report through the logger only, so the location and shapes are
            # emitted once and on the same channel as the other messages.
            logger.info(f"Files saved to: {output_dir}")
            logger.info(train.shape)
            logger.info(test.shape)

        except FileNotFoundError as e:
            logger.error(f"Data file not found: {e}")
            raise
        except Exception as e:
            logger.error(f"Error during train-test split: {e}")
            raise

In [6]:
# Run the data-transformation stage end to end.
# There is deliberately no try/except here: a handler that only re-raises adds
# nothing, and ConfigurationManager.__init__ and train_test_split_ already log
# their own failures before propagating them.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_split_()

[2025-08-02 16:22:03,587: INFO: common: YAML file: config/config.yaml loaded successfully]
[2025-08-02 16:22:03,588: INFO: common: YAML file: params.yaml loaded successfully]
[2025-08-02 16:22:03,589: INFO: common: YAML file: schema.yaml loaded successfully]
[2025-08-02 16:22:03,590: INFO: common: Created directory at artifacts]
[2025-08-02 16:22:03,590: INFO: 276801081: ConfigurationManager initialized successfully]
[2025-08-02 16:22:03,591: INFO: common: Created directory at artifacts/data_transformation]
[2025-08-02 16:22:03,591: INFO: 2586220882: Loading data from: artifacts/data_ingestion/Employers_data.csv]
[2025-08-02 16:22:03,608: INFO: 2586220882: Data loaded successfully. Original shape: (10000, 10)]
[2025-08-02 16:22:03,610: INFO: 2586220882: Splitting the data]
[2025-08-02 16:22:03,616: INFO: 2586220882: Splitted data into training and test sets]
Files saved to: artifacts/data_transformation
[2025-08-02 16:22:03,750: INFO: 2586220882: (8000, 10)]
[2025-08-02 16:22:03,751: I