In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed in the next cell.
%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject/research'

In [3]:
# Step one directory up so the relative paths used later in this notebook
# ("config/config.yaml", "params.yaml", "schema.yaml") resolve from there.
# NOTE(review): the move is relative to the current directory, so re-running
# this cell without restarting the kernel goes up one more level each time.
os.chdir("../")
%pwd

'/Users/a/Documents/DataScience_World/ML10_end_to_end/dsproject/CompleteDSproject'

In [4]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from src.datascience import logger
from src.datascience.utils.common import read_yaml, create_directories

@dataclass
class DataTransformationConfig:
    """Paths and settings consumed by the data transformation stage.

    Positional order is root_dir, data_path, preprocessor_path,
    transformed_data_path, then the optional target_column.
    """

    # Directory that receives every artefact written by the stage.
    root_dir: Path
    # CSV file that the stage reads as its input.
    data_path: Path
    # Destination of the joblib-dumped, fitted preprocessor.
    preprocessor_path: Path
    # Destination of the transformed dataset (CSV).
    transformed_data_path: Path
    # Name of the label column; used when the caller does not supply one.
    target_column: str = "health_risk"


In [5]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings record for the data ingestion stage.

    Not referenced by any other cell visible in this notebook.
    """

    # Directory associated with the ingestion stage.
    root_dir: Path
    # Location the data is obtained from (presumably a download URL -- confirm).
    source_URL: str
    # Path of the locally stored data file.
    local_data_file: Path

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=Path("config/config.yaml"),
        params_filepath=Path("params.yaml"),
        schema_filepath=Path("schema.yaml")
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the transformation settings.

        The input CSV path comes from the ``data_ingestion`` section, the
        output locations are derived from the ``data_transformation`` root
        directory, and the target column name comes from the schema file.
        """
        stage_section = self.config.data_transformation

        # The directory is created before any path object is derived from it.
        create_directories([stage_section.root_dir])

        stage_root = Path(stage_section.root_dir)
        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=Path(self.config.data_ingestion.local_data_file),
            preprocessor_path=stage_root / "preprocessor.joblib",
            transformed_data_path=stage_root / "transformed_data.csv",
            target_column=self.schema.TARGET_COLUMN.name
        )


In [6]:
class DataTransformation:
    """Fit the preprocessing pipeline on the input CSV and persist the results.

    Categorical feature columns are label encoded, the detection date is split
    into year / month / day, and the target column is label encoded separately.

    NOTE(review): ``CustomLabelEncoder`` and ``DateFeatureExtractor`` are nested,
    duck-typed transformers (they only define ``fit`` / ``transform``). The
    dumped preprocessor contains instances of them, so whatever loads the
    joblib file needs the same class definitions available -- confirm how the
    artefact is consumed.
    """

    # Feature columns that are label encoded (target and ID columns excluded).
    CATEGORICAL_COLUMNS = (
        'product_name', 'brand', 'category', 'adulterant',
        'detection_method', 'severity', 'action_taken',
    )
    # Column parsed as a date and replaced by year / month / day.
    DATE_COLUMNS = ('detection_date',)
    # Row identifier that is dropped before fitting.
    ID_COLUMN = 'adulteration_id'

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration and create an unfitted target encoder."""
        self.config = config
        self.label_encoders = {}
        self.target_encoder = LabelEncoder()

    def get_data_transformer(self):
        """
        Creates the data transformation pipeline

        Returns:
            An unfitted ``ColumnTransformer`` that label encodes the
            categorical columns, expands the date column, and passes any
            remaining columns through unchanged.
        """
        try:
            # Create preprocessing steps for categorical and date features
            categorical_transformer = Pipeline(steps=[
                ('label_encoder', self.CustomLabelEncoder())
            ])

            date_transformer = Pipeline(steps=[
                ('date_converter', self.DateFeatureExtractor())
            ])

            # Combine all transformers; fresh lists keep the class constants
            # from being shared with (or mutated through) the transformer.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('cat', categorical_transformer, list(self.CATEGORICAL_COLUMNS)),
                    ('date', date_transformer, list(self.DATE_COLUMNS))
                ],
                remainder='passthrough'
            )

            return preprocessor

        except Exception as e:
            logger.error(f"Error in creating data transformer: {str(e)}")
            raise

    class CustomLabelEncoder:
        """Custom transformer for label encoding with handling for unknown values"""
        def __init__(self):
            # One fitted LabelEncoder per input column, keyed by column label.
            self.encoders = {}

        def fit(self, X, y=None):
            """Fit a separate ``LabelEncoder`` on every column of ``X``."""
            X = pd.DataFrame(X)
            for column in X.columns:
                self.encoders[column] = LabelEncoder()
                self.encoders[column].fit(X[column])
            return self

        def transform(self, X):
            """Replace each value by its class index; unseen values become -1."""
            X = pd.DataFrame(X)
            X_encoded = X.copy()
            for column in X.columns:
                encoder = self.encoders[column]
                # Build the value -> index table once per column instead of
                # scanning classes_ and calling encoder.transform per cell.
                # Labels that do not equal themselves (NaN) are left out so
                # they keep mapping to -1, as an equality-based membership
                # test would treat them.
                lookup = {
                    label: code
                    for code, label in enumerate(encoder.classes_)
                    if label == label
                }
                X_encoded[column] = [lookup.get(value, -1) for value in X[column]]
            return X_encoded

    class DateFeatureExtractor:
        """Custom transformer for extracting features from dates"""
        def fit(self, X, y=None):
            """Nothing to learn; present so the object can be used as a pipeline step."""
            return self

        def transform(self, X):
            """Parse the first column of ``X`` as dates and return year / month / day."""
            X = pd.DataFrame(X)
            dates = pd.to_datetime(X.iloc[:, 0])
            return pd.DataFrame({
                'year': dates.dt.year,
                'month': dates.dt.month,
                'day': dates.dt.day
            })

    def transform_data(self):
        """
        Transforms the data using the preprocessing pipeline

        Reads the CSV at ``config.data_path``, fits the preprocessor and the
        target encoder, and writes three artefacts: the preprocessor, the
        transformed CSV, and ``target_encoder.joblib`` under ``config.root_dir``.

        Returns:
            DataFrame holding the encoded features plus a ``target`` column.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            # Read the data
            df = pd.read_csv(self.config.data_path)

            # Separate features and target
            X = df.drop(columns=[self.config.target_column, self.ID_COLUMN])
            y = df[self.config.target_column]

            # Create and fit the preprocessor
            preprocessor = self.get_data_transformer()
            X_transformed = preprocessor.fit_transform(X)

            # Transform target variable
            y_transformed = self.target_encoder.fit_transform(y)

            # Create feature names. Columns not claimed by a transformer are
            # passed through and appended on the right in their input order,
            # so they must be named too or the frame construction below fails.
            handled = set(self.CATEGORICAL_COLUMNS) | set(self.DATE_COLUMNS)
            passthrough_columns = [col for col in X.columns if col not in handled]
            feature_names = (
                [f"{col}_0" for col in self.CATEGORICAL_COLUMNS]
                + ['year', 'month', 'day']
                + passthrough_columns
            )

            # Convert to DataFrame
            transformed_df = pd.DataFrame(
                X_transformed,
                columns=feature_names
            )
            transformed_df['target'] = y_transformed

            # Save the preprocessor and transformed data
            create_directories([self.config.root_dir])
            joblib.dump(preprocessor, self.config.preprocessor_path)
            transformed_df.to_csv(self.config.transformed_data_path, index=False)

            # Save target encoder
            joblib.dump(self.target_encoder,
                        Path(self.config.root_dir) / "target_encoder.joblib")

            return transformed_df

        except Exception as e:
            logger.error(f"Error in transforming data: {str(e)}")
            raise

In [7]:
# Run the data transformation stage: load the configuration, build the stage
# object, transform the data and log the shape of the result.

try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
        
    logger.info("Starting data transformation...")
    # Returns the transformed DataFrame; the stage also writes its artefacts to disk.
    transformed_data = data_transformation.transform_data()
    logger.info("Data transformation completed successfully")
    logger.info(f"Transformed data shape: {transformed_data.shape}")
        
except Exception as e:
    # Log at notebook level, then re-raise so the cell still fails visibly.
    logger.error(f"Error in data transformation: {str(e)}")
    raise e

[2025-01-08 12:11:31,687: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-01-08 12:11:31,690: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-08 12:11:31,694: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-08 12:11:31,695: INFO: common: created directory at: artifacts]
[2025-01-08 12:11:31,698: INFO: common: created directory at: artifacts/data_transformation]
[2025-01-08 12:11:31,699: INFO: 314665959: Starting data transformation...]
[2025-01-08 12:11:32,577: INFO: common: created directory at: artifacts/data_transformation]
[2025-01-08 12:11:32,587: INFO: 314665959: Data transformation completed successfully]
[2025-01-08 12:11:32,588: INFO: 314665959: Transformed data shape: (1000, 11)]
