In [1]:
import os

In [2]:
# Switch to the project root so relative paths used by later cells
# (e.g. 'artifacts/data_ingestion/employee_data.csv') resolve.
# Set the ML_PROJ_ROOT environment variable to run this notebook on another
# machine; the path below is only the fallback for the original setup.
PROJECT_ROOT = os.environ.get(
    'ML_PROJ_ROOT',
    'c:\\Users\\Archana\\Desktop\\test\\assignmen_solution\\',
)
os.chdir(PROJECT_ROOT)

In [3]:
import pandas as pd

In [7]:
# Load the ingested employee data. Rows pandas cannot parse are skipped
# (on_bad_lines='skip') instead of aborting the read.
df = pd.read_csv(
    'artifacts/data_ingestion/employee_data.csv',
    on_bad_lines='skip',
)


In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,employee_id,age,gender,marital_status,salary,employment_type,region,has_dependents,tenure_years,enrolled
0,10001,60,Female,Single,55122.97,Part-time,West,No,1.5,0
1,10002,50,Female,Single,89549.66,Full-time,West,Yes,12.8,1
2,10003,36,Male,Divorced,74145.66,Part-time,Midwest,No,3.8,0
3,10004,64,Female,Married,53877.83,Full-time,Northeast,No,3.3,0
4,10005,29,Male,Single,63404.63,Contract,Midwest,Yes,10.0,0


In [9]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings used by the data-validation stage."""

    # Directory that is created for the validation stage.
    root_dir: Path
    # File the "Validation status: ..." line is written to.
    STATUS_FILE: str
    # Location of the CSV that is read and validated.
    unzip_data_dir: Path
    # Expected columns; validation consults only the keys of this mapping.
    all_schema: dict

In [11]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
Using cached scipy-1.15.2-cp310-cp310-win_amd64.whl (41.2 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0


In [10]:
from src.ml_proj.constants import *
from  src.ml_proj.utils.common import read_yaml, create_directories

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
from src.ml_proj import logger

In [13]:
class ConfigurationManager:
    """Reads the project's YAML files and builds stage-specific config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Load config, params and schema, then create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Each file is parsed by the project's read_yaml helper; the results are
        # accessed with attribute syntax below (presumably a dot-accessible
        # mapping -- confirm against read_yaml).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Create the validation directory and return the validation settings.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section of
            the config file and the ``COLUMNS`` section of the schema file.
        """
        validation_section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=expected_columns,
        )


In [17]:
class DataPreprocessing:
    """Label-encodes and standard-scales columns of a DataFrame in place.

    The frame passed to the constructor is stored by reference and modified
    directly, so the caller's DataFrame changes as well.

    Attributes:
        df: The DataFrame being transformed.
        encoders: Maps each encoded column name to the ``LabelEncoder`` fitted
            on it, so the integer codes can be mapped back to labels or the
            same encoding reapplied to new data.
        scaler: The ``StandardScaler`` fitted by the most recent
            ``scale_features`` call, or ``None`` if it has not been called.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.encoders = {}
        self.scaler = None

    def encode_categorical_columns(self):
        """Replace every object/category column with integer label codes.

        The fitted encoder for each column is kept in ``self.encoders``.

        Raises:
            Exception: Any error raised while selecting or encoding columns is
                logged and re-raised unchanged.
        """
        try:
            categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()

            for col in categorical_cols:
                # One encoder per column: a shared instance would be refitted on
                # each iteration and lose the mappings of all earlier columns.
                encoder = LabelEncoder()
                self.df[col] = encoder.fit_transform(self.df[col])
                self.encoders[col] = encoder

            logger.info("Categorical columns encoded successfully.")
        except Exception as e:
            logger.error(f"Error encoding categorical columns: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def scale_features(self, features: list):
        """Standardize the given columns in place with a ``StandardScaler``.

        Args:
            features: Names of the columns to scale.

        Raises:
            Exception: Any error raised while scaling (for example a column
                name missing from the frame) is logged and re-raised unchanged.
        """
        try:
            scaler = StandardScaler()
            self.df[features] = scaler.fit_transform(self.df[features])
            # Keep the fitted scaler so the same statistics can be reused later.
            self.scaler = scaler

            logger.info(f"Features {features} scaled successfully.")
        except Exception as e:
            logger.error(f"Error scaling features: {e}")
            raise



In [14]:
class DataValidation:
    """Checks that every column of the configured CSV is declared in the schema."""

    def __init__(self, config=DataValidationConfig):
        # NOTE(review): the default is the DataValidationConfig class itself, not
        # an instance, so callers are expected to always pass a populated config.
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate the CSV's column names against the schema keys.

        Reads the file at ``config.unzip_data_dir`` and tests each of its
        columns for membership in ``config.all_schema``. The outcome is written
        once to ``config.STATUS_FILE`` as ``Validation status: <bool>``.

        Only columns present in the file are checked; columns declared in the
        schema but absent from the file are not detected.

        Returns:
            True if every column in the file is a key of the schema, else False.

        Raises:
            Exception: Errors from reading the CSV or writing the status file
                propagate unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        expected_cols = set(self.config.all_schema.keys())

        # Collect every offending column first: deciding per column and
        # overwriting the flag would let a valid last column mask earlier failures.
        unexpected_cols = [col for col in data.columns if col not in expected_cols]
        validation_status = not unexpected_cols

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [18]:
# Driver cell: validate the dataset's columns, then label-encode and scale it.
try:
    # Initialize Configuration Manager
    config = ConfigurationManager()

    # Get data validation configuration
    data_validation_config = config.get_data_validation_config()

    # Data validation
    data_validation = DataValidation(config=data_validation_config)
    is_valid = data_validation.validate_all_columns()
    if not is_valid:
        # Surface a failed validation instead of silently discarding the result.
        # Preprocessing still runs afterwards, as it did before this check existed.
        logger.error(
            f"Data validation failed for {data_validation_config.unzip_data_dir}; "
            f"see {data_validation_config.STATUS_FILE}"
        )

    # Data preprocessing: load the same file that was just validated
    df = pd.read_csv(data_validation_config.unzip_data_dir)
    data_preprocessor = DataPreprocessing(df)

    # Encode categorical columns
    data_preprocessor.encode_categorical_columns()

    # Scale numerical features
    features_to_scale = ["age", "salary", "tenure_years"]
    data_preprocessor.scale_features(features_to_scale)

except Exception as e:
    logger.error(f"Error in processing: {e}")
    # Bare raise keeps the original traceback intact.
    raise

[2025-03-27 00:26:06,519: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-27 00:26:06,521: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-27 00:26:06,525: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-27 00:26:06,528: INFO: common: created directory at: artifacts]
[2025-03-27 00:26:06,530: INFO: common: created directory at: artifacts/data_validation]
[2025-03-27 00:26:06,599: INFO: 1313075877: Categorical columns encoded successfully.]
[2025-03-27 00:26:06,619: INFO: 1313075877: Features ['age', 'salary', 'tenure_years'] scaled successfully.]
