In [1]:
import os

In [2]:
pwd

'c:\\Users\\asbpi\\Desktop\\ASB\\Data_Science\\Projects\\MY Projects\\Stroke-Risk-Prediction\\research'

In [3]:
# Move from the notebook folder up to the project root so that the relative
# paths used later (config files, artifacts) resolve. The guard makes the cell
# idempotent: re-running it no longer climbs one more directory each time.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
pwd

'c:\\Users\\asbpi\\Desktop\\ASB\\Data_Science\\Projects\\MY Projects\\Stroke-Risk-Prediction'

In [5]:
# Entity

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessConfig:
    """Immutable settings for the data-preprocessing stage.

    Instances are built by ``ConfigurationManager.get_data_preprocess_config``
    from the ``data_preprocess`` section of the project configuration and are
    read by ``DataPreprocess.preprocess``.
    """
    # Stage directory; it is created before this object is built.
    root_dir: Path
    # Directory containing the raw dataset; joined with ``dataset_name`` to read the CSV.
    data_dir: Path
    # File name of the raw CSV inside ``data_dir``.
    dataset_name: Path
    # Path the cleaned CSV is written to.
    save_data_file: Path

In [7]:
# Configuration Manager

In [8]:
from stroke_risk.constants import *
from stroke_risk.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_preprocess_config(self) -> DataPreprocessConfig:
        """Build the settings object for the data-preprocessing stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataPreprocessConfig`` populated from the ``data_preprocess``
            section of the loaded configuration.
        """
        stage_cfg = self.config.data_preprocess

        create_directories([stage_cfg.root_dir])

        # The entity declares every field as Path, so coerce the raw config
        # values here instead of passing them through unchanged.
        return DataPreprocessConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_dir=Path(stage_cfg.data_dir),
            dataset_name=Path(stage_cfg.dataset_name),
            save_data_file=Path(stage_cfg.save_data_file),
        )

In [10]:
# Components

In [11]:
import pandas as pd
import numpy as np
from stroke_risk import logger

In [12]:
class DataPreprocess:
    """Cleans the raw dataset and writes the cleaned CSV to disk."""

    def __init__(self, config: DataPreprocessConfig):
        """Keep the stage configuration for later use by ``preprocess``.

        Args:
            config: Settings naming the input CSV and the output file.
        """
        self.config = config


    def preprocess(self):
        """Read the raw CSV, clean it and save the result.

        Steps, in order:
          1. Cast ``age`` to int64 and the categorical columns to ``category``.
          2. Drop rows whose gender is ``'Other'``; map smoking status
             ``'Unknown'`` to ``'formerly smoked'`` and work type
             ``'children'`` to ``'Never_worked'``.
          3. Bin ``age`` and drop rows that fall outside every bin.
          4. Fill missing ``bmi`` with the mean ``bmi`` of the row's
             gender/age-bin group.
          5. Drop ``id`` and the helper columns, then write the CSV without
             the index.

        Returns:
            None. The cleaned data is written to ``config.save_data_file``.
        """
        data_dir = self.config.data_dir
        dataset_name = self.config.dataset_name
        save_data_file = self.config.save_data_file

        data = pd.read_csv(Path(data_dir, dataset_name))

        # Casting to int64 truncates fractional ages.
        data['age'] = data['age'].astype(np.int64)
        cata_col = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
        data[cata_col] = data[cata_col].astype('category')
        logger.info('Datatype Fixed')

        # Take an explicit copy after filtering so the column assignments
        # below act on an independent frame rather than on a slice of the
        # original (the SettingWithCopy pattern).
        data = data[data['gender'] != 'Other'].copy()
        data['smoking_status'] = data['smoking_status'].replace('Unknown', 'formerly smoked')
        data['work_type'] = data['work_type'].replace('children', 'Never_worked')
        logger.info('Data Substituted')

        # NOTE(review): pd.cut's intervals are right-closed and the first one
        # is open on the left, so age 0 (every age below 1 after the int cast)
        # gets no bin and is removed by the filter below. Confirm this is
        # intended.
        data['age_bin'] = pd.cut(data['age'], bins=[0, 35, 50, 65, 75, np.inf], labels=['0-35', '36-50', '51-65', '65-75', '75+'])
        data = data[data['age_bin'].notna()].copy()

        data['gender_age'] = data['gender'].astype(str) + '_' + data['age_bin'].astype(str)

        logger.info('age-bin and gender-age created')

        # transform('mean') returns a Series aligned with ``data``, so fillna
        # replaces each missing bmi with the mean of that row's group.
        mean_bmi = data.groupby('gender_age')['bmi'].transform('mean')
        data['bmi'] = data['bmi'].fillna(mean_bmi)

        logger.info('Null value in bmi imputed')

        col_to_drop = ['id', 'age_bin', 'gender_age']
        data = data.drop(columns=col_to_drop)

        logger.info('Unnessary columns deleted')

        # to_csv accepts the path directly; the former single-argument
        # os.path.join was a no-op.
        data.to_csv(save_data_file, index=False)

        logger.info(f'File saved in {save_data_file}')

In [13]:
# Pipeline

In [14]:
# Run the preprocessing stage end to end. Errors are left to propagate as-is:
# the former `except Exception as e: raise e` handled nothing and only added
# a re-raise frame to the traceback.
config = ConfigurationManager()
data_preprocess_config = config.get_data_preprocess_config()
data_preprocess = DataPreprocess(config=data_preprocess_config)
data_preprocess.preprocess()

[2023-12-23 20:36:13,367: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-23 20:36:13,370: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-23 20:36:13,375: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-23 20:36:13,382: INFO: common: created directory at: artifacts]
[2023-12-23 20:36:13,384: INFO: common: created directory at: artifacts/data_preprocess]
[2023-12-23 20:36:13,406: INFO: 2889185664: Datatype Fixed]
[2023-12-23 20:36:13,420: INFO: 2889185664: Data Substituted]
[2023-12-23 20:36:13,433: INFO: 2889185664: age-bin and gender-age created]
[2023-12-23 20:36:13,440: INFO: 2889185664: Null value in bmi imputed]
[2023-12-23 20:36:13,443: INFO: 2889185664: Unnessary columns deleted]
[2023-12-23 20:36:13,507: INFO: 2889185664: File saved in artifacts/data_preprocess/data.csv]
