In [1]:
import os

In [2]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Change to the main directory
# So, it's executed from main directory
os.chdir("../")

In [4]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Data Preprocessing Config

This code will be apply in `src/MLProject/entity/config_entity.py`

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    root_dir: Path
    data_path: Path
    input_train_path: Path
    input_test_path: Path
    output_train_path: Path
    output_test_path: Path
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    root_dir: Path
    input_train_path: Path
    input_test_path: Path
    scaled_train_path: Path
    scaled_test_path: Path
    model_dir: Path
    scaler_model_path: Path

### Data Preprocessing Config Manager

This code will be apply in `src/MLProject/config/configurations.py`.

In [6]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_dump_data_config(self) -> DataDumpConfig:
        """read data dump config file and store as config entity
        then apply the dataclasses
        
        Returns:
            config: PreprocessingConfig type
        """
        dump_config = self.config.dump_data
        ingest_config = self.config.data_ingestion
        dataset_params = self.params

        create_directories([dump_config.root_dir])

        config = DataDumpConfig(
            root_dir=dump_config.root_dir,
            data_path=ingest_config.data_path,
            input_train_path=dump_config.input_train_path,
            input_test_path=dump_config.input_test_path,
            input_valid_path=dump_config.input_valid_path,
            output_train_path=dump_config.output_train_path,
            output_test_path=dump_config.output_test_path,
            output_valid_path=dump_config.output_valid_path,
            params_test_size=dataset_params.TEST_SIZE,
            params_valid_size=dataset_params.VALID_SIZE
        )

        return config
    
    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """read preprocessing config file and store as config entity
        then apply the dataclasses
        
        Returns:
            config: PreprocessingConfig type
        """
        dump_config = self.config.dump_data
        scaler_config = self.config.scale_data
        train_config = self.config.train_model

        create_directories([scaler_config.root_dir, train_config.root_dir])

        config = DataPreprocessingConfig(
            root_dir=scaler_config.root_dir,
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            input_valid_path=Path(dump_config.input_valid_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            scaled_valid_path=Path(scaler_config.scaled_valid_path),
            model_dir=train_config.root_dir,
            scaler_model_path=Path(scaler_config.scaler_model_path)
        )

        return config

### Perform Preprocessing

This code in `src/MLProject/components/preprocessing.py`.

What we would do?
+ Drop null values
+ Splitting the dataset to train and test data
+ Scalling dataset using `Standard Scaler`

As stated before; let’s load, select columns, and drop null values from dataset.

In [12]:
import pandas as pd

df = pd.read_csv('artifacts/data-ingestion/credit_card.csv')

df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [13]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from MLProject import logger

class Preprocessing:
    def __init__(self, config: DataDumpConfig):
        self.config = config

    def dump_data(self) -> None:
        """dump the splited dataset to data training and testing
        """
        logger.info(f"Read reviews file.")
        dataset = pd.read_csv(self.config.data_path)
        dataset = dataset.drop(columns=['id']).copy()
        dataset.dropna(inplace=True)
        
        logger.info(f"Split data file to data train and test-valid.")
        X_train, X_test_valid, y_train, y_test_valid = train_test_split(
            dataset.drop(columns=["Class"]), 
            dataset["Class"], 
            test_size=(self.config.params_test_size + self.config.params_valid_size),
            stratify=dataset["Class"],
        )

        logger.info(f"Split data file to data test and valid.")
        X_test, X_valid, y_test, y_valid = train_test_split(
            X_test_valid, 
            y_test_valid, 
            test_size=self.config.params_valid_size / (self.config.params_test_size + self.config.params_valid_size),
            stratify=y_test_valid,
        )
        
        # NOTE: data save as pandas dataframe and y as series
        logger.info(f"Dump data train into {self.config.root_dir} directory.")
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)
        X_valid.to_pickle(self.config.input_valid_path)
        
        # NOTE: data save as pandas dataframe and y as serie
        logger.info(f"Dump data test into {self.config.root_dir} directory.")
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)
        y_valid.to_pickle(self.config.output_valid_path)
        
    def scaling_data(self) -> None:
        """scaling the splited dataset and dump vectorizer model
        """
        scaler = StandardScaler()
        
        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)
        
        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = joblib.load(self.config.input_test_path)

        logger.info(f"Load data test in {self.config.input_valid_path}.")
        X_valid = joblib.load(self.config.input_valid_path)
        
        logger.info(f"scaled the data.")
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_valid_scaled = scaler.transform(X_valid)
        
        logger.info(f"Dump the scaled data.")
        joblib.dump(X_train_scaled, self.config.scaled_train_path)
        joblib.dump(X_test_scaled, self.config.scaled_test_path)
        joblib.dump(X_valid_scaled, self.config.scaled_valid_path)
        
        logger.info(f"Creating {self.config.model_dir} directory.")
        model_dir = str(self.config.model_dir)
        os.makedirs(model_dir, exist_ok=True)
        
        logger.info(f"Save the scaler model.")
        joblib.dump(scaler, self.config.scaler_model_path)  

### Dump the Data Train and Data Test

This code in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [14]:
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    logger.error(e)
    raise e

[2024-07-21 20:55:27,133: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-21 20:55:27,137: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-21 20:55:27,138: INFO: common: created directory at: artifacts]
[2024-07-21 20:55:27,142: INFO: common: created directory at: artifacts/data]
[2024-07-21 20:55:27,144: ERROR: 1521379623: DataDumpConfig.__init__() got an unexpected keyword argument 'input_valid_path']


TypeError: DataDumpConfig.__init__() got an unexpected keyword argument 'input_valid_path'

**Debug**: Read data

In [11]:
X_train = joblib.load(dump_data_config.input_train_path)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
248425,1.754927,-0.393407,-0.135828,-0.448529,0.482552,-0.037760,0.526623,-0.180708,0.779576,0.449311,...,-0.413421,-0.167299,-0.387322,0.173231,1.341959,0.006559,-1.153600,-0.265986,-0.184989,4475.75
368877,1.350748,0.220422,-0.453574,0.830867,1.506291,-0.491155,0.968543,-0.216497,-0.534833,0.214951,...,-0.279746,-0.156183,-0.302623,-0.276316,-0.726265,1.108809,0.631058,-0.244019,0.108113,22117.95
532244,-0.841089,0.374503,-0.201332,0.214072,-0.062069,-0.249024,-0.082528,-0.612379,1.258322,0.497275,...,-0.498229,0.536935,-0.038184,-0.533327,0.830661,-0.601900,-0.813047,-2.722369,-1.311557,468.40
481608,-1.720216,1.581100,-1.688087,1.322418,-1.997766,-1.429178,-2.140528,0.245107,-1.850792,-2.100497,...,0.234697,-0.199256,1.055361,-0.184031,1.666981,-0.053483,-0.478214,-1.792901,-1.318596,9334.90
349450,-0.404037,-0.219671,0.813908,0.241831,0.398790,0.539169,1.074539,-0.577045,0.622878,2.089588,...,-1.011491,0.056856,0.122865,0.134410,1.241832,-0.761747,-0.188068,-2.035071,0.514193,3412.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303825,-0.193397,-0.078099,-0.440035,0.968583,-0.590128,1.707636,1.738593,-0.618271,-0.626989,0.167888,...,0.030509,-0.470086,0.421709,1.100014,-0.235584,-1.738505,-0.119052,0.307710,-1.162891,18912.44
554311,-1.041764,-0.100731,-0.894626,0.976506,-0.378641,-1.275961,-1.061479,0.509417,-0.975342,-1.099466,...,-0.047407,0.340712,-0.001662,-0.968152,-0.731330,-0.511095,0.137207,1.878921,0.490004,12395.53
465933,-0.886378,0.787433,-0.425658,0.416940,0.527014,0.921393,-0.443577,-1.615920,1.720843,0.488760,...,-1.990262,2.491013,-1.893197,0.678938,0.269405,-0.785934,-0.610359,-0.142763,1.951391,11597.58
516417,-0.249557,-0.659285,-0.050171,0.690608,-0.339590,0.520996,0.695975,-0.065356,-0.342146,-0.031197,...,1.250261,0.138683,-0.424106,0.816490,0.509086,-0.306592,-0.603388,-0.159258,0.396689,14099.10


In [12]:
X_train.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [13]:
y_train = joblib.load(dump_data_config.output_train_path)
y_train

248425    0
368877    1
532244    1
481608    1
349450    1
         ..
303825    1
554311    1
465933    1
516417    1
493682    1
Name: Class, Length: 454904, dtype: int64

In [14]:
X_test = joblib.load(dump_data_config.input_test_path)
X_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
52110,-0.116818,-0.248589,2.922757,-0.063179,0.461436,1.225138,0.491299,-0.049917,0.146354,0.636162,...,-0.235489,-0.071978,0.385581,-0.139144,-0.514381,-0.515858,-0.398453,-0.271864,-0.375929,20576.25
109127,-0.112475,-0.431780,1.426174,-2.061761,-0.193818,-0.311142,0.274667,-0.103169,-0.661350,0.903608,...,-0.715801,-0.127235,-0.081664,-0.132992,1.477290,-0.147477,-0.946635,-0.396348,-0.066150,9824.14
476534,-0.686366,0.513716,-0.367999,0.633684,-0.205819,-0.005135,-0.564524,-0.234850,-0.764342,-0.759013,...,-0.155669,0.815267,-0.254633,-0.237173,-0.240431,-0.215776,-0.161853,0.597339,0.724689,58.84
515122,-2.555837,3.215724,-2.447316,1.269472,-3.164991,-2.830061,-2.400773,5.552248,-1.074798,-1.187134,...,1.481279,0.572102,-1.801065,-1.277487,0.371164,3.558455,-0.487390,1.580922,0.813345,23918.72
442807,-0.074895,0.420404,-0.756136,0.458728,-0.121655,-1.188452,-0.127484,0.024477,-0.709669,-0.798536,...,0.636476,0.179774,0.128855,-0.074680,-0.301984,0.349960,1.405818,0.381513,0.351455,2273.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29683,-0.056498,-0.257266,0.805537,-1.534281,0.337074,0.410124,0.547526,-0.122747,1.782357,0.279671,...,-0.112098,-0.105475,0.425974,-0.291806,-1.256130,-0.304725,-1.625209,0.366358,0.586947,18132.86
558364,-0.066242,-0.273777,0.105378,-0.475602,-0.075110,0.311033,0.324552,-0.113194,0.252111,-0.136220,...,0.107537,0.035610,0.311072,0.294875,0.431774,-0.616818,0.661082,0.059775,0.124470,23272.48
300052,-2.072565,2.363135,-2.044458,1.188158,-2.418887,-2.393441,-1.957068,3.516053,-1.201753,-1.345101,...,1.352673,0.549575,-1.179747,-0.778238,0.266523,2.095308,-0.530051,2.134751,1.072106,22390.92
277732,1.593532,-0.347313,0.084152,0.639841,0.684134,0.950815,0.527477,-0.161853,0.166379,1.250833,...,-0.251511,-0.082301,0.290137,-0.136611,0.201527,0.658163,0.436224,-0.278750,-0.208974,4463.31


In [15]:
X_test.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [16]:
y_test = joblib.load(dump_data_config.output_test_path)
y_test

52110     0
109127    0
476534    1
515122    1
442807    1
         ..
29683     0
558364    1
300052    1
277732    0
6158      0
Name: Class, Length: 113726, dtype: int64

### Scaling the Data Train and Data Test

This code in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [17]:
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.scaling_data()
except Exception as e:
    logger.error(e)
    raise e

[2024-07-21 20:12:20,687: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-21 20:12:20,691: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-21 20:12:20,693: INFO: common: created directory at: artifacts]
[2024-07-21 20:12:20,694: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-21 20:12:20,695: INFO: common: created directory at: artifacts/models]
[2024-07-21 20:12:20,696: INFO: 3949480610: Load data train in artifacts/data/X_train.pkl.]
[2024-07-21 20:12:20,855: INFO: 3949480610: Load data test in artifacts/data/X_test.pkl.]
[2024-07-21 20:12:20,878: INFO: 3949480610: Vectorize the data.]
[2024-07-21 20:12:21,132: INFO: 3949480610: Dump the scaled data.]
[2024-07-21 20:12:21,538: INFO: 3949480610: Creating artifacts/models directory.]
[2024-07-21 20:12:21,540: INFO: 3949480610: Save the scaler model.]


**Debug**: Read data

In [18]:
X_train_vec = joblib.load(preprocessing_config.scaled_train_path)
X_train_vec

array([[ 1.75521599, -0.39302408, -0.13637258, ..., -0.26420433,
        -0.18408708, -1.0936573 ],
       [ 1.35097936,  0.22074015, -0.45392315, ..., -0.24229097,
         0.10690423,  1.457165  ],
       [-0.84117124,  0.37480516, -0.20183609, ..., -2.71464132,
        -1.30254183, -1.67306564],
       ...,
       [-0.88646666,  0.7876924 , -0.42602433, ..., -0.14127958,
         1.93690777, -0.06393749],
       [-0.24955533, -0.65887459, -0.05076821, ..., -0.15773552,
         0.3934018 ,  0.2977483 ],
       [-1.9691726 ,  1.99697615, -2.16753983, ...,  2.96903466,
        -3.55339649, -0.93980146]])

In [19]:
X_test_vec = joblib.load(preprocessing_config.scaled_test_path)
X_test_vec

array([[-0.11679648, -0.24822155,  2.92033489, ..., -0.27006837,
        -0.37365203,  1.23425614],
       [-0.11245329, -0.43139367,  1.42467032, ..., -0.39425118,
        -0.0661032 , -0.32035281],
       [-0.68642663,  0.51400399, -0.36840066, ...,  0.59703068,
         0.71903992, -1.73228244],
       ...,
       [-2.07282409,  2.36322879, -2.04383069, ...,  2.13072093,
         1.06395485,  1.49663275],
       [ 1.59379803, -0.34693534,  0.08347262, ..., -0.27693775,
        -0.20789894, -1.09545595],
       [ 0.94884217, -0.35626505,  1.04764398, ..., -0.22492686,
        -0.03539971, -0.35156468]])