In [1]:
import os

In [2]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Change to the main (project root) directory so that relative paths such as
# `config/config.yaml` and `artifacts/...` resolve from there.
# The guard keeps the cell idempotent: re-running it after the first move
# must not climb one more level up the tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Data Preprocessing Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for the step that splits the dataset and pickles the parts.

    Built by ``ConfigurationManager.get_dump_data_config`` and read by
    ``Preprocessing.dump_data``.
    """
    # Directory of the dump step (``dump_data.root_dir`` in the config file).
    root_dir: Path
    # CSV file that is read and split (``data_ingestion.data_path``).
    data_path: Path
    # Pickle destinations for the feature frames (X) of each split.
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Pickle destinations for the ``Class`` target series (y) of each split.
    output_train_path: Path
    output_test_path: Path
    output_valid_path: Path
    # ``TEST_SIZE`` / ``VALID_SIZE`` from the params file; their sum is passed
    # as ``test_size`` to the first train/hold-out split.
    params_test_size: float
    params_valid_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the feature-scaling step.

    Built by ``ConfigurationManager.get_preprocessing_data_config`` and read by
    ``Preprocessing.scaling_data``.
    """
    # Directory of the scaling step (``scale_data.root_dir`` in the config file).
    root_dir: Path
    # Pickled feature frames produced by the dump step (taken from ``dump_data``).
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Destinations for the scaled feature arrays of each split.
    scaled_train_path: Path
    scaled_test_path: Path
    scaled_valid_path: Path
    # Model directory (``train_model.root_dir``); created before the scaler is saved.
    model_dir: Path
    # Destination of the fitted scaler object.
    scaler_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [6]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project YAML files and turn their sections into config entities.

    The config file supplies directories and file paths; the params file
    supplies the dataset split sizes. Each ``get_*`` method creates the
    directories its step needs and returns a frozen dataclass.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the config YAML file.
            params_filepath: location of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_dump_data_config(self) -> DataDumpConfig:
        """Read the data dump section of the config file and store it as a
        config entity.

        Creates the dump step's root directory as a side effect.

        Returns:
            config: DataDumpConfig type
        """
        dump_config = self.config.dump_data
        ingest_config = self.config.data_ingestion
        dataset_params = self.params

        create_directories([dump_config.root_dir])

        # Every path field is declared as ``Path`` on the entity, so convert
        # the raw YAML values here, as get_preprocessing_data_config does.
        config = DataDumpConfig(
            root_dir=Path(dump_config.root_dir),
            data_path=Path(ingest_config.data_path),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            input_valid_path=Path(dump_config.input_valid_path),
            output_train_path=Path(dump_config.output_train_path),
            output_test_path=Path(dump_config.output_test_path),
            output_valid_path=Path(dump_config.output_valid_path),
            params_test_size=dataset_params.TEST_SIZE,
            params_valid_size=dataset_params.VALID_SIZE
        )

        return config

    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Read the scaling section of the config file and store it as a
        config entity.

        Creates the scaling step's root directory and the model directory as
        a side effect.

        Returns:
            config: DataPreprocessingConfig type
        """
        dump_config = self.config.dump_data
        scaler_config = self.config.scale_data
        train_config = self.config.train_model

        create_directories([scaler_config.root_dir, train_config.root_dir])

        config = DataPreprocessingConfig(
            root_dir=Path(scaler_config.root_dir),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            input_valid_path=Path(dump_config.input_valid_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            scaled_valid_path=Path(scaler_config.scaled_valid_path),
            model_dir=Path(train_config.root_dir),
            scaler_model_path=Path(scaler_config.scaler_model_path)
        )

        return config

### Perform Preprocessing

This code goes in `src/MLProject/components/preprocessing.py`.

What will we do?
+ Drop null values
+ Split the dataset into train, test, and validation data
+ Scale the dataset using `StandardScaler`

As stated before, let’s load the dataset, select columns, and drop null values.

In [8]:
import pandas as pd

# Load the ingested CSV; the path is relative to the project root.
df = pd.read_csv('artifacts/data-ingestion/credit_card.csv')

# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,437378,0.420468,-0.070194,-0.569266,0.191673,-0.009607,0.426903,-0.356728,0.096143,0.077806,...,0.134969,0.070433,0.04777,-0.851622,0.102876,-0.375436,0.820807,0.665983,8633.18,1
1,504222,-0.238944,0.250929,-0.374408,0.152938,-0.105008,-0.039028,-0.293004,0.133771,-0.591631,...,0.192405,0.289441,-0.255187,-0.817462,0.308284,1.582688,0.574425,0.478489,12299.55,1
2,4794,-0.117796,-0.147961,2.130455,-0.325762,0.325616,0.271351,0.772625,-0.244342,1.240012,...,-0.271739,-0.404654,-0.121235,0.857659,0.54192,0.756534,-0.238177,-0.403038,5215.87,0
3,388411,-0.855315,0.137014,-0.628116,0.613733,-0.643573,-0.664283,-0.88004,0.466586,-1.045508,...,0.405505,0.16756,0.446262,-0.205976,0.492582,0.658619,1.609128,-0.025592,19282.98,1
4,424512,0.257686,0.035247,-0.203112,0.506745,-0.242235,-0.192608,-0.289297,0.044488,-0.396122,...,0.162191,0.165912,-0.181999,0.331451,1.043095,0.029799,0.643273,0.736723,19114.27,1


In [9]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from MLProject import logger

class Preprocessing:
    """Split the ingested dataset and scale its features.

    The class backs two pipeline steps that read different config entities:

    * ``dump_data`` reads ``DataDumpConfig`` fields (``data_path``, the
      ``input_*`` / ``output_*`` paths and the split sizes).
    * ``scaling_data`` reads ``DataPreprocessingConfig`` fields (the
      ``input_*`` / ``scaled_*`` paths, ``model_dir`` and
      ``scaler_model_path``).
    """

    # Fixed seed used for both splits so that re-running the step reproduces
    # the same train/test/valid partitions.
    RANDOM_STATE = 42

    def __init__(self, config: "DataDumpConfig | DataPreprocessingConfig"):
        """Store the config entity whose fields the step methods read.

        Args:
            config: ``DataDumpConfig`` when ``dump_data`` will be called,
                ``DataPreprocessingConfig`` when ``scaling_data`` will be
                called.
        """
        self.config = config

    def dump_data(self) -> None:
        """Split the dataset into train, test and valid sets and pickle them.

        Reads the CSV at ``config.data_path``, drops the ``id`` column and
        every row that contains a null, then performs two splits stratified
        on ``Class``: train vs. (test + valid), then test vs. valid.
        Features are written to the ``input_*`` paths and targets to the
        ``output_*`` paths.
        """
        logger.info("Read dataset file.")
        dataset = (
            pd.read_csv(self.config.data_path)
            .drop(columns=["id"])
            .dropna()
        )

        test_size = self.config.params_test_size
        valid_size = self.config.params_valid_size
        holdout_size = test_size + valid_size

        logger.info("Split data file to data train and test-valid.")
        X_train, X_test_valid, y_train, y_test_valid = train_test_split(
            dataset.drop(columns=["Class"]),
            dataset["Class"],
            test_size=holdout_size,
            stratify=dataset["Class"],
            random_state=self.RANDOM_STATE,
        )

        # The second split works on the hold-out part only, so the valid size
        # is rescaled to a share of (test + valid).
        logger.info("Split data file to data test and valid.")
        X_test, X_valid, y_test, y_valid = train_test_split(
            X_test_valid,
            y_test_valid,
            test_size=valid_size / holdout_size,
            stratify=y_test_valid,
            random_state=self.RANDOM_STATE,
        )

        # NOTE: features are saved as pandas DataFrames
        logger.info(f"Dump features (X) into {self.config.root_dir} directory.")
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)
        X_valid.to_pickle(self.config.input_valid_path)

        # NOTE: targets are saved as pandas Series
        logger.info(f"Dump targets (y) into {self.config.root_dir} directory.")
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)
        y_valid.to_pickle(self.config.output_valid_path)

    def scaling_data(self) -> None:
        """Scale the split feature sets and dump the fitted scaler model.

        The ``StandardScaler`` is fitted on the training features only and
        then applied unchanged to the test and valid features. The scaled
        arrays go to the ``scaled_*`` paths and the scaler itself to
        ``scaler_model_path``.
        """
        scaler = StandardScaler()

        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)

        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = joblib.load(self.config.input_test_path)

        logger.info(f"Load data valid in {self.config.input_valid_path}.")
        X_valid = joblib.load(self.config.input_valid_path)

        logger.info("scaled the data.")
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_valid_scaled = scaler.transform(X_valid)

        logger.info("Dump the scaled data.")
        joblib.dump(X_train_scaled, self.config.scaled_train_path)
        joblib.dump(X_test_scaled, self.config.scaled_test_path)
        joblib.dump(X_valid_scaled, self.config.scaled_valid_path)

        # Make sure the model directory exists before the scaler is written.
        logger.info(f"Creating {self.config.model_dir} directory.")
        os.makedirs(str(self.config.model_dir), exist_ok=True)

        logger.info("Save the scaler model.")
        joblib.dump(scaler, self.config.scaler_model_path)

### Dump the Train, Test, and Validation Data

This code goes in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [10]:
# Build the dump-step config and write the train/test/valid pickles.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # logger.exception records the traceback along with the message,
    # so the log alone is enough to locate the failure.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-24 20:53:49,411: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-24 20:53:49,415: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-24 20:53:49,418: INFO: common: created directory at: artifacts]
[2024-07-24 20:53:49,420: INFO: common: created directory at: artifacts/data]
[2024-07-24 20:53:49,422: INFO: 3933285488: Read reviews file.]
[2024-07-24 20:53:49,810: INFO: 3933285488: Split data file to data train and test-valid.]
[2024-07-24 20:53:49,873: INFO: 3933285488: Split data file to data test and valid.]
[2024-07-24 20:53:49,883: INFO: 3933285488: Dump data train into artifacts/data directory.]
[2024-07-24 20:53:49,901: INFO: 3933285488: Dump data test into artifacts/data directory.]


**Debug**: Read data

In [11]:
# Reload the training features that `dump_data` pickled, to inspect them.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
40560,-0.606188,0.042910,-0.653307,1.296325,2.403087,-1.724521,0.204214,-0.200625,-1.097473,-0.505038,...,-0.317482,-0.071103,0.102092,0.703532,-0.537791,1.779387,1.378051,0.340615,-0.287612,5058.50
17030,0.039617,-0.122287,1.315971,-0.647920,0.262272,0.071734,0.643968,-0.125851,0.195151,0.470638,...,-0.082959,-0.192707,-0.605974,-0.041019,0.548366,-0.429177,0.144161,0.045170,0.109076,13880.76
47377,-0.767186,0.726053,-0.614696,1.120234,-0.791218,-0.233513,-0.950387,0.585912,-1.356075,-0.926251,...,0.412879,0.526474,0.974542,-0.252210,0.033566,-0.512466,0.714766,0.449056,0.623465,8789.27
47058,-0.289064,0.619993,-0.734958,1.041007,-0.173575,-0.715364,-0.598869,0.081948,-1.150742,-1.093434,...,0.846932,0.318796,0.069890,-0.451402,-0.715167,0.961237,1.344099,1.269687,1.007395,12388.53
44228,-0.329835,0.001587,0.197118,0.337499,0.109434,-0.317566,-0.135162,0.012745,-0.247663,-0.324182,...,-0.099860,0.061168,-0.196193,-0.348874,0.285685,0.367806,-0.329220,0.422200,0.474238,4975.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15316,-1.289071,0.432992,-1.042407,0.999901,-0.756605,-0.316872,-1.315465,-0.297598,-1.215548,-1.295429,...,0.170882,-0.465900,0.823063,-0.109617,-0.873770,-0.569032,-0.331719,1.011456,0.654774,7956.04
14762,-1.071709,0.874377,-0.998591,1.517927,-1.309743,-1.064708,-1.486060,0.915845,-1.917249,-1.694891,...,0.740918,0.705942,0.149469,-0.061734,1.284946,0.045184,0.280920,2.083297,1.027698,16807.26
41,-0.023786,0.011606,-0.074335,0.341224,0.209782,0.337235,-0.065902,0.014201,-0.348783,-0.215936,...,0.048293,0.062631,0.095934,-0.465785,-1.226540,0.907561,0.051885,0.249423,0.434155,5835.14
14461,-0.266711,-0.989152,1.225030,-1.054525,2.012747,2.276832,0.071171,0.124571,1.266773,0.339567,...,0.260807,-0.049009,0.311172,-0.085574,2.408900,0.179955,0.847481,-0.332244,-0.275698,11490.24


In [12]:
# Count missing values per column; `dump_data` drops null rows before splitting.
X_train.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [13]:
# Reload the training targets (`Class`) that `dump_data` pickled.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

40560    1
17030    0
47377    1
47058    1
44228    1
        ..
15316    1
14762    1
41       1
14461    0
32521    1
Name: Class, Length: 39804, dtype: int64

In [14]:
# Reload the test features that `dump_data` pickled, to inspect them.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
40836,1.602382,-0.508243,-0.121497,-0.441644,0.379312,0.038122,0.493986,-0.193561,1.191804,0.256160,...,-0.145063,-0.189177,-0.541065,0.116179,1.017617,-0.202895,-0.268531,-0.263501,-0.095802,13392.33
4409,-1.804101,1.423515,-1.938136,2.300602,-1.224214,-1.333085,-2.061291,-0.295748,-2.000241,-1.954043,...,0.864161,-1.162500,0.664955,-1.151392,-1.529782,-0.177202,1.440314,3.582961,-3.040183,11683.97
52893,-0.723593,0.474055,0.089372,-0.673716,-0.439202,-0.209688,-0.019147,0.543071,0.206557,0.637355,...,-0.161108,-0.133625,-0.694344,0.331487,0.974091,0.069667,0.196125,-0.200953,-0.042141,15704.16
30924,-1.089966,1.406100,-1.479548,1.593309,-1.498762,-2.150589,-1.644531,1.121994,-1.293817,-1.899642,...,0.327317,0.782427,0.443702,0.458831,0.789477,-0.540106,-0.463575,-0.510875,0.803468,5873.39
46879,-0.192840,0.457027,-0.874618,0.435372,-0.477221,-0.803457,-0.341185,0.212458,-0.676896,-0.646424,...,0.082671,0.322422,0.835149,0.128620,0.266981,-0.487012,0.563420,0.127734,0.001304,8829.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51227,0.247414,0.446563,-0.415373,0.707124,2.382367,0.217849,0.778184,-0.424715,-0.679894,-0.145366,...,-0.178266,-0.111965,-0.709566,-0.706425,-1.377310,1.387377,0.704564,0.052430,0.913077,5562.86
49432,-0.180910,-0.013008,0.810914,-0.935192,0.464002,-0.012794,0.788255,-0.204784,0.886838,0.905086,...,0.169563,-0.210690,-0.244446,0.276207,0.068493,-1.850671,-0.561073,0.376931,0.567338,22525.05
20925,1.356709,-0.757911,-0.395405,-0.870553,1.505402,2.115533,0.349940,0.031415,1.075083,0.249470,...,0.145387,-0.157435,-0.659003,0.022326,1.246947,-0.251278,-0.193394,-0.265611,-0.074441,13946.88
8629,-1.005875,0.755674,-1.085828,1.590640,-1.188284,-1.455095,-1.453409,0.775681,-1.925220,-1.700825,...,0.785450,0.737393,0.554169,-0.369455,1.159878,-0.471235,0.556048,2.753041,0.742794,22527.13


In [15]:
# Count missing values per column in the test features.
X_test.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [16]:
# Reload the test targets (`Class`) that `dump_data` pickled.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

40836    0
4409     1
52893    0
30924    1
46879    1
        ..
51227    1
49432    0
20925    0
8629     1
47886    1
Name: Class, Length: 8529, dtype: int64

### Scaling the Train, Test, and Validation Data

This code goes in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [17]:
# Build the scaling-step config, then scale the features and save the scaler.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.scaling_data()
except Exception as e:
    # logger.exception records the traceback along with the message,
    # so the log alone is enough to locate the failure.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-24 20:53:50,083: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-24 20:53:50,086: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-24 20:53:50,088: INFO: common: created directory at: artifacts]
[2024-07-24 20:53:50,091: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-24 20:53:50,093: INFO: common: created directory at: artifacts/models]
[2024-07-24 20:53:50,095: INFO: 3933285488: Load data train in artifacts/data/X_train.pkl.]
[2024-07-24 20:53:50,107: INFO: 3933285488: Load data test in artifacts/data/X_test.pkl.]
[2024-07-24 20:53:50,110: INFO: 3933285488: Load data test in artifacts/data/X_valid.pkl.]
[2024-07-24 20:53:50,114: INFO: 3933285488: scaled the data.]
[2024-07-24 20:53:50,135: INFO: 3933285488: Dump the scaled data.]
[2024-07-24 20:53:50,188: INFO: 3933285488: Creating artifacts/models directory.]
[2024-07-24 20:53:50,188: INFO: 3933285488: Save the scaler model.]


**Debug**: Read data

In [18]:
# Reload the scaled training features dumped by `scaling_data`.
X_train_vec = joblib.load(preprocessing_config.scaled_train_path)
X_train_vec

array([[-0.60216117,  0.04147381, -0.65380256, ...,  0.34606264,
        -0.290806  , -1.00947662],
       [ 0.04370281, -0.12103418,  1.3138411 , ...,  0.05053922,
         0.11650905,  0.2640311 ],
       [-0.76317381,  0.71349712, -0.61522366, ...,  0.45453181,
         0.6446775 , -0.47093384],
       ...,
       [-0.01970582,  0.01067977, -0.07531117, ...,  0.25484612,
         0.45029613, -0.89736736],
       [-0.26265262, -0.97378934,  1.22297519, ..., -0.32697326,
        -0.27857278, -0.08104439],
       [-0.49770878, -0.67489172, -0.18472197, ...,  0.2702715 ,
         0.64281521,  0.67461769]])

In [19]:
# Reload the scaled test features dumped by `scaling_data`.
X_test_vec = joblib.load(preprocessing_config.scaled_test_path)
X_test_vec

array([[ 1.6066124 , -0.50070763, -0.12243421, ..., -0.25821212,
        -0.09385802,  0.19352543],
       [-1.80018498,  1.39960629, -1.93756409, ...,  3.5892603 ,
        -3.11711529, -0.05307914],
       [-0.71957719,  0.46560119,  0.08825985, ..., -0.19564825,
        -0.03875948,  0.52724189],
       ...,
       [ 1.36091676, -0.74631194, -0.39611382, ..., -0.26032303,
        -0.07192404,  0.27357563],
       [-1.00188475,  0.74263593, -1.08596389, ...,  2.7591222 ,
         0.76720312,  1.51214881],
       [-1.5967886 ,  1.40501152, -1.59130851, ..., -1.54981078,
        -1.01970954, -0.76168836]])