In [1]:
import os

In [2]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Run from the project root so that relative paths (config/, artifacts/) resolve.
# Guarded on the directory name so that re-running this cell does not keep
# climbing above the project root.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Data Preprocessing Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for the split-and-dump step.

    Built by ``ConfigurationManager.get_dump_data_config`` and consumed by
    ``Preprocessing.dump_data``.
    """
    # Directory of the dump step; created by the configuration manager.
    root_dir: Path
    # Source CSV, taken from the ``data_ingestion`` config section.
    data_path: Path
    # Pickle targets for the feature frames (train / test / valid).
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Pickle targets for the ``Class`` target series (train / test / valid).
    output_train_path: Path
    output_test_path: Path
    output_valid_path: Path
    # Split fractions read from the params file (TEST_SIZE / VALID_SIZE).
    params_test_size: float
    params_valid_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the feature-scaling step.

    Built by ``ConfigurationManager.get_preprocessing_data_config`` and
    consumed by ``Preprocessing.scaling_data``.
    """
    # Directory of the scaling step; created by the configuration manager.
    root_dir: Path
    # Feature pickles produced by the dump step (train / test / valid).
    input_train_path: Path
    input_test_path: Path
    input_valid_path: Path
    # Where the scaled feature arrays are dumped (train / test / valid).
    scaled_train_path: Path
    scaled_test_path: Path
    scaled_valid_path: Path
    # Directory taken from the ``train_model`` config section.
    model_dir: Path
    # File the fitted scaler is dumped to.
    scaler_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [16]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Load the project YAML files and build the per-step config entities."""

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: path of the main config YAML.
            params_filepath: path of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_dump_data_config(self) -> DataDumpConfig:
        """Build the config entity for the split-and-dump step.

        Creates the dump step's root directory as a side effect.

        Returns:
            config: DataDumpConfig type
        """
        dump_config = self.config.dump_data
        ingest_config = self.config.data_ingestion
        dataset_params = self.params

        create_directories([dump_config.root_dir])

        # Every path is wrapped in Path so the entity matches its annotations
        # and agrees with get_preprocessing_data_config.
        config = DataDumpConfig(
            root_dir=Path(dump_config.root_dir),
            data_path=Path(ingest_config.data_path),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            input_valid_path=Path(dump_config.input_valid_path),
            output_train_path=Path(dump_config.output_train_path),
            output_test_path=Path(dump_config.output_test_path),
            output_valid_path=Path(dump_config.output_valid_path),
            params_test_size=dataset_params.TEST_SIZE,
            params_valid_size=dataset_params.VALID_SIZE
        )

        return config
    
    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Build the config entity for the feature-scaling step.

        Creates the scaling and model root directories as a side effect.

        Returns:
            config: DataPreprocessingConfig type
        """
        dump_config = self.config.dump_data
        scaler_config = self.config.scale_data
        train_config = self.config.train_model

        create_directories([scaler_config.root_dir, train_config.root_dir])

        config = DataPreprocessingConfig(
            root_dir=Path(scaler_config.root_dir),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            input_valid_path=Path(dump_config.input_valid_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            scaled_valid_path=Path(scaler_config.scaled_valid_path),
            model_dir=Path(train_config.root_dir),
            scaler_model_path=Path(scaler_config.scaler_model_path)
        )

        return config

### Perform Preprocessing

This code lives in `src/MLProject/components/preprocessing.py`.

What will we do?
+ Drop null values
+ Split the dataset into train, test, and validation data
+ Scale the dataset using `StandardScaler`

As stated before, let’s load the dataset, select columns, and drop null values.

In [18]:
import pandas as pd

# Peek at the ingested CSV; the path is relative to the project root.
df = pd.read_csv('artifacts/data-ingestion/credit_card.csv')

df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [19]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from MLProject import logger

class Preprocessing:
    """Split the dataset into train/test/valid pickles and standard-scale the features.

    The class serves two pipeline steps, each driven by its own config entity:
    ``dump_data`` reads a ``DataDumpConfig`` and ``scaling_data`` reads a
    ``DataPreprocessingConfig``.
    """

    def __init__(self, config: "DataDumpConfig | DataPreprocessingConfig"):
        """Store the config entity for whichever step will be run.

        Args:
            config: a ``DataDumpConfig`` when calling ``dump_data``, or a
                ``DataPreprocessingConfig`` when calling ``scaling_data``.
        """
        self.config = config

    def dump_data(self, random_state: "int | None" = None) -> None:
        """Split the dataset into train/test/valid sets and pickle each part.

        Reads the CSV at ``config.data_path``, drops the ``id`` column and any
        row containing nulls, then performs two splits stratified on ``Class``.

        Args:
            random_state: seed forwarded to both ``train_test_split`` calls.
                ``None`` (the default) keeps the previous unseeded behaviour;
                pass an int to make the split reproducible.
        """
        logger.info("Read dataset file.")
        dataset = pd.read_csv(self.config.data_path).drop(columns=["id"]).dropna()

        features = dataset.drop(columns=["Class"])
        target = dataset["Class"]

        # Fraction of all rows held out of training (test and valid together).
        holdout_size = self.config.params_test_size + self.config.params_valid_size

        logger.info("Split data file to data train and test-valid.")
        X_train, X_test_valid, y_train, y_test_valid = train_test_split(
            features,
            target,
            test_size=holdout_size,
            stratify=target,
            random_state=random_state,
        )

        # The second split only sees the holdout rows, so the valid share is
        # expressed relative to the holdout size.
        logger.info("Split data file to data test and valid.")
        X_test, X_valid, y_test, y_valid = train_test_split(
            X_test_valid,
            y_test_valid,
            test_size=self.config.params_valid_size / holdout_size,
            stratify=y_test_valid,
            random_state=random_state,
        )

        # NOTE: features are saved as pandas DataFrames
        logger.info(f"Dump feature sets into {self.config.root_dir} directory.")
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)
        X_valid.to_pickle(self.config.input_valid_path)

        # NOTE: targets are saved as pandas Series
        logger.info(f"Dump target sets into {self.config.root_dir} directory.")
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)
        y_valid.to_pickle(self.config.output_valid_path)

    def scaling_data(self) -> None:
        """Standard-scale the dumped feature sets and save the fitted scaler.

        The scaler is fitted on the training features only and then applied
        to the test and valid features.
        """
        scaler = StandardScaler()

        # The frames were written with DataFrame.to_pickle, so they are read
        # back with the matching pandas reader.
        # NOTE(security): pickle files must be trusted pipeline artifacts;
        # never point these paths at files from an untrusted source.
        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = pd.read_pickle(self.config.input_train_path)

        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = pd.read_pickle(self.config.input_test_path)

        logger.info(f"Load data valid in {self.config.input_valid_path}.")
        X_valid = pd.read_pickle(self.config.input_valid_path)

        logger.info("scaled the data.")
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_valid_scaled = scaler.transform(X_valid)

        logger.info("Dump the scaled data.")
        joblib.dump(X_train_scaled, self.config.scaled_train_path)
        joblib.dump(X_test_scaled, self.config.scaled_test_path)
        joblib.dump(X_valid_scaled, self.config.scaled_valid_path)

        logger.info(f"Creating {self.config.model_dir} directory.")
        os.makedirs(str(self.config.model_dir), exist_ok=True)

        logger.info("Save the scaler model.")
        joblib.dump(scaler, self.config.scaler_model_path)

### Dump the Data Train and Data Test

This code lives in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [22]:
# Run the split-and-dump step end to end.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # logger.exception records the traceback along with the message.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-21 21:01:04,672: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-21 21:01:04,677: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-21 21:01:04,679: INFO: common: created directory at: artifacts]
[2024-07-21 21:01:04,680: INFO: common: created directory at: artifacts/data]
[2024-07-21 21:01:04,683: INFO: 3933285488: Read reviews file.]
[2024-07-21 21:01:08,530: INFO: 3933285488: Split data file to data train and test-valid.]
[2024-07-21 21:01:08,930: INFO: 3933285488: Split data file to data test and valid.]
[2024-07-21 21:01:09,016: INFO: 3933285488: Dump data train into artifacts/data directory.]
[2024-07-21 21:01:09,124: INFO: 3933285488: Dump data test into artifacts/data directory.]


**Debug**: Read data

In [23]:
# Reload the training features pickled by dump_data, to inspect them.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
539542,-0.865434,0.465581,-0.587272,-0.022044,-0.346194,-0.696039,-0.369136,-0.325587,0.095468,-0.513853,...,-0.224528,0.547444,-0.492085,-0.100069,0.843336,-0.129402,0.456544,-1.233494,0.250060,8102.42
284663,1.693146,-0.512716,-0.018202,-0.314323,0.348820,0.336217,0.420116,-0.172630,1.312974,0.346047,...,-0.238029,-0.286042,-0.887840,0.208676,0.879254,-0.269516,0.579120,-0.282308,-0.155918,9887.42
379123,-0.896558,0.952383,-1.180169,1.061646,-1.406798,-0.597352,-1.131460,0.824366,-0.985156,-1.499121,...,-0.211286,0.581046,0.711542,0.367809,0.096113,-0.214489,0.223802,0.201746,-0.072632,7871.52
516022,-0.097381,0.327770,-0.808009,0.416244,-0.620445,-0.722578,-0.431444,0.212123,-0.652799,-0.608235,...,0.171754,0.323936,0.741274,0.156854,-0.071084,-0.040666,1.223442,-0.035771,-0.453318,3297.30
354686,-1.567707,1.423345,-1.488722,1.809393,-1.825161,-1.112963,-1.849314,1.046031,-1.905642,-1.854721,...,-0.369130,0.322558,-0.366835,0.463889,1.412762,-0.674550,-0.978389,-1.214063,-0.695793,12993.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387224,-1.195074,-3.823027,-0.609790,0.607151,4.401988,-3.968994,-0.776645,0.199397,0.137655,-0.499089,...,2.569114,0.342669,-0.889108,-0.299248,-1.060201,0.114178,0.879510,0.669854,-3.083106,4777.21
118672,-0.019616,-0.041373,0.920747,-0.297015,0.329352,0.397734,0.576309,-0.058084,0.046590,0.407929,...,-0.291661,-0.021570,0.495311,-0.154172,-0.446445,-0.487695,-0.664975,-0.161646,0.130211,22758.42
534010,1.620903,-0.230040,0.369676,0.668706,0.380577,0.414652,0.425673,-0.189001,0.840529,1.095530,...,-0.513937,-0.179358,-0.247031,0.193758,-0.322632,-0.259663,-0.289329,-0.290361,-0.209229,5113.74
153158,-0.342003,-0.919806,0.286880,-0.033545,1.395499,-0.570171,0.815500,-0.259149,1.147333,0.427896,...,-0.539989,-0.128469,0.525346,1.407895,-0.246653,0.226857,-1.326509,-0.059317,0.005386,6439.34


In [24]:
# Per-column null counts for the training features.
X_train.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [25]:
# Reload the training target pickled by dump_data, to inspect it.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

539542    1
284663    0
379123    1
516022    1
354686    1
         ..
387224    1
118672    0
534010    1
153158    0
556432    1
Name: Class, Length: 398041, dtype: int64

In [26]:
# Reload the test features pickled by dump_data, to inspect them.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
530897,0.180484,0.081122,-0.687799,0.526181,-0.422928,0.425467,-0.574866,-0.137837,-0.136568,-0.635641,...,0.498195,-0.059807,0.604249,0.522006,-1.415873,-1.667076,-0.551078,0.566498,0.296167,754.28
83749,0.827036,-0.735325,1.283235,-0.351278,-0.150000,0.786241,0.158040,-0.101956,0.232993,0.839164,...,-0.539008,-0.140383,0.233600,-0.025414,0.234771,0.263146,-0.537735,-0.099602,0.033533,1177.88
383028,-0.610893,0.307630,-0.675099,1.244253,1.950087,-1.409095,0.211873,-0.409980,-1.115937,-0.545804,...,-0.248209,-0.288773,0.338910,0.805936,-0.471118,1.578802,1.305293,0.470267,-0.903819,9119.73
352430,1.049404,-0.038214,0.179114,0.481182,0.553060,-0.165593,0.559286,-0.201271,0.676483,0.316642,...,-0.351011,-0.231264,-0.667515,-0.094396,0.168458,0.761291,0.029536,-0.259094,0.067775,11142.48
230969,0.246219,-0.053851,0.073081,-0.679092,0.717866,0.190741,0.529602,-0.532123,0.603012,0.052145,...,-0.546415,0.445409,-0.452824,-0.453593,-1.414579,2.070133,1.742484,-0.024900,0.547021,3334.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220143,-0.757215,0.298466,0.013222,-0.714306,-0.438084,0.037874,-0.062436,0.520732,0.893707,0.419039,...,-0.171293,0.043701,0.915655,-0.094932,0.085318,0.130280,-0.155435,0.120141,0.041036,9354.06
12411,0.797189,-0.528831,2.428695,0.479331,-0.208845,1.122464,0.026177,-0.024068,2.296544,0.621901,...,-0.408536,-0.111900,0.480726,-0.119839,1.080884,0.503560,0.419908,-0.174433,-0.036156,5703.67
408084,0.375412,0.044479,-0.230982,0.527990,-0.009407,0.045875,-0.167937,0.005292,-0.406726,-0.262386,...,0.169048,0.087560,-0.109909,-0.305636,-1.057349,1.171820,0.629104,0.460371,0.636629,23155.36
16330,1.008399,-0.891020,1.409506,-0.722798,-0.292784,0.855718,0.019091,-0.063109,0.830426,0.790463,...,-0.656976,-0.335817,-0.713081,0.007976,0.167798,0.134586,2.039663,-0.224165,-0.077715,5028.42


In [27]:
# Per-column null counts for the test features.
X_test.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [28]:
# Reload the test target pickled by dump_data, to inspect it.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

530897    1
83749     0
383028    1
352430    1
230969    0
         ..
220143    0
12411     0
408084    1
16330     0
271572    0
Name: Class, Length: 85294, dtype: int64

### Scaling the Data Train and Data Test

This code lives in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [29]:
# Run the feature-scaling step end to end.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.scaling_data()
except Exception as e:
    # logger.exception records the traceback along with the message.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-21 21:01:09,572: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-21 21:01:09,577: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-21 21:01:09,579: INFO: common: created directory at: artifacts]
[2024-07-21 21:01:09,583: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-21 21:01:09,585: INFO: common: created directory at: artifacts/models]
[2024-07-21 21:01:09,586: INFO: 3933285488: Load data train in artifacts/data/X_train.pkl.]
[2024-07-21 21:01:09,714: INFO: 3933285488: Load data test in artifacts/data/X_test.pkl.]
[2024-07-21 21:01:09,738: INFO: 3933285488: Load data test in artifacts/data/X_valid.pkl.]
[2024-07-21 21:01:09,762: INFO: 3933285488: scaled the data.]
[2024-07-21 21:01:09,909: INFO: 3933285488: Dump the scaled data.]
[2024-07-21 21:01:10,068: INFO: 3933285488: Creating artifacts/models directory.]
[2024-07-21 21:01:10,069: INFO: 3933285488: Save the scaler model.]


**Debug**: Read data

In [30]:
# Reload the scaled training features dumped by scaling_data, to inspect them.
X_train_vec = joblib.load(preprocessing_config.scaled_train_path)
X_train_vec

array([[-0.86577158,  0.46474019, -0.5875832 , ..., -1.22597813,
         0.24870818, -0.56967333],
       [ 1.69422545, -0.50991794, -0.01875457, ..., -0.28099261,
        -0.15409279, -0.311731  ],
       [-0.89691302,  0.949731  , -1.1802284 , ...,  0.19990663,
        -0.07145902, -0.60303966],
       ...,
       [ 1.62194231, -0.22829334,  0.36895898, ..., -0.28899257,
        -0.20698724, -1.00155406],
       [-0.34205126, -0.91549395,  0.28619821, ..., -0.05945493,
         0.00594836, -0.80999756],
       [-0.59089245,  0.61423027, -0.572421  , ...,  0.75834728,
         0.27054304,  1.55844054]])

In [31]:
# Reload the scaled test features dumped by scaling_data, to inspect them.
X_test_vec = joblib.load(preprocessing_config.scaled_test_path)
X_test_vec

array([[ 0.18072536,  0.08171137, -0.68806809, ...,  0.56228095,
         0.29445432, -1.63152006],
       [ 0.82763555, -0.73169858,  1.28212977, ..., -0.09947778,
         0.03387508, -1.57030752],
       [-0.61108974,  0.30737655, -0.6753727 , ...,  0.46667725,
        -0.89614176, -0.42266643],
       ...,
       [ 0.37576148,  0.0452044 , -0.23144415, ...,  0.45684577,
         0.63225257,  1.60555945],
       [ 1.00909912, -0.88681507,  1.40834809, ..., -0.22322862,
        -0.076502  , -1.01388327],
       [ 0.1890253 , -0.15193148,  0.64938649, ..., -0.25315119,
         0.17707896, -0.81942654]])