In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed below.
%pwd

'/home/adhitizki/playground/pacmann/e2e_mlops_credit/notebooks'

In [3]:
# Change to the project's main directory so the relative paths used below
# (config/config.yaml, artifacts/...) resolve from there.
# Guarded on the current folder name: re-running this cell must not climb
# one more level up each time it is executed.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'/home/adhitizki/playground/pacmann/e2e_mlops_credit'

### Data Preprocessing Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for the step that splits and dumps the dataset.

    Built by ``ConfigurationManager.get_dump_data_config`` and consumed by
    ``Preprocessing.dump_data``.
    """

    # Directory created for this step's artifacts.
    root_dir: Path
    # CSV file that is read and split.
    data_path: Path
    # Pickle destinations for the train / test feature frames.
    input_train_path: Path
    input_test_path: Path
    # Pickle destinations for the train / test target series.
    output_train_path: Path
    output_test_path: Path
    # Forwarded to train_test_split(test_size=...).
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the feature-scaling step.

    Built by ``ConfigurationManager.get_preprocessing_data_config`` and
    consumed by ``Preprocessing.scaling_data``.
    """

    # Directory created for this step's artifacts.
    root_dir: Path
    # Pickled train / test feature frames produced by the dump step.
    input_train_path: Path
    input_test_path: Path
    # Destinations for the scaled train / test arrays.
    scaled_train_path: Path
    scaled_test_path: Path
    # Directory that is created before the scaler is saved.
    model_dir: Path
    # Destination for the fitted scaler object.
    scaler_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [6]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Build the typed config entity for each pipeline step from YAML files.

    The project config and the parameter file are loaded once at construction.
    Each ``get_*_config`` method then creates that step's root directory and
    returns the matching frozen dataclass.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the project config YAML.
            params_filepath: location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_dump_data_config(self) -> DataDumpConfig:
        """Read the data dump section of the config and wrap it in its entity.

        Creates the dump step's root directory as a side effect.

        Returns:
            config: DataDumpConfig type
        """
        dump_config = self.config.dump_data
        ingest_config = self.config.ingest_from_sql
        dataset_params = self.params

        create_directories([dump_config.root_dir])

        # Every location is wrapped in Path so the entity matches its
        # annotations and both getters hand out the same types.
        config = DataDumpConfig(
            root_dir=Path(dump_config.root_dir),
            data_path=Path(ingest_config.data_path),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            output_train_path=Path(dump_config.output_train_path),
            output_test_path=Path(dump_config.output_test_path),
            params_test_size=dataset_params.TEST_SIZE
        )

        return config

    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Read the scaling section of the config and wrap it in its entity.

        The input paths come from the dump step's section and the model
        directory from the training section. Creates the scaling step's root
        directory as a side effect.

        Returns:
            config: DataPreprocessingConfig type
        """
        dump_config = self.config.dump_data
        scaler_config = self.config.scaler_data
        train_config = self.config.train_model

        create_directories([scaler_config.root_dir])

        config = DataPreprocessingConfig(
            root_dir=Path(scaler_config.root_dir),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            scaled_train_path=Path(scaler_config.scaled_train_path),
            scaled_test_path=Path(scaler_config.scaled_test_path),
            model_dir=Path(train_config.root_dir),
            scaler_model_path=Path(scaler_config.scaler_model_path)
        )

        return config

### Perform Preprocessing

This code goes in `src/MLProject/components/preprocessing.py`.

What will we do?
+ Drop null values
+ Split the dataset into train and test data
+ Scale the dataset using `StandardScaler`

As stated above, let’s load the dataset, select the columns we need, and drop null values.

In [8]:
import pandas as pd

# Load the ingested CSV and preview its first rows.
df = pd.read_csv('artifacts/data-ingestion/data.csv')

df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [9]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from MLProject import logger

class Preprocessing:
    """Split the ingested dataset and standardise its features.

    One instance serves one pipeline step: ``dump_data`` reads the fields of a
    ``DataDumpConfig`` and ``scaling_data`` reads the fields of a
    ``DataPreprocessingConfig``, so pass the entity that matches the method
    you intend to call.
    """

    def __init__(self, config: "DataDumpConfig | DataPreprocessingConfig"):
        # Stored untouched; each method picks the attributes it needs.
        self.config = config

    def dump_data(self) -> None:
        """Clean the dataset, split it into train/test and pickle the parts.

        Reads the CSV at ``config.data_path``, drops the ``id`` column and any
        row with a missing value, then performs a split stratified on the
        ``Class`` column. Features are pickled as DataFrames and targets as
        Series to the four paths given by the config.
        """
        logger.info("Read dataset file.")
        dataset = (
            pd.read_csv(self.config.data_path)
            .drop(columns=["id"])
            .dropna()
        )

        logger.info("Split data file to data train and test.")
        # NOTE(review): no random_state is passed, so the split is not
        # reproducible between runs - consider adding a seed to the config.
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.drop(columns=["Class"]),
            dataset["Class"],
            test_size=self.config.params_test_size,
            stratify=dataset["Class"],
        )

        # NOTE: features are saved as pandas DataFrames
        logger.info(
            f"Dump features (X_train, X_test) into {self.config.root_dir} directory."
        )
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)

        # NOTE: targets are saved as pandas Series
        logger.info(
            f"Dump targets (y_train, y_test) into {self.config.root_dir} directory."
        )
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)

    def scaling_data(self) -> None:
        """Standardise the dumped features and persist the fitted scaler.

        Loads the train/test feature pickles, fits a ``StandardScaler`` on the
        training features only, transforms both splits, and dumps the scaled
        arrays and the scaler with joblib.
        """
        scaler = StandardScaler()

        # joblib.load is kept for loading: it also reads the pickles that
        # dump_data wrote with DataFrame.to_pickle.
        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)

        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = joblib.load(self.config.input_test_path)

        logger.info("Scale the data.")
        # Fit on the training split only; the test split reuses its statistics.
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        logger.info("Dump the scaled data.")
        joblib.dump(X_train_scaled, self.config.scaled_train_path)
        joblib.dump(X_test_scaled, self.config.scaled_test_path)

        logger.info(f"Creating {self.config.model_dir} directory.")
        os.makedirs(str(self.config.model_dir), exist_ok=True)

        logger.info("Save the scaler model.")
        joblib.dump(scaler, self.config.scaler_model_path)

### Dump the Data Train and Data Test

This code goes in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [10]:
# Run the dump step: build its config, then split and pickle the dataset.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # logger.exception records the traceback together with the message.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-08 23:45:05,602: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-08 23:45:05,605: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-08 23:45:05,607: INFO: common: created directory at: artifacts]
[2024-07-08 23:45:05,609: INFO: common: created directory at: artifacts/data]
[2024-07-08 23:45:05,610: INFO: 3949480610: Read reviews file.]
[2024-07-08 23:45:09,112: INFO: 3949480610: Split data file to data train and test.]
[2024-07-08 23:45:09,709: INFO: 3949480610: Dump data train into artifacts/data directory.]
[2024-07-08 23:45:09,838: INFO: 3949480610: Dump data test into artifacts/data directory.]


**Debug**: Read data

In [11]:
# Reload the training features written by dump_data() to inspect the dump.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
325383,-1.585916,1.328054,-1.607021,1.372654,-1.851692,-1.920890,-1.898775,1.227844,-1.863764,-2.023053,...,-0.038603,0.431274,0.000897,-0.236513,1.162772,-0.217475,0.579077,-0.716873,-1.169708,21496.78
243312,1.223159,-0.837426,-0.068499,-0.419974,0.146596,-0.231319,0.694539,-0.267728,0.662563,0.597854,...,0.348070,0.056318,0.527768,-0.288558,0.044553,-0.110456,1.550757,-0.400068,-0.132805,14683.02
475129,-0.343425,0.480864,-0.910956,0.210400,-0.830749,0.034147,0.340472,0.109524,0.300224,-0.797792,...,-1.053411,0.074486,0.453183,0.305951,-0.544083,-1.976121,-1.746312,0.110644,-0.410026,5919.11
291944,-1.600727,1.295482,-1.564553,1.807110,-1.960160,-1.384706,-1.805404,1.991244,-2.217033,-1.879710,...,-0.907509,0.881012,0.295516,-0.355751,1.238811,-0.008768,0.607287,-0.915790,-1.824577,2346.66
564915,-0.275026,0.476846,-0.498773,0.352888,-0.319713,-1.196001,-0.369601,0.172437,0.154254,-0.853172,...,0.182543,0.069208,-0.494912,0.111910,0.747072,-0.448274,0.694221,0.406078,0.417514,18957.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236610,1.580930,-0.635414,0.293287,-0.495845,0.076159,0.465479,0.260182,-0.118079,1.216798,0.590527,...,-0.261485,-0.144692,-0.357402,0.345896,1.545408,-0.830335,-1.743169,-0.220845,-0.160005,12028.57
340987,-0.392925,0.088845,-0.890587,0.995373,-0.517287,-0.820671,-0.630739,0.131452,-1.297437,-1.225239,...,1.888041,0.412633,-0.707973,-0.734913,-0.633580,1.092553,2.160232,1.495936,1.819714,14861.01
556907,-1.972490,2.499348,-1.925283,2.178969,-2.268546,0.030497,-2.779425,-1.174666,-2.450072,-2.313520,...,1.998108,-2.192913,2.079451,1.550237,0.782019,-1.806698,-0.321890,-2.731089,-1.324606,14249.02
360063,-0.723259,0.205870,-0.338720,0.078709,-0.205245,-0.648866,-0.342213,-0.407804,0.363461,-0.501401,...,0.123765,0.401216,-0.467623,-0.388315,0.142222,-0.017830,-0.248056,-0.769034,-0.410425,20255.23


In [12]:
# Missing values per training feature column.
X_train.isna().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [13]:
# Reload the training targets written by dump_data() to inspect the dump.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

325383    1
243312    0
475129    1
291944    1
564915    1
         ..
236610    0
340987    1
556907    1
360063    1
366060    1
Name: Class, Length: 113726, dtype: int64

In [14]:
# Reload the test features written by dump_data() to inspect the dump.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
191966,1.559703,-1.181942,-0.282706,-1.334765,0.003233,-0.662962,0.604883,-0.324066,-0.492257,1.499070,...,-0.061493,0.012607,0.634085,-0.423998,0.248187,0.795191,0.601112,-0.381693,-0.188971,6778.58
41857,-0.075582,-0.519871,0.926084,-2.307943,0.940884,2.390777,0.165702,0.117028,0.187697,0.671752,...,-0.416342,-0.301792,-0.815539,-0.112187,2.263960,-0.344441,1.697693,-0.029036,0.304163,3096.42
324435,0.479809,0.303414,-0.593477,0.961782,0.040170,-0.478638,-0.267872,-0.010870,-0.701402,-0.603421,...,0.337605,0.029925,-0.772600,-0.006949,-0.847223,0.538681,-0.400107,0.843853,0.902038,18763.47
401097,-0.481875,0.299117,-0.327928,-0.103305,-0.280778,-0.380007,-0.148056,-0.597529,0.310821,-0.449330,...,0.353263,0.550130,-0.664561,0.029501,0.088516,-0.154014,0.540915,-0.484762,-1.807161,2610.92
357564,0.365944,-0.030908,-0.144444,0.354887,-0.149982,-0.061394,-0.207680,0.002968,-0.199652,-0.313510,...,0.169612,0.106036,0.064926,-0.215600,-0.425369,0.877507,0.920785,0.467385,0.637022,12402.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245629,1.580235,-0.511135,-0.056884,-0.049193,0.311623,-0.207951,0.665029,-0.225787,0.800311,0.712848,...,-0.389031,-0.082426,0.035901,-0.053587,-0.067355,0.426164,-1.345978,-0.281769,-0.214768,10512.99
323665,-0.857513,1.712464,-1.646441,2.061574,-0.845240,-1.689317,-1.442951,1.156138,-1.738449,-1.732035,...,1.222707,0.636083,0.173948,0.538109,-1.932568,-0.826499,1.334529,2.103167,1.635153,1117.32
148696,0.251678,-0.101657,0.357316,-0.795755,0.751785,0.057092,0.780956,-0.191907,1.346382,0.173240,...,-0.149042,-0.299716,-0.896310,0.110875,0.943586,-0.782456,0.194758,-0.004794,0.090851,22297.38
202804,1.188326,-0.938100,0.046625,-0.566960,0.221483,0.888563,0.366105,-0.136433,1.159700,0.506839,...,0.378586,0.058554,0.609676,-0.283519,-0.523376,-0.125285,-1.465226,-0.235907,-0.076511,7346.37


In [15]:
# Missing values per test feature column.
X_test.isna().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

In [16]:
# Reload the test targets written by dump_data() to inspect the dump.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

191966    0
41857     0
324435    1
401097    1
357564    1
         ..
245629    0
323665    1
148696    0
202804    0
113935    0
Name: Class, Length: 454904, dtype: int64

### Scaling the Data Train and Data Test

This code goes in `src/MLProject/pipeline/step_02_preprocessing.py`.

In [17]:
# Run the scaling step: build its config, then scale and dump the features.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.scaling_data()
except Exception as e:
    # logger.exception records the traceback together with the message.
    logger.exception(e)
    # A bare raise re-raises the active exception unchanged.
    raise

[2024-07-08 23:45:26,131: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-08 23:45:26,136: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-08 23:45:26,138: INFO: common: created directory at: artifacts]
[2024-07-08 23:45:26,140: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-08 23:45:26,142: INFO: 3949480610: Load data train in artifacts/data/X_train.pkl.]
[2024-07-08 23:45:26,186: INFO: 3949480610: Load data test in artifacts/data/X_test.pkl.]
[2024-07-08 23:45:26,335: INFO: 3949480610: Vectorize the data.]
[2024-07-08 23:45:26,538: INFO: 3949480610: Dump the scaled data.]
[2024-07-08 23:45:26,660: INFO: 3949480610: Creating artifacts/models directory.]
[2024-07-08 23:45:26,662: INFO: 3949480610: Save the scaler model.]


**Debug**: Read data

In [28]:
# Reload the scaled training array written by scaling_data() to inspect it.
X_train_vec = joblib.load(preprocessing_config.scaled_train_path)
X_train_vec

array([[ 0.06094922, -0.18955873,  1.25855013, ..., -0.07782055,
         0.36339232,  0.89936518],
       [-0.86139932,  1.51590095, -1.38999813, ...,  2.37708441,
         1.95973634, -1.60156251],
       [-0.34312652,  0.00904238,  0.53247851, ..., -1.06482397,
        -0.9147499 ,  1.16602877],
       ...,
       [ 1.02240489, -0.72607063,  0.89021836, ..., -0.21311237,
        -0.09062099,  1.46457377],
       [-1.21386698,  0.56859304, -1.17354336, ...,  1.46915613,
        -0.62218705, -0.36572278],
       [ 1.50064404, -0.50091578, -0.11137851, ..., -0.26387942,
        -0.10947182,  0.0956501 ]])

In [29]:
# Reload the scaled test array written by scaling_data() to inspect it.
X_test_vec = joblib.load(preprocessing_config.scaled_test_path)
X_test_vec

array([[-0.30739972, -0.04872451,  0.09215893, ..., -0.65114668,
         0.27250065, -0.60068706],
       [-0.82413668,  0.4721215 , -0.76207498, ..., -2.47648042,
         1.20070583, -0.08544962],
       [-0.32464549,  0.49081557, -0.53260638, ...,  0.09390874,
         0.23441367, -1.72451695],
       ...,
       [ 1.05715967, -0.32790458,  0.51067391, ..., -0.22684353,
        -0.06757386,  1.17800236],
       [ 0.98327302,  0.29918207, -0.70332098, ...,  0.3324606 ,
         0.55968714,  1.13639322],
       [ 1.00375308, -0.26000417,  0.28410007, ..., -0.28613909,
        -0.02183575, -1.00276282]])