In [1]:
import os

In [2]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card/notebooks'

In [3]:
# Change to the main (project root) directory so that every later cell is
# executed from there.
# The guard keeps this cell idempotent: without it, re-running the cell would
# climb one directory further up each time and break the relative paths.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/home/adhitizki/playground/pacmann/mlops_credit_card/mlops-credit-card'

### Training Config

This code will be applied in `src/MLProject/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of paths and search settings used by the training step.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Artifact loaded in the debug cell as ``X_train`` (only its index is used there).
    input_train_path: Path
    # Training targets, passed as ``y`` when fitting the model.
    output_train_path: Path
    # Targets of the test split, used to score each tuning trial.
    output_test_path: Path
    # Scaled feature matrices for the train and test splits.
    scaled_train_path: Path
    scaled_test_path: Path
    # Destination where the fitted model is dumped with joblib.
    model_path: Path
    # Positional bounds unpacked into ``trial.suggest_float('C', *params_C, log=True)``.
    params_C: list
    # Candidate solver names offered to ``trial.suggest_categorical``.
    params_solver: list
    # Number of Optuna trials; forwarded as ``n_trials`` to ``study.optimize``,
    # so this is a single integer rather than a list.
    params_n_trials: int

### Training Config Manager

This code will be applied in `src/MLProject/config/configurations.py`.

In [6]:
from MLProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MLProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML config/params files and builds config entities."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and set up the artifacts root directory.

        Args:
            config_filepath: location of the main config YAML file
            params_filepath: location of the params YAML file
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Assemble the training settings from the loaded config and params.

        Paths come from the ``dump_data``, ``scale_data`` and ``train_model``
        sections of the config file; the search settings (``C``, ``SOLVER``,
        ``N_TRIALS``) come from the params file.

        Returns:
            config: TrainingConfig type
        """
        dump_section = self.config.dump_data
        scale_section = self.config.scale_data
        model_section = self.config.train_model
        hyperparams = self.params

        fields = {
            "input_train_path": Path(dump_section.input_train_path),
            "output_train_path": Path(dump_section.output_train_path),
            "output_test_path": Path(dump_section.output_test_path),
            "scaled_train_path": Path(scale_section.scaled_train_path),
            "scaled_test_path": Path(scale_section.scaled_test_path),
            "model_path": Path(model_section.model_path),
            "params_C": hyperparams.C,
            "params_solver": hyperparams.SOLVER,
            "params_n_trials": hyperparams.N_TRIALS,
        }
        return TrainingConfig(**fields)

### Perform Training

This code goes in `src/MLProject/components/training.py`.

For this example, the initial run uses logistic regression. Later on we could try:
+ another model
+ another data enrichment technique
+ further model tuning

In [8]:
import joblib
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


from MLProject import logger

class Training:
    """Tune and fit a logistic-regression classifier on pre-scaled data.

    The search space and all artifact locations come from the
    ``TrainingConfig`` given to the constructor.
    """

    # One seed shared by the estimator and the Optuna sampler so that repeated
    # runs try the same trials and end up with the same model.
    RANDOM_STATE = 42

    def __init__(self, config: TrainingConfig):
        """Store the configuration; data is only loaded when training starts."""
        self.config = config

    def _build_model(self, C, solver):
        """Return an unfitted LogisticRegression for the given hyperparameters."""
        return LogisticRegression(
            C=C,
            solver=solver,
            random_state=self.RANDOM_STATE
        )

    def objective(self, trial):
        """Optuna objective: fit one candidate model and score it.

        Relies on ``X_train_scaled``, ``y_train``, ``X_test_scaled`` and
        ``y_test`` having been set on the instance, which
        ``hpo_logistic_regression`` does before starting the study.

        Args:
            trial: Optuna trial used to sample ``C`` and ``solver``

        Returns:
            F1 score of class 1 on the test split
        """
        # Define hyperparameters to optimize; C is sampled on a log scale
        # between the bounds unpacked from the config.
        C = trial.suggest_float('C', *self.config.params_C, log=True)
        solver = trial.suggest_categorical('solver', self.config.params_solver)

        # Initialize and fit the candidate classifier on the training data
        model = self._build_model(C, solver)
        model.fit(self.X_train_scaled, self.y_train)

        # Predict on the test split.
        # NOTE(review): candidates are scored on the test split, so the chosen
        # hyperparameters are tuned to that data; consider a separate
        # validation split or cross-validation.
        y_test_pred = model.predict(self.X_test_scaled)

        # F1 score for class 1. With average='binary' the reported class is
        # selected by pos_label; the labels argument is not used in that mode.
        return f1_score(self.y_test, y_test_pred, pos_label=1, average='binary')

    def hpo_logistic_regression(self) -> None:
        """Train a logistic regression model using hyperparameter optimization
        and dump the fitted model.

        Loads the scaled features and the targets for both splits, runs an
        Optuna study that maximizes ``objective``, refits a model on the
        training data with the best parameters and writes it to
        ``config.model_path``.
        """
        logger.info(f"Load scaled data train from {self.config.scaled_train_path}.")
        self.X_train_scaled = joblib.load(self.config.scaled_train_path)

        logger.info(f"Load scaled data test from {self.config.scaled_test_path}.")
        self.X_test_scaled = joblib.load(self.config.scaled_test_path)

        logger.info(f"Load data train output from {self.config.output_train_path}.")
        self.y_train = joblib.load(self.config.output_train_path)

        logger.info(f"Load data test output from {self.config.output_test_path}.")
        self.y_test = joblib.load(self.config.output_test_path)

        logger.info("Find best parameter using hyperparameter optimization")
        # Seed the sampler so the sequence of sampled hyperparameters (and
        # therefore the selected model) is reproducible across runs.
        sampler = optuna.samplers.TPESampler(seed=self.RANDOM_STATE)
        study = optuna.create_study(direction='maximize', sampler=sampler)
        study.optimize(self.objective, n_trials=self.config.params_n_trials)

        logger.info("Get best parameter")
        best_params = study.best_params

        logger.info("Train the model.")
        model = self._build_model(best_params['C'], best_params['solver'])
        model.fit(self.X_train_scaled, self.y_train)

        logger.info("Dump the model.")
        joblib.dump(model, self.config.model_path)

### Training the Model

This code goes in `src/MLProject/pipeline/step_03_training.py`.

In [9]:
# Run the training step end to end: read the configuration, build the
# training component and launch the hyperparameter search + final fit.
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.hpo_logistic_regression()
except Exception as e:
    # Log at ERROR level together with the traceback, then re-raise the
    # active exception unchanged so the cell still fails visibly.
    logger.exception(e)
    raise

[2024-07-24 20:54:02,479: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-24 20:54:02,484: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-24 20:54:02,485: INFO: common: created directory at: artifacts]
[2024-07-24 20:54:02,488: INFO: 2455932979: Load scaled data train from artifacts/preprocessing/X_train.csv.]
[2024-07-24 20:54:02,500: INFO: 2455932979: Load scaled data test from artifacts/preprocessing/X_test.csv.]
[2024-07-24 20:54:02,504: INFO: 2455932979: Load data train output from artifacts/data/y_train.pkl.]
[2024-07-24 20:54:02,789: INFO: 2455932979: Load data test output from artifacts/data/y_test.pkl.]
[2024-07-24 20:54:02,792: INFO: 2455932979: Find best parameter using hyperparameter optimization]


[I 2024-07-24 20:54:02,793] A new study created in memory with name: no-name-85ef50e0-ee33-451e-8a56-3c74b56f09e5
[I 2024-07-24 20:54:02,943] Trial 0 finished with value: 0.9567534827377348 and parameters: {'C': 0.002179088825240364, 'solver': 'newton-cholesky'}. Best is trial 0 with value: 0.9567534827377348.
[I 2024-07-24 20:54:03,203] Trial 1 finished with value: 0.9495541712470991 and parameters: {'C': 0.0011228288839936558, 'solver': 'liblinear'}. Best is trial 0 with value: 0.9567534827377348.
[I 2024-07-24 20:54:04,041] Trial 2 finished with value: 0.9645880529390723 and parameters: {'C': 0.43588643133452115, 'solver': 'liblinear'}. Best is trial 2 with value: 0.9645880529390723.
[I 2024-07-24 20:54:04,888] Trial 3 finished with value: 0.9654926225606854 and parameters: {'C': 4.047463851527298, 'solver': 'liblinear'}. Best is trial 3 with value: 0.9654926225606854.
[I 2024-07-24 20:54:05,014] Trial 4 finished with value: 0.9651065856853638 and parameters: {'C': 0.234268804957098

[2024-07-24 20:54:05,015: INFO: 2455932979: Get best parameter]
[2024-07-24 20:54:05,016: INFO: 2455932979: Train the model.]
[2024-07-24 20:54:05,807: INFO: 2455932979: Dump the model.]


**Debug**: Show the model's predictions on the training data.

In [10]:
import pandas as pd

# Reload the artifacts referenced by the training config.
# X_train is only needed for its index, which labels the predictions below.
X_train = joblib.load(training_config.input_train_path)
X_train_scaled = joblib.load(training_config.scaled_train_path)
y_train = joblib.load(training_config.output_train_path)
# The model dumped by the training step above.
model = joblib.load(training_config.model_path)

# Predict on the scaled training features and attach X_train's index.
# NOTE(review): assumes X_train and X_train_scaled have the same row order — confirm.
y_pred = pd.Series(model.predict(X_train_scaled), index = X_train.index)
y_pred

40560    1
17030    0
47377    1
47058    1
44228    1
        ..
15316    1
14762    1
41       1
14461    0
32521    1
Length: 39804, dtype: int64

In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions on the training data.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96     19949
           1       0.98      0.95      0.96     19855

    accuracy                           0.96     39804
   macro avg       0.96      0.96      0.96     39804
weighted avg       0.96      0.96      0.96     39804

