# Chapter 12: MLOps

## 12.4 ML Pipeline

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

In [None]:
# Load SAheart.csv into a DataFrame, using the file's first column as the row index.
chd_df = pd.read_csv(filepath_or_buffer="SAheart.csv", index_col=[0])

In [None]:
# Display the column labels to see which fields the dataset provides.
chd_df.columns

Index(['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity',
       'alcohol', 'age', 'chd'],
      dtype='object')

In [None]:
# Preview the first five rows to check value types and scales.
chd_df.head(5)

Unnamed: 0_level_0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
2,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
3,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
4,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
5,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


In [None]:
# Separate the target column ('chd') from the predictor columns.
y = chd_df['chd']
X = chd_df.drop(columns='chd')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Column groups: each list is handed to its own preprocessing branch below.
# 'famhist' holds text labels ('Present' / 'Absent' in the preview above).
categorical_features = ['famhist']

# Every remaining predictor is numeric.
numerical_features = [
    'sbp', 'tobacco', 'ldl', 'adiposity',
    'typea', 'obesity', 'alcohol', 'age',
]

In [None]:
# Echo the categorical column list as a sanity check.
categorical_features

['famhist']

In [None]:
# Echo the numerical column list as a sanity check.
numerical_features

['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age']

In [None]:
# Numeric branch: replace missing values with the column mean, then
# standardize each column with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on a category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Apply each branch to its own set of columns and concatenate the results
# into a single feature matrix.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# End-to-end model: preprocessing followed by a logistic-regression classifier,
# so the same transformations are applied at fit time and at predict time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [None]:
# Display the (still unfitted) pipeline to review its steps.
pipeline

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
y_pred = pipeline.predict(X_test)

In [None]:
# Print per-class precision, recall and F1 for the test-split predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.92      0.84        59
           1       0.79      0.56      0.66        34

    accuracy                           0.78        93
   macro avg       0.79      0.74      0.75        93
weighted avg       0.79      0.78      0.77        93



In [None]:
from joblib import dump

# Serialize the whole pipeline object (preprocessing + classifier) to a file
# in the current working directory; dump() returns the list of files written.
dump(value=pipeline, filename='chd.pickle')

['chd.pickle']

In [None]:
import mlflow
from mlflow.models import infer_signature

In [None]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("CHD_Prediction")

# Start an MLflow run
with mlflow.start_run():
    # Log the loss metric
    mlflow.log_metric("roc", np.round(roc_auc_score(y_test, y_pred), 3))

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training", "Logistic Regression")

    # Infer the model signature
    signature = infer_signature(X_train,
                                pipeline.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="logreg",
        signature=signature,
        input_example=X_train,
        registered_model_name="logistic",
    )