In [1]:
import logging
import sklearn
from sklearn.base import is_classifier, is_regressor


class Metrics:
    """
    A class used to determinte appropriate metrics.
    Used for standardization across model types.

    Methods
    -------
    infer_metrics(model)
        Returns metrics based on whether a model is a classifier or regressor.
    """

    @staticmethod
    def infer_metrics(model):
        if is_classifier(model):
            return {
                'F1': sklearn.metrics.f1_score,
                'Accuracy': sklearn.metrics.accuracy_score
            }
        elif is_regressor(model):
            return {
                'MSE': sklearn.metrics.mean_squared_error,
                'MAE': sklearn.metrics.mean_absolute_error,
            }
        else:
            logging.warning("Model is neither a regressor or classifier")


In [2]:
from pandas import DataFrame
import importlib
import logging
import sklearn
from typing import List
import mlflow

class Model:
    """
    A class used to represent an ML model and relevant use cases.

    ...

    Attributes
    ----------
    model : sklearn.base.BaseEstimator
        sklearn model to be used for predictions
    metrics : List[dict]
        list of metrics to be used during evaluation

    """

    def __init__(
        self,
        model_name: str,
        import_module: str,
        model_params: dict = {}
    ):
        self.model = self.get_model(model_name, import_module, model_params)
        self.metrics = Metrics.infer_metrics(self.model)

    def get_model(self,
                  model_name: str, import_module: str, model_params: dict
                  ) -> sklearn.base.BaseEstimator:
        """
        Returns instantiated sklearn model defined by parameters.

        Parameters
        ----------
        model_name : str
            Name of sklearn model e.g. "RandomForestClassifier"

        import_module : str
            Name of import module of sklearn base estimator e.g. "sklearn.ensemble"

        model_params : dict
           Dictionary defining sklearn model parameters 

        """
        model_class = getattr(
            importlib.import_module(import_module), model_name)
        model = model_class(**model_params)
        return model

    def predict(self, X: DataFrame):
        """
        Returns array-like of predicted values given features.

        Parameters
        ----------
        X : DataFrame
            DataFrame of input data to be used for predictions

        """
        return self.model.predict(X)

    def evaluate(self, true, pred) -> List[dict]:
        """
        Creates evaluation metrics comparing predicted and actual values.

        Parameters
        ----------
        true : array-like
            actual values to compare predictions against

        pred : array-like
            predictions from model

        """
        values = []
        for metric in self.metrics:
            values.append({
                'metric_name': metric,
                'metric_value': self.metrics[metric](true, pred)
            })
        logging.info(values)
        return values

    def log_metrics(self, metrics: List[dict]):
        """
        Uses the log() function to iterate through metrics returned by evaluate()

        Parameters
        ----------
        metrics : List[dict]
            metrics with metric name and metric value e.g. {'RMSE': 0.9}

        """
        for metric in metrics:
            self.log(metric['metric_name'], metric['metric_value'])

    def log(self, metric_name: str, metric_value: float):
        """
        Uses MLflow to log metric.


        Parameters
        ----------
        metric_name : str
            name of metric

        metric_value : number
            metric value

        """
        mlflow.log_metric(metric_name, metric_value)

    def mlflow_log_model(self):
        """
        Logs model within ML run folder in /artifacts created by MLflow

        """
        mlflow.sklearn.log_model(self.model)


In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from typing import List


class PreProcessor:
    def __init__(self,
                 numeric_features: List[str] = [],
                 categorical_features: List[str] = []
                 ):
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features

    def x_y_split(self, df: pd.DataFrame, label: str):
        df_X = df.drop(label, axis=1)
        df_y = df[label]
        return df_X, df_y

    def train_test_split(self, X: pd.DataFrame, y, split_ratio: float = 0.8):
        assert 0 < split_ratio < 1.0, "split_ratio must be a value between 0 and 1"
        return train_test_split(X, y, train_size=split_ratio)

    def create_transformer(self, numerical_imputer: str = 'median') -> ColumnTransformer:
        transformers = []
        if self.numeric_features:
            numeric_transformer = Pipeline(
                steps=[("imputer", SimpleImputer(strategy=numerical_imputer)),
                       ("scaler", StandardScaler())]
            )
            transformers.append(
                ("num", numeric_transformer, self.numeric_features))
        if self.categorical_features:
            categorical_transformer = OneHotEncoder(
                handle_unknown="infrequent_if_exist")
            transformers.append(
                ("cat", categorical_transformer, self.categorical_features))

        transformer = ColumnTransformer(
            transformers=transformers
        )

        return transformer


In [4]:
from sklearn.pipeline import Pipeline
import pandas as pd

model_save_path = 'saved_models/RandomForestClassifier'
train_data_path = 'data/train.csv'
model_name = 'RandomForestClassifier'
import_module = 'sklearn.ensemble'
model_params = {}

def train():
    data = pd.read_csv(train_data_path)

    model = Model(
        model_name=model_name,
        import_module=import_module,
        model_params=model_params
    )

    preprocessor = PreProcessor(
        categorical_features=[2, 5, 6],
        numeric_features=[0, 1, 3, 4]
    )

    X, y = preprocessor.x_y_split(data, 'y')

    X_train, X_test, y_train, y_test = preprocessor.train_test_split(X, y)

    transformer = preprocessor.create_transformer()

    clf = Pipeline(
        steps=[("preprocessor", transformer),
               ("classifier", model.model)]
    )

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)

    metrics = model.evaluate(y_test, pred)

    model.log_metrics(metrics)

    model.save_model()

    joblib.dump(clf, model_save_path)

In [5]:
train()

AttributeError: 'Model' object has no attribute 'save_model'

In [131]:
clf.predict(x_test.iloc[0:1])

array([1])

In [137]:
x_test.iloc[0:1].to_numpy().tolist()

[[2.79584, 21.592293, 'Thu', -1.139312, 122.274831, 'California', 'ford']]

In [136]:
clf.predict(x_test.iloc[0:1].to_numpy().tolist())



array([1])

In [146]:
train()

In [147]:
clf = joblib.load('saved_models/RandomForestClassifier')

In [149]:
clf.predict([[2.79584, 21.592293, 'Thu', -1.139312, 122.274831, 'California', 'ford']]).tolist()



[1]