In [1]:
# Source: https://cloud.google.com/ai-platform/prediction/docs/exporting-for-prediction
# Accessed 2021-01-01
#found at https://devopedia.org/machine-learning-model


#infra this: myMBpro 
#      env: base
#      confirmed Python 3.10.13
#      numpy 1.24.3, scikit-learn  1.2.2, joblib 1.2.0

#input: Iris dataset from sklearn  $config
#output: persisted models w/ joblib


#history
#5/31/2024 SKLEARN MODELING
#      RF, pipeline, custom pipeline
#      'feature_selection' chi2
#      ds Iris


In [11]:
import sys
import numpy as np

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

import joblib

In [13]:
# Report the interpreter and library versions this notebook was run with.
print(sys.version)
print(f"{np.__version__} {joblib.__version__}")

3.10.13 (main, Sep 11 2023, 08:39:02) [Clang 14.0.6 ]
1.24.3 1.2.0


# Exporting models for prediction

In [2]:
# Load the Iris dataset; load_iris() returns a sklearn.utils._bunch.Bunch.
iris = datasets.load_iris()

# Preview the first and last five rows of the features and labels
# (each element of the tuple is a numpy.ndarray).
(
    iris.data[:5],
    iris.target[:5],
    iris.data[-5:],
    iris.target[-5:],
)

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0]),
 array([[6.7, 3. , 5.2, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6.5, 3. , 5.2, 2. ],
        [6.2, 3.4, 5.4, 2.3],
        [5.9, 3. , 5.1, 1.8]]),
 array([2, 2, 2, 2, 2]))

## SKLearn Classifier

In [3]:
# ---------------- Save model ----------------
# Seed the forest: with random_state unset, every run fits a different model,
# so the exported file would not be reproducible under Restart & Run All.
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(iris.data, iris.target)

# Persist the fitted estimator; joblib.dump returns the list of files written.
joblib.dump(clf1, 'clf1.joblib')


['clf1.joblib']

In [4]:
# Sanity check on two known rows from the data preview.
clf1.predict(
    [
        [5.1, 3.5, 1.4, 0.2],  # labelled 0 in the preview
        [6.2, 3.4, 5.4, 2.3],  # labelled 2 in the preview
    ]
)

array([0, 2])

## SKLearn Classifier Pipeline

In [5]:
# -------------- Save pipeline ---------------
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

# Two-step pipeline: keep the 2 features selected by the chi2 score function,
# then classify on those features only.
pipeline = Pipeline([
    ('feature_selection', SelectKBest(chi2, k=2)),
    # Seeded so the fitted model (and the exported file) is reproducible.
    ('classification', RandomForestClassifier(random_state=42)),
])
pipeline.fit(iris.data, iris.target)

# Persist the whole fitted pipeline; joblib.dump returns the list of files written.
joblib.dump(pipeline, 'clf2.joblib')

['clf2.joblib']

In [6]:
# Same two rows as before, now pushed through the feature-selection pipeline.
pipeline.predict(
    [
        [5.1, 3.5, 1.4, 0.2],  # labelled 0 in the preview
        [6.2, 3.4, 5.4, 2.3],  # labelled 2 in the preview
    ]
)

array([0, 2])

## SKLearn Classifier Custom Pipeline

In [7]:
# Reference listing of the custom module `my_module.py`, which the next cell imports.
# The listing is intentionally commented out: nothing here is executed, so the
# actual file must exist on the import path for `import my_module` to succeed.
# NOTE(review): the listing uses `np` but shows no `import numpy as np` --
# confirm that the real my_module.py imports numpy.

# from sklearn.base import BaseEstimator
# from sklearn.base import TransformerMixin
# from sklearn.utils.validation import check_is_fitted

# def add_sum(X):
#   sums = X.sum(1).reshape((-1,1))
#   transformed_X = np.append(X, sums, 1)
#   return transformed_X

# class MySimpleScaler(BaseEstimator, TransformerMixin):
#   def fit(self, X, y=None):
#     self.means = np.mean(X, axis=0)
#     self.stds = np.std(X, axis=0)
#     if not self.stds.all():
#       raise ValueError('At least one column has standard deviation of 0.')
#     return self

#   def transform(self, X):
#     check_is_fitted(self, ('means', 'stds'))
#     transformed_X = (X - self.means) / self.stds
#     return transformed_X

In [8]:
import my_module
from sklearn.pipeline import Pipeline  # imported here so the cell does not rely on an earlier cell
from sklearn.preprocessing import FunctionTransformer

# Reload the dataset so this cell starts from the original, unmodified data.
iris = datasets.load_iris()

# Custom pipeline: scale with the user-defined transformer, append the row-sum
# column via the plain function wrapped in FunctionTransformer, then classify.
pipeline_c = Pipeline([
    ('scale_data', my_module.MySimpleScaler()),
    ('add_sum_column', FunctionTransformer(my_module.add_sum)),
    # Seeded so the fitted model (and the exported file) is reproducible.
    ('classification', RandomForestClassifier(random_state=42)),
])
pipeline_c.fit(iris.data, iris.target)

# Persist the fitted pipeline. The pickle stores the custom steps by reference,
# so `my_module` must be importable wherever 'model.joblib' is loaded.
joblib.dump(pipeline_c, 'model.joblib')

['model.joblib']

In [9]:
# Same two rows again, this time through the custom-transformer pipeline.
pipeline_c.predict(
    [
        [5.1, 3.5, 1.4, 0.2],  # labelled 0 in the preview
        [6.2, 3.4, 5.4, 2.3],  # labelled 2 in the preview
    ]
)

array([0, 2])