In [1]:
import io
import os
import pathlib
import shutil
import zipfile

import category_encoders
import numpy as np
import pandas as pd
import requests
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing

import fiddler as fdl

In [2]:
# enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local source files take effect
# without restarting the kernel
%load_ext autoreload
%autoreload 2

# Intro
In tutorial_01, we demonstrated how to quickly and easily upload a `scikit-learn` model to Fiddler. In this tutorial, we show how to upload complex models using the custom model upload API endpoint.

This notebook is organized into three sections
1. Loading the Bikeshare data, preprocessing it, and uploading it to Fiddler.
2. Building a multilayer perceptron model using Tensorflow 1.x to predict hourly rentals in the Bikeshare data.
3. Uploading this MLP model to the Fiddler platform.

# Section 1: Loading data

As we saw in tutorial_01, as long as your data can be dumped into a DataFrame object, there is nothing else you need to do to get it ready to upload to Fiddler. However, for an MLP model we will need to transform our dataset with proper categorical encoding and standardization. We will demonstrate uploading both the original and preprocessed data.

## 1.1 Downloading the UCI bikeshare dataset

In [3]:
zip_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'

# download the archive; raise on an HTTP error or a hung connection instead of
# handing an error page to zipfile (which would fail with a confusing BadZipFile)
response = requests.get(zip_url, timeout=60)
response.raise_for_status()

# here we pre-configure the datatypes for our dataframe
# so it doesn't require any datatype modification after import
bikeshare_dtypes = dict(season='category', holiday='bool',
                        workingday='bool', weathersit='category')
bikeshare_datetime_columns = ['dteday']
bikeshare_index_column = 'instant'

# the context managers close both the archive and the extracted member
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('hour.csv') as csv:
        df = pd.read_csv(csv,
                         dtype=bikeshare_dtypes,
                         parse_dates=bikeshare_datetime_columns,
                         index_col=bikeshare_index_column)

# split train/test by year (yr == 0 marks the 2011 rows)
is_2011 = df['yr'] == 0
df_2011 = df[is_2011].reset_index(drop=True)
df_2012 = df[~is_2011].reset_index(drop=True)

# peek at the data
display(df.sample(3, random_state=0).T)

# print info about train-test split
print(f'Train set (bikeshare rentals in 2011) has {df_2011.shape[0]} rows,'
      f' test set (bikeshare rentals in 2012) has {df_2012.shape[0]} rows')

instant,3440,6543,15471
dteday,2011-05-28 00:00:00,2011-10-05 00:00:00,2012-10-11 00:00:00
season,2,4,4
yr,0,0,1
mnth,5,10,10
hr,5,4,19
holiday,False,False,False
weekday,6,3,4
workingday,False,True,True
weathersit,1,1,1
temp,0.56,0.44,0.44


Train set (bikeshare rentals in 2011) has 8645 rows, test set (bikeshare rentals in 2012) has 8734 rows


In [4]:
# specify which columns are features and which are not
target = 'cnt'
not_used_as_features = ['dteday', 'yr', 'casual', 'registered']
non_feature_columns = [target] + not_used_as_features
# keep the dataframe's own column order; a set difference would produce an
# arbitrary ordering that is not guaranteed to be the same from run to run
feature_columns = [column for column in df_2011.columns
                   if column not in non_feature_columns]

# split our data into features and targets
x_train = df_2011.drop(columns=non_feature_columns)
x_test = df_2012.drop(columns=non_feature_columns)
y_train = df_2011[target]
y_test = df_2012[target]

## 1.2 Feature preprocessing
As we can see above, this dataset contains some categorical features (`season` and `weathersit`) as well as features on different scales (`hr` and `temp`). Since encoding categorical variables can be a pain in `sklearn`, we will use the `category_encoders` package, and combine this with the `StandardScaler` transformation from scikit-learn in a `Pipeline` object.

In [5]:
# one-hot encode every column that was loaded with the pandas 'category' dtype,
# then standardize all resulting columns
categorical_columns = df.select_dtypes('category').columns.tolist()
onehot = category_encoders.OneHotEncoder(cols=categorical_columns)
standard_scaler = sklearn.preprocessing.StandardScaler()

# fit on the training features only, then apply the fitted pipeline to both splits
preprocessor = sklearn.pipeline.make_pipeline(onehot, standard_scaler)
preprocessor.fit(x_train)
x_train_processed, x_test_processed = (
    preprocessor.transform(features) for features in (x_train, x_test)
)

## 1.3 Uploading the data to Fiddler

### Before you start: set up your API connection

#### Onebox
If you're using a Onebox deployment, make sure you've run the `start.sh` script to launch Onebox locally.

#### Cloud
For the cloud version of our product, look up your authentication token in the [Fiddler settings dashboard](https://app.fiddler.ai/settings/credentials)

#### Create a FiddlerApi object
In order to get your data and models into the Fiddler Engine, you'll need to connect using the API. The `FiddlerApi` object handles most of the nitty-gritty for you, so all you have to do is specify some details about the Fiddler system you're connecting to.

In [6]:
# see <Fiddler URL>/settings/general to find this id (listed as "Organization Name")
org_id = 'onebox'

# see <Fiddler URL>/settings/credentials to find, create, or change this token;
# it is read from the environment so it never has to be written into the notebook
token = os.environ.get('FIDDLER_API_TOKEN')

# NOTE: typically the API url for your running instance of Fiddler will be "https://api.fiddler.ai" (or "http://localhost:4100" for onebox)
# however, use "http://host.docker.internal:4100" as our URL if Jupyter is running in a docker VM on the same macOS machine as onebox
url = 'http://host.docker.internal:4100'

fiddler_api = fdl.FiddlerApi(
    url=url,
    org_id=org_id,
    auth_token=token,
)

In [7]:
# delete the datasets if we've uploaded them previously; checking the dataset
# list first keeps this cell re-runnable on an instance where they were never
# uploaded (the same guard is used for models before delete_model() below)
existing_dataset_ids = fiddler_api.list_datasets()
for dataset_id in ('bikeshare', 'bikeshare_processed'):
    if dataset_id in existing_dataset_ids:
        fiddler_api.delete_dataset(dataset_id)

'Dataset deleted bikeshare_processed'

In [8]:
# let's upload the original dataset
fiddler_api.upload_dataset(
    dataset={'train': df_2011, 'test': df_2012}, 
    dataset_id='bikeshare')

Heads up! We are inferring the details of your dataset from the dataframe(s) provided. Please take a second to check our work.

If the following DatasetInfo is an incorrect representation of your data, you can construct a DatasetInfo with the DatasetInfo.from_dataframe() method and modify that object to reflect the correct details of your dataset.

After constructing a corrected DatasetInfo, please re-upload your dataset with that DatasetInfo object explicitly passed via the `info` parameter of FiddlerApi.upload_dataset().

You may need to delete the initially uploaded versionvia FiddlerApi.delete_dataset('bikeshare').

Inferred DatasetInfo to check:
  DatasetInfo:
    display_name: bikeshare
    files: []
    columns:
              column     dtype count(possible_values)
      0       dteday    STRING                      -
      1       season  CATEGORY                      4
      2           yr   INTEGER                      -
      3         mnth   INTEGER                      -
 

{'row_count': 17379,
 'col_count': 16,
 'log': ['Importing dataset bikeshare',
  'Creating table for bikeshare',
  'Importing data file: test.csv',
  'Importing data file: train.csv']}

In [9]:
# let's also upload the preprocessed version of the dataset
df_2011_processed = pd.concat([pd.DataFrame(x_train_processed, columns=onehot.feature_names), y_train], axis=1)
df_2012_processed = pd.concat([pd.DataFrame(x_test_processed, columns=onehot.feature_names), y_test], axis=1)
fiddler_api.upload_dataset(
    dataset={'train': df_2011_processed, 'test': df_2012_processed}, 
    dataset_id='bikeshare_processed')

Heads up! We are inferring the details of your dataset from the dataframe(s) provided. Please take a second to check our work.

If the following DatasetInfo is an incorrect representation of your data, you can construct a DatasetInfo with the DatasetInfo.from_dataframe() method and modify that object to reflect the correct details of your dataset.

After constructing a corrected DatasetInfo, please re-upload your dataset with that DatasetInfo object explicitly passed via the `info` parameter of FiddlerApi.upload_dataset().

You may need to delete the initially uploaded versionvia FiddlerApi.delete_dataset('bikeshare_processed').

Inferred DatasetInfo to check:
  DatasetInfo:
    display_name: bikeshare_processed
    files: []
    columns:
                column    dtype count(possible_values)
      0       season_1    FLOAT                      -
      1       season_2    FLOAT                      -
      2       season_3    FLOAT                      -
      3       season_4    FLOAT

{'row_count': 17379,
 'col_count': 18,
 'log': ['Importing dataset bikeshare_processed',
  'Creating table for bikeshare_processed',
  'Importing data file: test.csv',
  'Importing data file: train.csv']}

In [10]:
# confirm that both 'bikeshare' and 'bikeshare_processed' now show up
# in the list of all datasets known to this Fiddler instance
fiddler_api.list_datasets()

['imdb_rnn',
 'ieee_fraud',
 'iris',
 'bank_churn',
 '20news',
 'p2p_loans',
 'bikeshare_processed',
 'titanic',
 'winequality',
 'bikeshare']

### Accessing the data on Fiddler
We can also verify everything worked by looking at the web UI:
- http://localhost:4100/datasets

(or if you used cloud instead of onebox)
- https://app.fiddler.ai/datasets

# Section 2: Building a Tensorflow model

In [11]:
# ensure TF 1 (and latest version), but not TF 2
# NOTE: "!" runs pip in a shell; the "1.*" specifier restricts the upgrade
# to the newest available 1.x release
!pip install --upgrade tensorflow==1.*

Requirement already up-to-date: tensorflow==1.* in /usr/local/lib/python3.6/dist-packages (1.15.0)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [12]:
# imported here rather than in the first cell because the pip cell above
# pins the TensorFlow version this import should pick up
import tensorflow as tf
# NOTE: we can only run tf.saved_model.save() on a tf.keras model if we use eager execution
tf.compat.v1.enable_eager_execution()

In [13]:
# triple-check version is 1.x
print(tf.__version__)
# compare the whole major-version component rather than only the first
# character, so a version such as "10.0" could never pass as "1.x"
tf_major_version = tf.__version__.split('.')[0]
assert tf_major_version == '1', 'Stop! This tutorial is meant to use TF 1.x!'

1.15.0


In [14]:
# Train an MLP with two 128-unit ReLU hidden layers and a single linear output
hidden_layers = [
    tf.keras.layers.Dense(128, activation=tf.nn.relu) for _ in range(2)
]
model = tf.keras.models.Sequential(hidden_layers + [tf.keras.layers.Dense(1)])
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
)

# two training phases: a short one with small batches, then a longer one
# with a different learning rate and larger batches
train_targets = y_train.values
model.fit(x_train_processed, train_targets, batch_size=16, epochs=2)
model.optimizer.learning_rate = 0.01
model.fit(x_train_processed, train_targets, batch_size=32, epochs=8)

# score the trained model on the held-out 2012 data
y_hat = model.predict(x_test_processed)
r2 = sklearn.metrics.r2_score(y_test, y_hat)
print(f'The model achieves a test-set r2 score of {r2:.2f}')

Train on 8645 samples
Epoch 1/2
Epoch 2/2
Train on 8645 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
The model achieves a test-set r2 score of 0.50


# Section 3: Uploading TF model to Fiddler
Now that we have trained a model, let's move on to uploading the model to Fiddler.

For Tensorflow, the [SavedModel format](https://www.tensorflow.org/guide/saved_model) provides a powerful way to save and deploy models across versions and languages. Below we demonstrate a generic wrapper class that can be used to quickly and easily deploy to Fiddler any TF model.

First, we save our model to the SavedModel format using `tf.saved_model.save`. Then, we save a `package.py` file that contains the glue code needed for the Fiddler platform to load and run the SavedModel. Lastly, we'll upload the SavedModel and the `package.py` file using the `fdl.FiddlerApi.upload_model_custom()` function.

### TF SavedModel and `package.py`
Since it is a common task to run TF models saved in the SavedModel format, the Fiddler Python package provides a generic model loader class called `TFSavedModel`. Let's take a look at the source code below. We'll notice that the `__init__` method loads the model from the SavedModel format, and that it offers a single public method `.predict()` which runs the model on input provided as a pandas DataFrame. This is the standard convention for all model-runners in Fiddler. 

We also notice that this class takes an optional keyword argument `input_transformation` which allows the user to define a custom mapping from the DataFrame input to the matrix/tensor inputs consumed by the model. Later we will use this functionality to upload our `preprocessor` object along with the model to avoid the necessity of preprocessing our dataset before uploading it to Fiddler (using the un-processed dataset also offers the benefit of explanations being computed in terms of the original dataset features, rather than the model's internal preprocessed features, which are often less human-interpretable).

If the `TFSavedModel` class is not sufficient to run your TF model, it might still be helpful to either subclass it with your own custom model loader or use its source code as a starting point for writing a model loader from scratch.

In [15]:
# import the generic SavedModel loader shipped with the Fiddler client
from fiddler.model_loaders import TFSavedModel
# IPython's "??" prefix displays the object's signature and source code
??TFSavedModel

[0;31mInit signature:[0m
[0mTFSavedModel[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msaved_model_path[0m[0;34m:[0m[0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_column_names[0m[0;34m:[0m[0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mis_binary_classification[0m[0;34m:[0m[0mbool[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m:[0m[0mint[0m[0;34m=[0m[0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput_transformation[0m[0;34m:[0m[0mCallable[0m[0;34m[[0m[0;34m[[0m[0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m][0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mnumpy[0m[0;34m.[0m[0mndarray[0m[0;34m][0m[0;34m][0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mTFSavedModel[0m[0;34m.[0m[0;34m<[0m[0;32mla

## 3.1 Saving the model
We begin by creating a new directory to save our model files and `package.py` file into. Then we save the model using the Tensorflow SavedModel format.

In [16]:
# start from an empty model directory so stale artifacts never get uploaded
model_dir = pathlib.Path('tf_model')
saved_model_path = model_dir / 'saved_model'
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# export the trained Keras model in the SavedModel format
tf.saved_model.save(model, str(saved_model_path))

W1206 20:41:56.768882 140608237893440 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1781: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


## 3.2 Making the SavedModel run in Fiddler
In order for the model to run on Fiddler, we need to author a `package.py` file to glue things together. Let's develop the logic that fits into this file piece by piece here.

In [17]:
# first, let's make sure we can run our model using the TFSavedModel class
# to do this, all we need is the path to our SavedModel files and a name
# for our model output
model_prediction_name = 'predicted_bike_rentals'
fiddler_model = TFSavedModel(
    saved_model_path=model_dir / 'saved_model',
    output_column_names=[model_prediction_name],
    is_binary_classification=False
)
input_df = pd.DataFrame(x_test_processed[:1], columns=onehot.feature_names)
pred = fiddler_model.predict(input_df)
display(pred)
# compare within floating-point tolerance: the SavedModel session and the
# eager Keras model may differ in the last few digits, and np.allclose()
# reduces the comparison to a single bool that is safe to assert on
success = np.allclose(pred.values, model.predict(x_test_processed[:1]))
assert success
print('Model loader works!')

W1206 20:41:56.981302 140608237893440 module_wrapper.py:139] From /mnt/client/fiddler/model_loaders.py:62: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W1206 20:41:56.983628 140608237893440 deprecation.py:323] From /mnt/client/fiddler/model_loaders.py:65: load (from tensorflow.python.saved_model.loader_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
W1206 20:41:57.070469 140608237893440 module_wrapper.py:139] From /mnt/client/fiddler/model_loaders.py:69: The name tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY is deprecated. Please use tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY instead.



Unnamed: 0,predicted_bike_rentals
0,32.117195


Model loader works!


### Ensuring model output consistency
Above we arbitrarily chose the name 'predicted_bike_rentals' for our model output. However, as we showed in tutorial_01, when a model is uploaded to Fiddler it is accompanied by a `ModelInfo` object that specifies metadata like the names, types, and order of model inputs and outputs. To ensure consistency, it is best practice to programmatically read the output name from the `ModelInfo` rather than storing it in two places at once (i.e. `ModelInfo` *and* `package.py`).

In [18]:
# describe the model to Fiddler: its inputs are the preprocessed columns
# (in the encoder's output order) and its target is the rental count
processed_dataset_info = fdl.DatasetInfo.from_dataframe(df_2011_processed)
model_of_processed_features_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=processed_dataset_info,
    target='cnt',
    features=onehot.feature_names,
    input_type=fdl.ModelInputType.TABULAR,
)

# the single model output gets the name we want predictions reported under
prediction_output = model_of_processed_features_info.outputs[0]
prediction_output.name = 'predicted_bike_rentals'

model_of_processed_features_info

ModelInfo:
  display_name:  model
  description: None
  input_type: ModelInputType.TABULAR
  model_task: ModelTask.REGRESSION
  inputs and outputs:
                        column column_type  dtype count(possible_values)
    0                 season_1       input  FLOAT                      -
    1                 season_2       input  FLOAT                      -
    2                 season_3       input  FLOAT                      -
    3                 season_4       input  FLOAT                      -
    4                     mnth       input  FLOAT                      -
    5                       hr       input  FLOAT                      -
    6                  holiday       input  FLOAT                      -
    7                  weekday       input  FLOAT                      -
    8               workingday       input  FLOAT                      -
    9             weathersit_1       input  FLOAT                      -
    10            weathersit_2       input  FLOAT

In [19]:
# now we can pull the output names from the ModelInfo when we load the model
output_names = model_of_processed_features_info.get_output_names()
fiddler_model = TFSavedModel(
    saved_model_path=model_dir / 'saved_model',
    output_column_names=output_names,
    is_binary_classification=False
)
input_df = pd.DataFrame(x_test_processed[:1], columns=onehot.feature_names)
# compare within floating-point tolerance rather than asserting exact float
# equality; np.allclose() also yields a single bool that is safe to assert on
success = np.allclose(fiddler_model.predict(input_df).values,
                      model.predict(x_test_processed[:1]))
assert success
print('Model loader works!')

Model loader works!


### Loading ModelInfo inside `package.py`
When a model is uploaded to Fiddler, the ModelInfo is serialized alongside the `package.py` file in the form of a `model.yaml` file. Here we will show you how you can write a method to load the ModelInfo within the Fiddler engine, and how to mock this process locally to ensure your `package.py` code will operate properly when running on Fiddler.

In [20]:
import yaml
def mock_model_yaml(model_info, model_dir):
    """Write `model_info` to `<model_dir>/model.yaml` for local testing.

    The ModelInfo.to_dict() dictionary is nested under a top-level 'model'
    key before being dumped, which is the layout load_model_info() reads back.
    """
    yaml_file_path = pathlib.Path(model_dir).joinpath('model.yaml')
    print(f'Writing ModelInfo to {str(yaml_file_path)}...')
    document = {'model': model_info.to_dict()}
    with yaml_file_path.open('w') as yaml_file:
        yaml.dump(document, yaml_file)
        
def un_mock_model_yaml(model_dir):
    """Delete `<model_dir>/model.yaml`, reporting whether a file was removed.

    A missing file is not an error: a message is printed and the function
    returns normally.
    """
    model_yaml_path = pathlib.Path(model_dir).joinpath('model.yaml')
    try:
        model_yaml_path.unlink()
    except FileNotFoundError:
        print('No model.yaml file found')
    else:
        print(f'{model_yaml_path} successfully removed.')
    
def load_model_info(model_dir):
    """Read `<model_dir>/model.yaml` and rebuild the ModelInfo it describes."""
    yaml_path = pathlib.Path(model_dir) / 'model.yaml'
    with yaml_path.open('r') as yaml_file:
        # safe_load only constructs plain Python objects from the YAML document
        parsed = yaml.safe_load(yaml_file)
    return fdl.ModelInfo.from_dict(parsed)
    
# round-trip the ModelInfo through a temporary model.yaml on disk
mock_model_yaml(model_of_processed_features_info, model_dir)
test_model_info = load_model_info(model_dir)
un_mock_model_yaml(model_dir)

# the output names must survive serialization unchanged
expected_output_names = model_of_processed_features_info.get_output_names()
success = test_model_info.get_output_names() == expected_output_names
assert success
print('ModelInfo properly read from disk!')

Writing ModelInfo to tf_model/model.yaml...
tf_model/model.yaml successfully removed.
ModelInfo properly read from disk!


### The `get_model()` function
Lastly, we need to address the convention of `package.py`, which is to provide a function `get_model()` that returns a model object which offers a DataFrame -> DataFrame `.predict()` method. Let's write and test a `get_model()` function here.

In [21]:
# this is the function `package.py` must expose; here it takes explicit
# directories so that we can exercise it from the notebook
def get_model(model_dir, tf_saved_model_dir):
    """Build a TFSavedModel configured from the ModelInfo stored in `model_dir`.

    The output column names and the binary-classification flag are both read
    from `<model_dir>/model.yaml`; the TensorFlow graph itself is loaded from
    `tf_saved_model_dir`.
    """
    info = load_model_info(model_dir)
    binary_task_name = fdl.ModelTask.BINARY_CLASSIFICATION.name
    return TFSavedModel(
        tf_saved_model_dir,
        output_column_names=info.get_output_names(),
        is_binary_classification=(info.model_task.name == binary_task_name),
    )

In [22]:
# test get_model()
mock_model_yaml(model_of_processed_features_info, model_dir)
fiddler_model = get_model(model_dir, model_dir / 'saved_model')
un_mock_model_yaml(model_dir)

input_df = pd.DataFrame(x_test_processed[:1], columns=onehot.feature_names)
# compare within floating-point tolerance rather than asserting exact float
# equality; np.allclose() also yields a single bool that is safe to assert on
success = np.allclose(fiddler_model.predict(input_df).values,
                      model.predict(x_test_processed[:1]))
assert success
print('Model loader works!')

Writing ModelInfo to tf_model/model.yaml...
tf_model/model.yaml successfully removed.
Model loader works!


### Writing package.py
Now all we need to do is combine our `load_model_info()` and `get_model()` functions into a script called `package.py`.

In doing so, there is one tricky part: we need to make sure to specify default values for the `model_dir` and `tf_saved_model_dir` keyword arguments in the `get_model()` function, since the Fiddler engine will not pass any arguments when it calls this method (the arguments above are just to enable us to test the method in this notebook). What is tricky about this is that we have to programmatically ascertain the file's path, since the caller's current working directory may differ from the directory in which the model is stored. To do this, we recommend the following idiom:

`MODEL_DIR = pathlib.Path(__file__).parent`

In [23]:
# source text of the package.py glue file: it combines load_model_info() and
# get_model() from the cells above, with both get_model() parameters given
# defaults derived from the file's own location (MODEL_DIR) so that the
# function can be called without arguments
package_py_contents = '''
import pathlib

import yaml

import fiddler as fdl
from fiddler.model_loaders import TFSavedModel

MODEL_DIR = pathlib.Path(__file__).parent

def load_model_info(model_dir):
    """Load ModelInfo from a model.yaml file"""
    with (pathlib.Path(model_dir) / 'model.yaml').open('r') as yaml_file:
        return fdl.ModelInfo.from_dict(yaml.load(yaml_file, Loader=yaml.SafeLoader))

def get_model(model_dir=MODEL_DIR, tf_saved_model_dir=MODEL_DIR / 'saved_model'):
    model_info = load_model_info(model_dir)
    output_names = model_info.get_output_names()
    is_binary_classification = (
        model_info.model_task.name 
            == fdl.ModelTask.BINARY_CLASSIFICATION.name
    )
    return TFSavedModel(
        tf_saved_model_dir, 
        output_column_names=output_names,
        is_binary_classification=is_binary_classification
    )
'''
# write the glue code into the model directory, next to the SavedModel files
with (model_dir / 'package.py').open('w') as f:
    f.write(package_py_contents)

## 3.3 Uploading the model
Now that we have saved our model, created a `ModelInfo`, and written a working `package.py`, we can now upload the model using the Fiddler API.

In [24]:
project_id = 'bikeshare_forecasting'
model_id = 'processed_features_mlp'

# projects group related models together
fiddler_api.create_project(project_id)

# replace any previously uploaded model that has the same id
already_uploaded = model_id in fiddler_api.list_models(project_id)
if already_uploaded:
    fiddler_api.delete_model(project_id, model_id)

# upload the whole model directory together with its ModelInfo
fiddler_api.upload_model_custom(
    project_id=project_id,
    model_id=model_id,
    artifact_path=model_dir,
    info=model_of_processed_features_info,
    associated_dataset_ids=['bikeshare_processed'],
)

Project already exists, no change.


{'model': {'name': ' model',
  'input-type': 'structured',
  'model-task': 'regression',
  'inputs': [{'column-name': 'season_1', 'data-type': 'float'},
   {'column-name': 'season_2', 'data-type': 'float'},
   {'column-name': 'season_3', 'data-type': 'float'},
   {'column-name': 'season_4', 'data-type': 'float'},
   {'column-name': 'mnth', 'data-type': 'float'},
   {'column-name': 'hr', 'data-type': 'float'},
   {'column-name': 'holiday', 'data-type': 'float'},
   {'column-name': 'weekday', 'data-type': 'float'},
   {'column-name': 'workingday', 'data-type': 'float'},
   {'column-name': 'weathersit_1', 'data-type': 'float'},
   {'column-name': 'weathersit_2', 'data-type': 'float'},
   {'column-name': 'weathersit_3', 'data-type': 'float'},
   {'column-name': 'weathersit_4', 'data-type': 'float'},
   {'column-name': 'temp', 'data-type': 'float'},
   {'column-name': 'atemp', 'data-type': 'float'},
   {'column-name': 'hum', 'data-type': 'float'},
   {'column-name': 'windspeed', 'data-type'

In [25]:
# run the first three test rows through the uploaded model and check that
# the predictions agree with the local Keras model
first_rows = df_2012_processed.head(3)
pred = fiddler_api.run_model(project_id, model_id, first_rows)
local_pred = model.predict(x_test_processed[:3])
success = np.isclose(pred, local_pred).all()
display(pred)
assert success
if success:
    print('Model executes properly!')

Unnamed: 0,predicted_bike_rentals
0,32.117157
1,22.05661
2,12.183211


Model executes properly!


## 3.4 Putting it all together
Whew, that felt like a lot of work to build up a working `package.py`! Luckily, most of the above was just going into detail; the actual code needed to upload a TF model is not so long. Here we'll put everything needed into a single cell.

In [26]:
# preliminary info
project_id = 'bikeshare_forecasting'
model_id = 'processed_features_mlp'
tf_model = model
example_data = df_2011_processed
target_column_name = 'cnt'
model_dir = pathlib.Path('tf_model')

# list the features in the dataframe's own column order, which is the order
# the model was trained on; Index.difference() would return them sorted
# alphabetically and so record a different input order in the ModelInfo
feature_column_names = [column for column in example_data.columns
                        if column != target_column_name]

# create a ModelInfo
model_of_processed_features_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=fdl.DatasetInfo.from_dataframe(example_data),
    target=target_column_name,
    features=feature_column_names,
    input_type=fdl.ModelInputType.TABULAR,
)

# (re-)create directory for the model so no stale artifacts are uploaded
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# save the tensorflow model in the SavedModel format
tf.saved_model.save(tf_model, str(model_dir / 'saved_model'))

# create package.py: the glue code that exposes get_model(), with defaults
# derived from the file's own location so it can be called without arguments
package_py_contents = '''
import pathlib

import yaml

import fiddler as fdl
from fiddler.model_loaders import TFSavedModel

MODEL_DIR = pathlib.Path(__file__).parent

def load_model_info(model_dir):
    """Load ModelInfo from a model.yaml file"""
    with (pathlib.Path(model_dir) / 'model.yaml').open('r') as yaml_file:
        return fdl.ModelInfo.from_dict(yaml.load(yaml_file, Loader=yaml.SafeLoader))

def get_model(model_dir=MODEL_DIR, tf_saved_model_dir=MODEL_DIR / 'saved_model'):
    model_info = load_model_info(model_dir)
    output_names = model_info.get_output_names()
    is_binary_classification = (
        model_info.model_task.name 
            == fdl.ModelTask.BINARY_CLASSIFICATION.name
    )
    return TFSavedModel(
        tf_saved_model_dir, 
        output_column_names=output_names,
        is_binary_classification=is_binary_classification
    )
'''
with (model_dir / 'package.py').open('w') as f:
    f.write(package_py_contents)
    
# upload the model: make sure the project exists, drop any model previously
# uploaded under the same id, then upload the directory with its ModelInfo
fiddler_api.create_project(project_id)
if model_id in fiddler_api.list_models(project_id):
    fiddler_api.delete_model(project_id, model_id)
fiddler_api.upload_model_custom(
    artifact_path=model_dir, 
    info=model_of_processed_features_info, 
    project_id=project_id, 
    model_id=model_id,
    associated_dataset_ids=['bikeshare_processed']
)

# clean up local directory; the uploaded copy is what run_model() uses below
shutil.rmtree(model_dir, ignore_errors=True)

# verify the uploaded model runs on the first row of the example data
pred = fiddler_api.run_model(project_id, model_id, example_data.head(1))
print(f'Running on Fiddler, the model predicts {pred.iat[0,0]:.2f} for the first example row!')

Project already exists, no change.
Running on Fiddler, the model predicts 10.65 for the first example row!


## 3.5 Deploying feature preprocessing to Fiddler
You probably noticed that the above example runs on the `bikeshare_processed` dataset. Below, we show how to upload the `preprocessor` object along with the TF model and configure Fiddler to run the preprocessor alongside the model. We will do this by serializing the `preprocessor` using `pickle` and passing a custom `input_transformation` function to the `TFSavedModel` in our `package.py` which tells it to un-pickle the preprocessor and run it.

In [27]:
import pickle

# start again from an empty model directory
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# serialize the fitted preprocessing pipeline next to the model
(model_dir / 'preprocessor.pkl').write_bytes(pickle.dumps(preprocessor))

# export the tensorflow model in the SavedModel format
tf.saved_model.save(tf_model, str(model_dir / 'saved_model'))

# define a custom input_transformation
def custom_input_transformation(input_df):
    """Preprocess a raw-feature DataFrame into the model's list of input arrays.

    The pickled preprocessor is re-read from `model_dir` on every call.
    NOTE: unpickling executes code from the file, so only load pickles that
    you created yourself.
    """
    pickled_preprocessor = (model_dir / 'preprocessor.pkl').read_bytes()
    preproc = pickle.loads(pickled_preprocessor)
    return [preproc.transform(input_df)]

# the round-tripped preprocessor must reproduce the original processed features
(transformed_train,) = custom_input_transformation(x_train)
success = np.all(transformed_train == x_train_processed)
assert success
print('Custom input transformation works!')

Custom input transformation works!


In [28]:
# now, let's also create a new ModelInfo
# NOTE: this ModelInfo describes the raw (un-processed) dataset, so its
# features must be the raw feature columns in training order; `example_data`
# still refers to the processed dataframe at this point, whose column names
# (season_1, weathersit_2, ...) do not exist in df_2011
model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=fdl.DatasetInfo.from_dataframe(df_2011),
    target=target_column_name,
    features=x_train.columns.tolist(),
    input_type=fdl.ModelInputType.TABULAR,
)

In [29]:
# load a TFSavedModel with this custom input transformation
fiddler_model = TFSavedModel(
    saved_model_path=model_dir / 'saved_model',
    output_column_names=model_info.get_output_names(),
    input_transformation=custom_input_transformation
)

# verify this model works
margin_of_error = 0.001
# compare plain arrays: reducing a DataFrame with np.max() and then indexing
# the result with [0] depends on pandas-version-specific behaviour
loader_predictions = fiddler_model.predict(x_test).values
keras_predictions = model.predict(x_test_processed)
max_error = float(np.max(np.abs(loader_predictions - keras_predictions)))
success = max_error < margin_of_error
assert success
print(f'Success! All test-set predictions differ by less than {margin_of_error}.')

Success! All test-set predictions differ by less than 0.001.


### Speeding things up
There are a few tweaks we can make to ensure our model runs quickly on Fiddler. 

1. For models with a small memory footprint, we can safely increase the batch size of the TFSavedModel model loader, which will reduce the overhead incurred by feeding the Tensorflow graph many times.

2. Since Fiddler caches models rather than always reloading from disk, we can create a custom_input_transformation that loads the preprocessor before the model is initialized, as opposed to afterward during every .predict() call.

In [30]:
# define a model factory whose input transformation reuses one preprocessor
def get_fast_model():
    """Build a TFSavedModel that preprocesses raw inputs with a preloaded pipeline.

    The preprocessor is unpickled once, when this function runs, and captured
    by the nested transformation, so predict() calls do not re-read it from
    disk. A batch size of 512 is passed to the loader.
    """
    # load preprocessor before we define custom_input_transformation()
    preprocessor_path = model_dir / 'preprocessor.pkl'
    with preprocessor_path.open('rb') as pkl_file:
        preproc = pickle.load(pkl_file)

    def custom_input_transformation(input_df):
        return [preproc.transform(input_df)]

    loader_options = dict(
        saved_model_path=model_dir / 'saved_model',
        output_column_names=model_info.get_output_names(),
        # use a big batch size
        batch_size=512,
        input_transformation=custom_input_transformation,
    )
    return TFSavedModel(**loader_options)

fiddler_model = get_fast_model()
# compare plain arrays: reducing a DataFrame with np.max() and then indexing
# the result with [0] depends on pandas-version-specific behaviour
loader_predictions = fiddler_model.predict(x_test).values
keras_predictions = model.predict(x_test_processed)
max_error = float(np.max(np.abs(loader_predictions - keras_predictions)))
success = max_error < margin_of_error
assert success
print(f'Success! All test-set predictions differ by less than {margin_of_error}.')

Success! All test-set predictions differ by less than 0.001.


### Putting it all together with feature preprocessing
Let's put this all together to show how we can upload our model along with the preprocessor!

In [31]:
# preliminary info
project_id = 'bikeshare_forecasting'
model_id = 'raw_features_mlp'
tf_model = model
example_data = pd.concat([x_test, y_test], axis=1)
target_column_name = 'cnt'
model_dir = pathlib.Path('tf_model')

# list the features in the dataframe's own column order, which is the order
# the preprocessor was fitted on; Index.difference() would return them sorted
# alphabetically and so record a different input order in the ModelInfo
feature_column_names = [column for column in example_data.columns
                        if column != target_column_name]

# create a ModelInfo
model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=fdl.DatasetInfo.from_dataframe(example_data),
    target=target_column_name,
    features=feature_column_names,
    input_type=fdl.ModelInputType.TABULAR,
)

# (re-)create directory for the model so no stale artifacts are uploaded
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

# save the preprocessor; package.py below unpickles this file in get_model()
with (model_dir / 'preprocessor.pkl').open('wb') as pkl_file:
    pickle.dump(preprocessor, pkl_file)

# save the tensorflow model in the SavedModel format
tf.saved_model.save(tf_model, str(model_dir / 'saved_model'))

# create package.py: same glue code as before, except that get_model() also
# loads the pickled preprocessor once and passes a transformation using it
# (plus a larger batch size) to TFSavedModel
package_py_contents = '''
import pathlib
import pickle

import yaml

import fiddler as fdl
from fiddler.model_loaders import TFSavedModel

MODEL_DIR = pathlib.Path(__file__).parent

def load_model_info(model_dir):
    """Load ModelInfo from a model.yaml file"""
    with (pathlib.Path(model_dir) / 'model.yaml').open('r') as yaml_file:
        return fdl.ModelInfo.from_dict(yaml.load(yaml_file, Loader=yaml.SafeLoader))

def get_model(model_dir=MODEL_DIR, tf_saved_model_dir=MODEL_DIR / 'saved_model'):
    model_info = load_model_info(model_dir)
    output_names = model_info.get_output_names()
    is_binary_classification = (
        model_info.model_task.name 
            == fdl.ModelTask.BINARY_CLASSIFICATION.name
    )
    # load preprocessor before we define custom_input_transformation()
    with (MODEL_DIR / 'preprocessor.pkl').open('rb') as pkl_file:
        preproc = pickle.load(pkl_file)

    def custom_input_transformation(input_df):
        return [preproc.transform(input_df)]
    
    return TFSavedModel(
        tf_saved_model_dir, 
        output_column_names=output_names,
        is_binary_classification=is_binary_classification,
        # use a big batch size since the model is small
        batch_size=512,
        input_transformation=custom_input_transformation,
    )
'''
with (model_dir / 'package.py').open('w') as f:
    f.write(package_py_contents)

# (re-)upload our model, this time associated with the raw 'bikeshare' dataset
fiddler_api.create_project(project_id)
if model_id in fiddler_api.list_models(project_id):
    fiddler_api.delete_model(project_id, model_id)
fiddler_api.upload_model_custom(
    artifact_path=model_dir, 
    info=model_info, 
    project_id=project_id, 
    model_id=model_id,
    associated_dataset_ids=['bikeshare']
)

# clean up local directory; the uploaded copy is what run_model() uses below
shutil.rmtree(model_dir, ignore_errors=True)

# verify the uploaded model runs on the first row of the raw example data
pred = fiddler_api.run_model(project_id, model_id, example_data.head(1))
print(f'Running on Fiddler, the model predicts {pred.iat[0,0]:.2f} for the first example row!')

Project already exists, no change.
Running on Fiddler, the model predicts 32.12 for the first example row!
