# Setup: make sure you have the right versions of packages
You may need to restart the kernel after running this cell for the first time.

In [1]:
# ensure TF 1 not TF 2 (use version corresponding to what's in your Fiddler configuration, 1.14 as of 2020-02-26)
!pip install --upgrade tensorflow==1.14.*

import tensorflow as tf
# NOTE: we can only run tf.saved_model.save() on a tf.keras model if we use eager execution
tf.compat.v1.enable_eager_execution()

# triple-check version is 1.x
print('TF version:', tf.__version__)
assert tf.__version__[0] == '1', 'Stop! This tutorial is meant to use TF 1.x!'

# likewise, make sure the scikit-learn version matches what's in Fidler (0.21 as of 2020-02-26)
!pip install --upgrade scikit-learn==0.21.*

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: tensorflow==1.14.* in /home/jupyterdocker/.local/lib/python3.7/site-packages (1.14.0)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


TF version: 1.14.0
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: scikit-learn==0.21.* in /home/jupyterdocker/.local/lib/python3.7/site-packages (0.21.3)


In [2]:
import io
import os
import pathlib
import pickle
import shutil
import time
import zipfile

import category_encoders
import numpy as np
import pandas as pd
import requests
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing

import fiddler as fdl

In [3]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Intro
In tutorial_03, we covered advanced model upload with TensorFlow. This tutorial is a short extension that demonstrates the equivalent workflow for a Keras model.

# Section 0: Configure connection to Fiddler.

In [4]:
# Connection settings for the Fiddler instance this notebook talks to.
#
# API URL: typically "https://api.fiddler.ai", or "http://localhost:4100" for onebox.
# If Jupyter is running in a docker VM on the same macOS machine as onebox,
# use "http://host.docker.internal:4100" instead.
url = 'http://localhost:4100'

# Organization id: listed as "Organization Name" under <Fiddler URL>/settings/general
org_id = 'onebox'

# Auth token: read from the environment so it is never stored in the notebook.
# Find, create, or change it under <Fiddler URL>/settings/credentials
token = os.environ.get('FIDDLER_API_TOKEN')

fiddler_api = fdl.FiddlerApi(
    url=url,
    org_id=org_id,
    auth_token=token,
)

# Section 1: Preparing data and model.
For a more step-by-step version of this section, see tutorial_03.

In [5]:
# Download the UCI Bike Sharing dataset (a zip archive) into memory.
zip_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
response = requests.get(zip_url, timeout=60)
# fail loudly on an HTTP error instead of handing an error page to zipfile
response.raise_for_status()

# here we pre-configure the datatypes for our dataframe
# so it doesn't require any datatype modification after import
bikeshare_dtypes = dict(season='category', holiday='bool',
                        workingday='bool', weathersit='category')
bikeshare_datetime_columns = ['dteday']
bikeshare_index_column = 'instant'
# context managers make sure both the archive and the member file are closed
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('hour.csv') as csv:
        df = pd.read_csv(csv,
                         dtype=bikeshare_dtypes,
                         parse_dates=bikeshare_datetime_columns,
                         index_col=bikeshare_index_column)

# split train/test by year
is_2011 = df['yr'] == 0
df_2011 = df[is_2011].reset_index(drop=True)
df_2012 = df[~is_2011].reset_index(drop=True)

# specify which columns are features and which are not
target = 'cnt'
not_used_as_features = ['dteday', 'yr', 'casual', 'registered']
non_feature_columns = [target] + not_used_as_features
# keep the dataframe's column order (a set difference would give an arbitrary order)
feature_columns = [column for column in df_2011.columns
                   if column not in non_feature_columns]

# split our data into features and targets
x_train = df_2011.drop(columns=non_feature_columns)
x_test = df_2012.drop(columns=non_feature_columns)
y_train = df_2011[target]
y_test = df_2012[target]

# preprocessing pipeline: one-hot encode the categorical columns, then standardize;
# it is fit on the training split only and then applied to both splits
onehot = category_encoders.OneHotEncoder(cols=df.select_dtypes('category').columns.tolist())
standard_scaler = sklearn.preprocessing.StandardScaler()
preprocessor = sklearn.pipeline.make_pipeline(onehot, standard_scaler)
preprocessor.fit(x_train)
x_train_processed = preprocessor.transform(x_train)
x_test_processed = preprocessor.transform(x_test)

# delete the datasets if we've uploaded them previously
for stale_dataset_id in ('bikeshare', 'bikeshare_processed'):
    try:
        fiddler_api.delete_dataset(stale_dataset_id)
    except Exception:
        # best effort: presumably the dataset simply has not been uploaded yet.
        # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupts still propagate.
        pass
time.sleep(2)

# let's upload the original dataset
fiddler_api.upload_dataset(
    dataset={'train': df_2011, 'test': df_2012},
    dataset_id='bikeshare')

# let's also upload the preprocessed version of the dataset
# (the y_* series share the reset 0..n-1 index, so concatenating on axis=1 lines rows up)
df_2011_processed = pd.concat([pd.DataFrame(x_train_processed, columns=onehot.feature_names), y_train], axis=1)
df_2012_processed = pd.concat([pd.DataFrame(x_test_processed, columns=onehot.feature_names), y_test], axis=1)
fiddler_api.upload_dataset(
    dataset={'train': df_2011_processed, 'test': df_2012_processed},
    dataset_id='bikeshare_processed')

print('Listing all datasets in Fiddler, you should see "bikeshare" and "bikeshare_processed" in this list.')
fiddler_api.list_datasets()

Heads up! We are inferring the details of your dataset from the dataframe(s) provided. Please take a second to check our work.

If the following DatasetInfo is an incorrect representation of your data, you can construct a DatasetInfo with the DatasetInfo.from_dataframe() method and modify that object to reflect the correct details of your dataset.

After constructing a corrected DatasetInfo, please re-upload your dataset with that DatasetInfo object explicitly passed via the `info` parameter of FiddlerApi.upload_dataset().

You may need to delete the initially uploaded versionvia FiddlerApi.delete_dataset('bikeshare').

Inferred DatasetInfo to check:
  DatasetInfo:
    display_name: bikeshare
    files: []
    columns:
              column     dtype count(possible_values)
      0       dteday    STRING                      -
      1       season  CATEGORY                      4
      2           yr   INTEGER                      -
      3         mnth   INTEGER                      -
 

Listing all datasets in Fiddler, you should see "bikeshare" and "bikeshare_processed" in this list.


['time_split_dataset',
 '20news',
 'titanic',
 'iris',
 'bank_churn',
 'bikeshare_processed',
 'bikeshare',
 'imdb_rnn',
 'winequality',
 'p2p_loans']

# Section 2: Building a Keras model

In [6]:
# Fix the random seeds so that a Restart & Run All gives a repeatable model and score.
# NOTE(review): best effort -- this is intended to cover weight initialisation and
# batch shuffling; confirm no remaining source of non-determinism on your setup.
SEED = 42
np.random.seed(SEED)
tf.compat.v1.set_random_seed(SEED)

# Train a 2-layer MLP model: two 128-unit ReLU hidden layers and a single linear output
inputs = tf.keras.Input(shape=(x_train_processed.shape[1],))
activations = tf.keras.layers.Dense(128, activation=tf.nn.relu)(inputs)
activations = tf.keras.layers.Dense(128, activation=tf.nn.relu)(activations)
activations = tf.keras.layers.Dense(1)(activations)
model = tf.keras.Model(inputs=inputs, outputs=activations, name='keras_bikeshare_mlp_model')

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
    loss='mse')
# two training phases: a short one with small batches, then a longer one
# with a different learning rate and larger batches
model.fit(x_train_processed, y_train.values, batch_size=16, epochs=2)
model.optimizer.learning_rate = 0.01
model.fit(x_train_processed, y_train.values, batch_size=32, epochs=8)

# evaluate on the held-out 2012 data
y_hat = model.predict(x_test_processed)
r2 = sklearn.metrics.r2_score(y_test, y_hat)
print(f'The model achieves a test-set r2 score of {r2:.2f}')

Epoch 1/2
Epoch 2/2
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
The model achieves a test-set r2 score of 0.58


# Section 3: Uploading Keras model to Fiddler
Here we adapt the approach from the previous tutorial to save the model in the Keras .h5 (HDF5) format.

In [7]:
# NOTE(review): KerasModel is only used inside the generated package.py below, not in this
# cell itself; importing it here presumably surfaces an incompatible fiddler install early.
from fiddler.model_loaders import KerasModel

# preliminary info
project_id = 'bikeshare_forecasting'
model_id = 'raw_features_mlp'
tf_model = model
example_data = pd.concat([x_test, y_test], axis=1)
target_column_name = 'cnt'
model_dir = pathlib.Path('tf_model')

# create a ModelInfo describing the raw (un-preprocessed) inputs and the target
model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=fdl.DatasetInfo.from_dataframe(example_data),
    target=target_column_name,
    features=example_data.columns.difference([target_column_name]).tolist(),
    input_type=fdl.ModelInputType.TABULAR,
)

# source of the package.py that is uploaded next to the model artifacts;
# its get_model() loads the pickled preprocessor and the .h5 model
package_py_contents = '''
import pathlib
import pickle

import yaml

import fiddler as fdl
from fiddler.model_loaders import KerasModel

MODEL_DIR = pathlib.Path(__file__).parent

def load_model_info(model_dir):
    """Load ModelInfo from a model.yaml file"""
    with (pathlib.Path(model_dir) / 'model.yaml').open('r') as yaml_file:
        return fdl.ModelInfo.from_dict(yaml.load(yaml_file, Loader=yaml.SafeLoader))

def get_model(model_dir=MODEL_DIR, tf_saved_model_dir=None):
    """Build the KerasModel wrapper from the artifacts stored in model_dir.

    tf_saved_model_dir defaults to the model.h5 file inside model_dir.
    """
    # resolve every artifact relative to the directory the caller asked for
    model_dir = pathlib.Path(model_dir)
    if tf_saved_model_dir is None:
        tf_saved_model_dir = model_dir / 'model.h5'

    model_info = load_model_info(model_dir)
    output_names = model_info.get_output_names()
    is_binary_classification = (
        model_info.model_task.name 
            == fdl.ModelTask.BINARY_CLASSIFICATION.name
    )
    # load preprocessor before we define custom_input_transformation()
    with (model_dir / 'preprocessor.pkl').open('rb') as pkl_file:
        preproc = pickle.load(pkl_file)

    def custom_input_transformation(input_df):
        return [preproc.transform(input_df)]
    
    return KerasModel(
        tf_saved_model_dir, 
        output_column_names=output_names,
        is_binary_classification=is_binary_classification,
        input_transformation=custom_input_transformation,
    )
'''

# (re-)create directory for the model
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()
try:
    # save the preprocessor
    with (model_dir / 'preprocessor.pkl').open('wb') as pkl_file:
        pickle.dump(preprocessor, pkl_file)

    # save the keras model (weights + architecture only, no optimizer state)
    tf_model.save(str(model_dir / 'model.h5'), include_optimizer=False)

    # create package.py
    with (model_dir / 'package.py').open('w') as f:
        f.write(package_py_contents)

    # (re-)upload our model
    fiddler_api.create_project(project_id)
    if model_id in fiddler_api.list_models(project_id):
        fiddler_api.delete_model(project_id, model_id)
    time.sleep(2)
    fiddler_api.upload_model_custom(
        artifact_path=model_dir,
        info=model_info,
        project_id=project_id,
        model_id=model_id,
        associated_dataset_ids=['bikeshare']
    )
finally:
    # clean up local directory, even when saving or uploading failed
    shutil.rmtree(model_dir, ignore_errors=True)

# verify the uploaded model runs
time.sleep(2)
pred = fiddler_api.run_model(project_id, model_id, example_data.head(1))
print(f'Running on Fiddler, the model predicts {pred.iat[0,0]:.2f} for the first example row, locally running the model before gave {float(y_hat[0]):.2f}!')

Project already exists, no change.
Running on Fiddler, the model predicts 39.31 for the first example row, locally running the model before gave 39.31!
