# Inference for CIFAR-10 dataset using predict BYOM
The predict BYOM function allows you to do inference using models that have not been trained with MADlib, but rather imported or created elsewhere. It was added in MADlib 1.17.

In this workbook we train a model in Python using
https://keras.io/examples/cifar10_cnn/
and run inference on the validation set.

## Table of contents

<a href="#setup">1. Setup</a>

<a href="#train_model">2. Train model in Python</a>

<a href="#load_model">3. Load model into table</a>

<a href="#load_images">4. Get validation data set and load into table</a>

<a href="#inference">5. Inference</a>

<a id="setup"></a>
# 1. Setup

In [1]:
# Load the `sql` IPython extension that provides the %sql / %%sql magics used in the cells below.
%load_ext sql

In [2]:
# Open the database connection used by the %sql / %%sql magics.
# Exactly one connection line should be active; the alternative is left commented out.

# Greenplum Database 5.x on GCP - via tunnel
%sql postgresql://gpadmin@localhost:8000/madlib
        
# PostgreSQL local
#%sql postgresql://fmcquillan@localhost:5432/madlib

In [3]:
# Show the installed MADlib version; the plain version() query is left disabled.
%sql select madlib.version();
#%sql select version();

1 rows affected.


version
"MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-89-g9d9f756, cmake configuration time: Thu Mar 4 23:11:53 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5"


<a id="train_model"></a>
# 2. Train model in Python

Train a model in Python using https://keras.io/examples/cifar10_cnn/

Define model

In [4]:
from __future__ import print_function
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import os

batch_size = 32
num_classes = 10
epochs = 2
data_augmentation = True
num_predictions = 20
#save_dir = os.path.join(os.getcwd(), 'saved_models')
#model_name = 'keras_cifar10_trained_model.h5'

# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# initiate RMSprop optimizer
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

# Let's train the model using RMSprop
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy']);

x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [5]:
# Display the model architecture serialized as a JSON string.
model.to_json()

'{"class_name": "Sequential", "keras_version": "2.2.4-tf", "config": {"layers": [{"class_name": "Conv2D", "config": {"kernel_initializer": {"class_name": "GlorotUniform", "config": {"dtype": "float32", "seed": null}}, "name": "conv2d", "kernel_constraint": null, "bias_regularizer": null, "bias_constraint": null, "dtype": "float32", "activation": "linear", "trainable": true, "data_format": "channels_last", "padding": "same", "strides": [1, 1], "dilation_rate": [1, 1], "kernel_regularizer": null, "filters": 32, "bias_initializer": {"class_name": "Zeros", "config": {"dtype": "float32"}}, "batch_input_shape": [null, 32, 32, 3], "use_bias": true, "activity_regularizer": null, "kernel_size": [3, 3]}}, {"class_name": "Activation", "config": {"dtype": "float32", "activation": "relu", "trainable": true, "name": "activation"}}, {"class_name": "Conv2D", "config": {"kernel_initializer": {"class_name": "GlorotUniform", "config": {"dtype": "float32", "seed": null}}, "name": "conv2d_1", "kernel_const

In [7]:
# Convert the images to float32 and divide the pixel values by 255.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would divide the already-scaled arrays by 255 a second time.
if x_train.dtype != 'float32':
    x_train = x_train.astype('float32')
    x_train /= 255
if x_test.dtype != 'float32':
    x_test = x_test.astype('float32')
    x_test /= 255

if not data_augmentation:
    print('Not using data augmentation.')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              shuffle=True)
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        zca_epsilon=1e-06,  # epsilon for ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.1,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.1,
        shear_range=0.,  # set range for random shear
        zoom_range=0.,  # set range for random zoom
        channel_shift_range=0.,  # set range for random channel shifts
        # set mode for filling points outside the input boundaries
        fill_mode='nearest',
        cval=0.,  # value used for fill_mode = "constant"
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,  # randomly flip images
        # set rescaling factor (applied before any other transformation)
        rescale=None,
        # set function that will be applied on each input
        preprocessing_function=None,
        # image data format, either "channels_first" or "channels_last"
        data_format=None,
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
    # favor of fit(); kept here for the Keras version this notebook was run with.
    model.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        epochs=epochs,
                        validation_data=(x_test, y_test),
                        workers=1)

# Save model and weights
#if not os.path.isdir(save_dir):
#    os.makedirs(save_dir)
#model_path = os.path.join(save_dir, model_name)
#model.save(model_path)
#print('Saved trained model at %s ' % model_path)

# Score trained model on the test split (loss first, then accuracy).
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

Using real-time data augmentation.
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x14dfc98d0>

Test loss: 1.4364811393737793
Test accuracy: 0.4754


<a id="load_model"></a>
# 3.  Load model into table

Load the model architecture and weights into the model architecture table

In [10]:
import psycopg2 as p2
conn = p2.connect('postgresql://gpadmin@localhost:8000/madlib')
#conn = p2.connect('postgresql://fmcquillan@localhost:5432/madlib')
cur = conn.cursor()

from keras.layers import *
from keras import Sequential
import numpy as np

# get weights, flatten and serialize
weights = model.get_weights()
weights_flat = [w.flatten() for w in weights]
weights1d =  np.concatenate(weights_flat).ravel()
weights_bytea = p2.Binary(weights1d.tostring())

%sql DROP TABLE IF EXISTS model_arch_library_cifar10;
query = "SELECT madlib.load_keras_model('model_arch_library_cifar10', %s,%s,%s,%s)"
cur.execute(query,[model.to_json(), weights_bytea, "CIFAR10 model", "CNN model with weights trained on CIFAR10."])
conn.commit()

# check weights loaded OK
%sql SELECT model_id, name, description FROM model_arch_library_cifar10;

Done.


[]

1 rows affected.


model_id,name,description
1,CIFAR10 model,CNN model with weights trained on CIFAR10.


<a id="load_images"></a>
# 4. Get validation data set and load into table

First set up image loader using the script called <em>madlib_image_loader.py</em> located at https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning

In [9]:
import os
import sys

# Directory that contains madlib_image_loader.py.  Set the MADLIB_SITE_DIR
# environment variable to override the default instead of editing this path.
madlib_site_dir = os.environ.get(
    'MADLIB_SITE_DIR',
    '/Users/fmcquillan/Documents/Product/MADlib/Demos/data')
# Only extend sys.path once so that re-running the cell does not add duplicates.
if madlib_site_dir not in sys.path:
    sys.path.append(madlib_site_dir)

# Import image loader module
from madlib_image_loader import ImageLoader, DbCredentials

# Specify database credentials, for connecting to db
#db_creds = DbCredentials(user='fmcquillan',
#                         host='localhost',
#                         port='5432',
#                         password='')

# Specify database credentials, for connecting to db.
# The password is taken from PGPASSWORD when set and is otherwise empty.
db_creds = DbCredentials(user='gpadmin',
                         db_name='madlib',
                         host='localhost',
                         port='8000',
                         password=os.environ.get('PGPASSWORD', ''))

# Initialize ImageLoader (increase num_workers to run faster)
iloader = ImageLoader(num_workers=5, db_creds=db_creds)

Next load CIFAR-10 data from Keras consisting of 50,000 32x32 color training images, labeled over 10 categories, and 10,000 test images.

In [10]:
from keras.datasets import cifar10

# Load dataset into np array
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

%sql DROP TABLE IF EXISTS cifar_10_test_data;

# Save images to temporary directories and load into database
#iloader.load_dataset_from_np(x_train, y_train, 'cifar_10_train_data', append=False)
iloader.load_dataset_from_np(x_test, y_test, 'cifar_10_test_data', append=False)

Done.


[]

MainProcess: Connected to madlib db.
Executing: CREATE TABLE cifar_10_test_data (id SERIAL, x REAL[], y TEXT)
CREATE TABLE
Created table cifar_10_test_data in madlib db
Spawning 5 workers...
Initializing PoolWorker-1 [pid 95042]
PoolWorker-1: Created temporary directory /tmp/madlib_dTZhEGBDFE
Initializing PoolWorker-2 [pid 95043]
PoolWorker-2: Created temporary directory /tmp/madlib_ctWjbhcjwz
Initializing PoolWorker-3 [pid 95044]
PoolWorker-3: Created temporary directory /tmp/madlib_nx9VuMScrX
Initializing PoolWorker-4 [pid 95045]
PoolWorker-4: Created temporary directory /tmp/madlib_thkphNCw4r
Initializing PoolWorker-5 [pid 95046]
PoolWorker-5: Created temporary directory /tmp/madlib_037luEXgEL
PoolWorker-2: Connected to madlib db.
PoolWorker-3: Connected to madlib db.
PoolWorker-1: Connected to madlib db.
PoolWorker-5: Connected to madlib db.
PoolWorker-4: Connected to madlib db.
PoolWorker-3: Wrote 1000 images to /tmp/madlib_nx9VuMScrX/cifar_10_test_data0000.tmp
PoolWorker-1: Wrote

<a id="inference"></a>
# 5. Inference

In [11]:
%%sql
DROP TABLE IF EXISTS cifar10_predict_byom;

SELECT madlib.madlib_keras_predict_byom(
    'model_arch_library_cifar10',  -- model architecture table
    1,                             -- model id within that table
    'cifar_10_test_data',          -- table holding the images to score
    'id',                          -- id column
    'x',                           -- independent variable column
    'cifar10_predict_byom',        -- output table
    'response',                    -- prediction type
    FALSE,                         -- use GPUs
    NULL,                          -- class values
    255.0                          -- normalizing constant
);

SELECT *
FROM cifar10_predict_byom
ORDER BY id
LIMIT 10;

Done.


InternalError: (psycopg2.errors.InternalError_) plpy.Error: Unable to get number of classes from model architecture. (plpython.c:5038)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "madlib_keras_predict_byom", line 23, in <module>
    madlib_keras_predict.PredictBYOM(**globals())
  PL/Python function "madlib_keras_predict_byom", line 42, in wrapper
  PL/Python function "madlib_keras_predict_byom", line 314, in __init__
  PL/Python function "madlib_keras_predict_byom", line 326, in validate_and_set_defaults
  PL/Python function "madlib_keras_predict_byom", line 207, in set_default_class_values
  PL/Python function "madlib_keras_predict_byom", line 75, in get_num_classes
PL/Python function "madlib_keras_predict_byom"

[SQL: SELECT madlib.madlib_keras_predict_byom('model_arch_library_cifar10',  -- model arch table
                                         1,                            -- model arch id
                                        'cifar_10_test_data',          -- test_table
                                        'id',                          -- id column
                                        'x',                           -- independent var
                                        'cifar10_predict_byom',        -- output table
                                        'response',                    -- prediction type
                                         FALSE,                        -- use gpus
                                         NULL,                         -- class values
                                         255.0                         -- normalizing const
                                   );]
(Background on this error at: http://sqlalche.me/e/2j85)

Number of misclassifications:

In [5]:
%%sql
SELECT COUNT(*)
FROM cifar10_predict_byom AS pred
INNER JOIN cifar_10_test_data AS truth
    ON pred.id = truth.id                      -- pair each prediction with its labeled image
WHERE pred.estimated_dependent_var <> truth.y; -- keep only rows where the prediction differs from the label

1 rows affected.


count
2551


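Predict accuracy. The accuracy claimed at https://keras.io/examples/cifar10_cnn/ is 75% on the validation set after 25 epochs. The results below come from a run whose Keras test accuracy was 0.7449 (the 2-epoch run shown above reports 0.4754, so the outputs below appear to be from an earlier, longer training run). MADlib predict BYOM accuracy matches: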

In [6]:
%%sql
SELECT round(count(*) * 100.0
             / NULLIF((SELECT count(*) FROM cifar_10_test_data), 0),  -- test-set size taken from the table, not hard-coded
             2) AS test_accuracy_percent
FROM
    (SELECT cifar_10_test_data.y AS actual,
            cifar10_predict_byom.estimated_dependent_var AS estimated
     FROM cifar10_predict_byom
     INNER JOIN cifar_10_test_data
         ON cifar_10_test_data.id = cifar10_predict_byom.id) q
WHERE q.actual = q.estimated;  -- count only correctly classified images

1 rows affected.


test_accuracy_percent
74.49
