# Transfer Learning Using Keras and MADlib

This is a transfer learning example based on https://keras.io/examples/mnist_transfer_cnn/ 

To load images into tables we use the script <em>madlib_image_loader.py</em>, located at https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning. It uses the Python Imaging Library (http://www.pythonware.com/products/pil/) and therefore supports multiple image formats.

## Table of contents
<a href="#import_libraries">1. Import libraries</a>

<a href="#load_and_prepare_data">2. Load and prepare data</a>

<a href="#image_preproc">3. Call image preprocessor</a>

<a href="#define_and_load_model">4. Define and load model architecture</a>

<a href="#train">5. Train</a>

<a href="#transfer_learning">6. Transfer learning</a>

In [1]:
# Load the `sql` IPython extension so the %sql / %%sql magics used in later cells are available
%load_ext sql

In [2]:
# Open the database connection used by the %sql / %%sql cells below.
# Only one connection line should be active; the other is kept as an alternative target.

# Greenplum Database 5.x on GCP - via tunnel (reached through local port 8000)
%sql postgresql://gpadmin@localhost:8000/madlib
        
# PostgreSQL local
#%sql postgresql://fmcquillan@localhost:5432/madlib

In [3]:
# Sanity check: report the installed MADlib version through the active %sql connection
%sql select madlib.version();
# Alternative check that calls the database's own version() function instead
#%sql select version();

1 rows affected.


version
"MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-89-g14a91ce, cmake configuration time: Fri Mar 5 23:08:38 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5"


<a id="import_libraries"></a>
# 1.  Import libraries
Import libraries and define some parameters, following https://keras.io/examples/mnist_transfer_cnn/

In [4]:
from __future__ import print_function

import datetime
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K

# Timing helper (call now() to get the current timestamp)
now = datetime.datetime.now

# Parameters taken from the Keras example; num_classes sizes the final Dense layer
batch_size, num_classes, epochs = 128, 5, 5

# Input images are img_rows x img_cols pixels
img_rows = 28
img_cols = 28

# Convolution settings: number of filters, kernel size, and max-pooling window
filters = 32
kernel_size = 3
pool_size = 2

# Place the single channel axis where the active Keras backend expects it
input_shape = (
    (1, img_rows, img_cols)
    if K.image_data_format() == 'channels_first'
    else (img_rows, img_cols, 1)
)

Other libraries needed in this notebook

In [5]:
import pandas as pd
import numpy as np

<a id="load_and_prepare_data"></a>
# 2.  Load and prepare data

First load MNIST data from Keras, consisting of 60,000 28x28 grayscale images of the 10 digits, along with a test set of 10,000 images.

In [6]:
# Fetch MNIST through Keras; it arrives already split into train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Boolean masks that pick out each half of the digit range
train_lt5_mask, train_gte5_mask = y_train < 5, y_train >= 5
test_lt5_mask, test_gte5_mask = y_test < 5, y_test >= 5

# Dataset 1: digits below 5, labels kept as they are
x_train_lt5, y_train_lt5 = x_train[train_lt5_mask], y_train[train_lt5_mask]
x_test_lt5, y_test_lt5 = x_test[test_lt5_mask], y_test[test_lt5_mask]

# Dataset 2: digits 5 and above, labels shifted down by 5
x_train_gte5, y_train_gte5 = x_train[train_gte5_mask], y_train[train_gte5_mask] - 5
x_test_gte5, y_test_gte5 = x_test[test_gte5_mask], y_test[test_gte5_mask] - 5

# Reshape every image array to (n_images,) + input_shape so it matches the model
# architecture; the shape of one array is printed before and after as a check
print(x_test_gte5.shape)
x_train_lt5 = x_train_lt5.reshape((len(x_train_lt5),) + input_shape)
x_test_lt5 = x_test_lt5.reshape((len(x_test_lt5),) + input_shape)
x_train_gte5 = x_train_gte5.reshape((len(x_train_gte5),) + input_shape)
x_test_gte5 = x_test_gte5.reshape((len(x_test_gte5),) + input_shape)
print(x_test_gte5.shape)

(4861, 28, 28)
(4861, 28, 28, 1)


Load the datasets into tables using the image loader script <em>madlib_image_loader.py</em>, located at https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning

In [7]:
# MADlib tools directory: the folder that contains madlib_image_loader.py.
# Set the MADLIB_SITE_DIR environment variable to point at your own copy;
# the original author's path is kept as the fallback so existing setups keep working.
import sys
import os
madlib_site_dir = os.environ.get(
    'MADLIB_SITE_DIR',
    '/Users/fmcquillan/Documents/Product/MADlib/Demos/data')

# Guard the append so re-running this cell does not add duplicate sys.path entries
if madlib_site_dir not in sys.path:
    sys.path.append(madlib_site_dir)

# Import image loader module (resolved through the sys.path entry added above)
from madlib_image_loader import ImageLoader, DbCredentials

In [8]:
# Database credentials the image loader uses to connect.
# The active settings go through the local tunnel on port 8000; to connect
# directly instead, use host='35.239.240.26' with port='5432'.
loader_connection_settings = {
    'user': 'gpadmin',
    'host': 'localhost',
    'port': '8000',
    'password': '',
}
db_creds = DbCredentials(**loader_connection_settings)

In [9]:
# Initialize ImageLoader (increase num_workers to run faster)
# db_creds is the DbCredentials object built from the connection settings above.
iloader = ImageLoader(num_workers=5, db_creds=db_creds)

In [10]:
# Drop tables
%sql DROP TABLE IF EXISTS train_lt5, test_lt5, train_gte5, test_gte5

# Save images to temporary directories and load into database
iloader.load_dataset_from_np(x_train_lt5, y_train_lt5, 'train_lt5', append=False)
iloader.load_dataset_from_np(x_test_lt5, y_test_lt5, 'test_lt5', append=False)
iloader.load_dataset_from_np(x_train_gte5, y_train_gte5, 'train_gte5', append=False)
iloader.load_dataset_from_np(x_test_gte5, y_test_gte5, 'test_gte5', append=False)

Done.


[]

MainProcess: Connected to madlib db.
Executing: CREATE TABLE train_lt5 (id SERIAL, x REAL[], y TEXT[])
CREATE TABLE
Created table train_lt5 in madlib db
Spawning 5 workers...
Initializing PoolWorker-1 [pid 84275]
PoolWorker-1: Created temporary directory /tmp/madlib_5TU8FybuWQ
Initializing PoolWorker-2 [pid 84276]
PoolWorker-2: Created temporary directory /tmp/madlib_LjDRu2RVLy
Initializing PoolWorker-3 [pid 84277]
PoolWorker-3: Created temporary directory /tmp/madlib_ksuUrx0mOn
Initializing PoolWorker-4 [pid 84278]
PoolWorker-4: Created temporary directory /tmp/madlib_f2SlPjS13H
PoolWorker-5: Created temporary directory /tmp/madlib_8GA0SlnXzj
Initializing PoolWorker-5 [pid 84279]
PoolWorker-4: Connected to madlib db.
PoolWorker-5: Connected to madlib db.
PoolWorker-2: Connected to madlib db.
PoolWorker-1: Connected to madlib db.
PoolWorker-3: Connected to madlib db.
PoolWorker-5: Wrote 1000 images to /tmp/madlib_8GA0SlnXzj/train_lt50000.tmp
PoolWorker-2: Wrote 1000 images to /tmp/madl

BadCopyFileFormat: array value must start with "{" or dimension information  (seg0 10.128.0.41:40000 pid=18042)
CONTEXT:  COPY train_lt5, line 1, column {{{0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}...


<a id="image_preproc"></a>
# 3. Call image preprocessor

The preprocessor transforms the data from one image per row to multiple images per row for batch optimization. It also normalizes the images and one-hot encodes the labels.

Training dataset < 5

In [None]:
%%sql
-- Pack the training images for digits below 5.
-- Outputs of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS train_lt5_packed, train_lt5_packed_summary;

SELECT madlib.training_preprocessor_dl('train_lt5',               -- Source table
                                       'train_lt5_packed',        -- Output table
                                       'y',                       -- Dependent variable
                                       'x',                       -- Independent variable
                                        1000,                     -- Buffer size
                                        255                       -- Normalizing constant
                                        );

-- Inspect the summary table that accompanies the packed output
SELECT * FROM train_lt5_packed_summary;

Test dataset < 5

In [None]:
%%sql
-- Pack the test images for digits below 5.
-- Outputs of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS test_lt5_packed, test_lt5_packed_summary;

-- The packed training table is passed as the last argument, presumably so the
-- test data is prepared the same way as the training data (TODO confirm)
SELECT madlib.validation_preprocessor_dl('test_lt5',                -- Source table
                                         'test_lt5_packed',         -- Output table
                                         'y',                       -- Dependent variable
                                         'x',                       -- Independent variable
                                         'train_lt5_packed'         -- Training preproc table
                                        );

-- Inspect the summary table that accompanies the packed output
SELECT * FROM test_lt5_packed_summary;

Training dataset >= 5

In [None]:
%%sql
-- Pack the training images for digits 5 and above (labels were shifted down by 5 in Python).
-- Outputs of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS train_gte5_packed, train_gte5_packed_summary;

SELECT madlib.training_preprocessor_dl('train_gte5',              -- Source table
                                       'train_gte5_packed',       -- Output table
                                       'y',                       -- Dependent variable
                                       'x',                       -- Independent variable
                                        1000,                     -- Buffer size
                                        255                       -- Normalizing constant
                                        );

-- Inspect the summary table that accompanies the packed output
SELECT * FROM train_gte5_packed_summary;

Test dataset >= 5

In [None]:
%%sql
-- Pack the test images for digits 5 and above.
-- Outputs of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS test_gte5_packed, test_gte5_packed_summary;

-- The packed training table is passed as the last argument, presumably so the
-- test data is prepared the same way as the training data (TODO confirm)
SELECT madlib.validation_preprocessor_dl('test_gte5',             -- Source table
                                         'test_gte5_packed',      -- Output table
                                         'y',                     -- Dependent variable
                                         'x',                     -- Independent variable
                                         'train_gte5_packed'      -- Training preproc table
                                        );

-- Inspect the summary table that accompanies the packed output
SELECT * FROM test_gte5_packed_summary;

<a id="define_and_load_model"></a>
# 4. Define and load model architecture

Model with feature and classification layers trainable

In [None]:
# Two groups of layers, kept in separate lists so the convolutional group can be
# frozen independently later: feature extraction (convolutions) and classification (dense)
feature_layers = [
    Conv2D(filters, kernel_size, padding='valid', input_shape=input_shape),
    Activation('relu'),
    Conv2D(filters, kernel_size),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),
    Flatten(),
]

classification_layers = [
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
]

# Assemble the complete model: feature layers first, then classification layers
model = Sequential()
for stacked_layer in feature_layers + classification_layers:
    model.add(stacked_layer)

model.summary()

Load into model architecture table using psycopg2

In [None]:
# A separate psycopg2 connection is opened here, to the same endpoint as the %sql
# connection, so the model JSON can be passed as a bound query parameter.
# conn, cur and query are reused by a later cell, so their names must stay as they are.
import psycopg2 as p2
#conn = p2.connect('postgresql://gpadmin@35.239.240.26:5432/madlib')
conn = p2.connect('postgresql://gpadmin@localhost:8000/madlib')
cur = conn.cursor()

# Drop the architecture table first so this run starts from an empty table
%sql DROP TABLE IF EXISTS model_arch_library;
# The two %s placeholders receive the architecture JSON and a descriptive name.
# The third argument is left NULL (presumably the initial weights - TODO confirm).
query = "SELECT madlib.load_keras_model('model_arch_library', %s, NULL, %s)"
cur.execute(query,[model.to_json(), "feature + classification layers trainable"])
conn.commit()

# check model loaded OK
%sql SELECT model_id, name FROM model_arch_library;

Model with feature layers frozen

In [None]:
# Mark every convolutional feature layer as non-trainable so that only the
# dense classification layers remain trainable
for feature_layer in feature_layers:
    feature_layer.trainable = False

model.summary()

Load into transfer model architecture table using psycopg2

In [None]:
cur.execute(query,[model.to_json(), "only classification layers trainable"])
conn.commit()

# check model loaded OK
%sql SELECT model_id, name FROM model_arch_library ORDER BY model_id;

<a id="train"></a>
# 5.  Train
Train the model for 5-digit classification [0..4]  

In [None]:
%%sql
-- Train model arch id 1 (the entry loaded with all layers trainable) on the
-- packed training set for digits below 5.
-- Results of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS mnist_model, mnist_model_summary;

SELECT madlib.madlib_keras_fit('train_lt5_packed',    -- source table
                               'mnist_model',         -- model output table
                               'model_arch_library',  -- model arch table
                                1,                    -- model arch id
                                $$ loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy']$$,  -- compile_params
                                $$ batch_size=128, epochs=1 $$,  -- fit_params
                                5                     -- num_iterations
                              );

View the model summary:

In [None]:
%%sql
-- Inspect the summary table that accompanies the mnist_model training output
SELECT * FROM mnist_model_summary;

Evaluate using test data

In [None]:
%%sql
-- Evaluate the model trained on digits below 5 against the matching packed test set.
-- The previous evaluation output is dropped first so the cell can be re-run.
DROP TABLE IF EXISTS mnist_validate;

SELECT madlib.madlib_keras_evaluate('mnist_model',      -- model
                                   'test_lt5_packed',   -- test table
                                   'mnist_validate'     -- output table
                                   );

-- Show the evaluation results
SELECT * FROM mnist_validate;

<a id="transfer_learning"></a>
# 6. Transfer learning

Use UPDATE to load trained weights from previous run into the model library table:

In [None]:
%%sql
-- Copy the trained weights from mnist_model into the model_arch_library row with
-- model_id 2 (presumably the frozen-feature entry, which was loaded second).
-- The FROM clause has no join condition, so this relies on mnist_model holding
-- a single row (TODO confirm).
UPDATE model_arch_library
SET model_weights = mnist_model.model_weights
FROM mnist_model
WHERE model_arch_library.model_id = 2;

Transfer: train dense layers for new classification task [5..9]

In [None]:
%%sql
-- Train model arch id 2 on the packed training set for digits 5 and above.
-- Compile and fit parameters are the same as in the first training run.
-- Results of any previous run are dropped first so the cell can be re-run.
DROP TABLE IF EXISTS mnist_transfer_model, mnist_transfer_model_summary;

SELECT madlib.madlib_keras_fit('train_gte5_packed',   -- source table
                               'mnist_transfer_model',-- model output table
                               'model_arch_library',  -- model arch table
                                2,                    -- model arch id
                                $$ loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy']$$,  -- compile_params
                                $$ batch_size=128, epochs=1 $$,  -- fit_params
                                5                     -- num_iterations
                              );

View the model summary

In [None]:
%%sql
-- Inspect the summary table that accompanies the mnist_transfer_model training output
SELECT * FROM mnist_transfer_model_summary;

Evaluate using test data

In [None]:
%%sql
-- Evaluate the transfer model against the packed test set for digits 5 and above.
-- The previous evaluation output is dropped first so the cell can be re-run.
DROP TABLE IF EXISTS mnist_transfer_validate;

SELECT madlib.madlib_keras_evaluate('mnist_transfer_model',      -- model
                                   'test_gte5_packed',           -- test table
                                   'mnist_transfer_validate'     -- output table
                                   );

-- Show the evaluation results
SELECT * FROM mnist_transfer_validate;