In [None]:
# Ref
# https://docs.databricks.com/_static/notebooks/deep-learning/dist-img-infer-3-keras-udf.html
# https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html

In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys
print('You have pyspark version : ', os.listdir('/opt/spark/versions'))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
os.environ['SPARK_HOME'] = '/opt/spark/versions/spark-2.3'

You have pyspark version :  ['spark-2.3', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4']


In [20]:
from os.path import join
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '10g')
    .set('spark.driver.maxResultSize', '5g')
   )

In [7]:
spark = (Session
     .builder
     .appName('keras-model-pandas-udf')
     .master('local[4]')
     .config(conf=conf)
     .getOrCreate())

In [8]:
spark

# Data

In [13]:


IMG_DATA_PATH = join('data','img_url_sdf_100.parquet')

In [14]:
img_sdf = spark.read.parquet(IMG_DATA_PATH)
print(img_sdf.count())
img_sdf.printSchema()
img_sdf.show(n=5, vertical=True)

100
root
 |-- mapping_name: string (nullable = true)
 |-- hash_id: string (nullable = true)
 |-- mapping_article_id: string (nullable = true)
 |-- mapping_category: string (nullable = true)
 |-- mapping_tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- mapping_title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- mapping_date: integer (nullable = true)
 |-- img_url: string (nullable = true)
 |-- author: string (nullable = true)
 |-- processed_img: string (nullable = true)
 |-- img_b64_str: string (nullable = true)

-RECORD 0----------------------------------
 mapping_name       | am                   
 hash_id            | 1481b68df1578cd6d... 
 mapping_article_id | 326670634            
 mapping_category   | 冰冰霜淇淋來瘋              
 mapping_tags       | [川川, 川川等於溜, 美食, 小... 
 mapping_title      | 『全家FamilyMart』‐‐ ... 
 url                | http://ctionkuni.... 
 mapping_date       | 20210221             
 img_url            | https://pi

In [15]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from PIL import Image
import base64

In [17]:
# broadcast your model
model = ResNet50()
bc_model_weights = spark.sparkContext.broadcast(model.get_weights())

In [24]:
def preprocess(img_b64_str):
    img_binary = base64.decodestring(img_b64_str)
    img = Image.open(io.BytesIO(img_data)).convert('RGB')
    img = img.resize([224, 224])
    x = np.asarray(img, dtype="float32")
    return preprocess_input(x)


def keras_model_udf(model_fn):
    def predict(image_batch_iter):
        model = model_fn()
        for img_series in image_batch_iter:
            processed_images = np.array([preprocess(img) for img in img_series])
            predictions = model.predict(processed_images, batch_size=64)
            predicted_labels = [x[0] for x in decode_predictions(predictions, top=1)]
            yield pd.DataFrame(predicted_labels)
    return_type = "class: string, desc: string, score:float"
    return pandas_udf(return_type, PandasUDFType.SCALAR_ITER)(predict)  

def resnet50_fn():
    model = ResNet50(weights=None)
    model.set_weights(bc_model_weights.value)
    return model

resnet50_udf = keras_model_udf(resnet50_fn)

AttributeError: type object 'PandasUDFType' has no attribute 'SCALAR_ITER'

In [22]:
# Unfortunely 
# SCALAR_ITER only supported in pyspark 3.0
# https://spark.apache.org/docs/3.0.0-preview/sql-pyspark-pandas-with-arrow.html#scalar-iterator

['GROUPED_MAP',
 'SCALAR',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__']