# Set up

Python library imports:

In [None]:
# File system management
import io

# Data manipulation
import numpy as np
import pandas as pd
from typing import Iterator

# Image manipulation
from PIL import Image

# Tensorflow
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

# Pyspark
from pyspark.ml.feature import PCA as pyPCA
from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.sql import functions as F
# from pyspark.sql import SparkSession

Define work location:

In [None]:
# Root location of the project; the "gs://" scheme points at a cloud storage bucket,
# so these paths are consumed by Spark readers/writers rather than the local filesystem.
PATH_PROJ = "gs://bucket-openclassrooms-p8"


# Input images are read from PATH_DATA; extracted features and PCA output go to PATH_RESULTS
PATH_DATA = PATH_PROJ + "/data/test"
PATH_RESULTS = PATH_PROJ + "/data/results"

# Data processing

## Functions

### MobileNetV2 model

In [None]:
def model_create(show_summary=False):
    """Create a frozen MobileNetV2 feature extractor.

    The ImageNet-pretrained network is loaded *with* its classification head
    (``include_top=True``) and then truncated at its second-to-last layer, so
    the returned model outputs that layer's activations instead of class
    probabilities.

    Args:
        show_summary: If truthy, print the Keras summary of the truncated model.

    Returns:
        Keras ``Model`` taking (224, 224, 3) images, with all layers frozen.
    """
    # Load default model
    model_base = MobileNetV2(weights="imagenet", include_top=True, input_shape=(224, 224, 3))

    # Freeze layers: the network is used for inference only
    for layer in model_base.layers:
        layer.trainable = False

    # Drop the final classification layer by re-wiring the output to layers[-2]
    model_new = Model(inputs=model_base.input, outputs=model_base.layers[-2].output)

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    if show_summary:
        model_new.summary()

    return model_new

### Image preprocessing

In [None]:
def preprocess(content):
    """Decode raw image bytes into an array ready for MobileNetV2.

    Args:
        content: Raw encoded image file contents (bytes), e.g. the ``content``
            column of a Spark ``binaryFile`` DataFrame.

    Returns:
        Numpy array of shape (224, 224, 3), scaled by ``preprocess_input``.
    """
    with Image.open(io.BytesIO(content)) as raw_img:
        # Force 3 channels: grayscale, palette, RGBA or CMYK files would
        # otherwise yield arrays with a different channel count, which breaks
        # both np.stack on the batch and the model's (224, 224, 3) input.
        img = raw_img.convert("RGB").resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)

In [None]:
def featurize_series(model, content_series):
    """Featurize a pd.Series of raw images using the input model.

    Args:
        model: CNN model exposing ``predict`` on a stacked batch of images
        content_series: pd.Series of raw image bytes

    Returns:
        pd.Series of flattened image features (one 1-D array per image);
        an empty pd.Series when ``content_series`` is empty.
    """
    # np.stack raises ValueError on an empty sequence, so short-circuit
    # empty batches instead of failing the whole task.
    if len(content_series) == 0:
        return pd.Series([], dtype=object)

    content_input = np.stack(content_series.map(preprocess))
    preds = model.predict(content_input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)

## Distributed model inference

### Create the Spark session

In [None]:
# Spark session created by cloud notebook: `spark` is not defined in this file,
# it is expected to be injected by the notebook environment.

# Create sparkContext (handle on the underlying SparkContext of the session)
sc = spark.sparkContext

# Set log level: only warnings and errors are reported
sc.setLogLevel("WARN")

# Bare expression so the notebook renders the session as the cell output
spark

### Broadcast the model weights

In [None]:
# Build the model once on the driver (printing its summary), then broadcast
# its weights so every executor can rebuild the same model locally.
driver_model = model_create(show_summary=True)
broadcast_weights = spark.sparkContext.broadcast(driver_model.get_weights())

In [None]:
@F.pandas_udf("array<float>")
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Scalar Iterator pandas UDF that turns image bytes into feature vectors.

    The decorator declares the result as a Spark column of type
    ArrayType(FloatType).

    Args:
        content_series_iter: Iterator over batches of data, where each batch
                            is a pandas Series of image data.

    Yields:
        pd.Series of image features, one per input batch
    """
    # The model is built a single time per UDF invocation and shared by all
    # the batches it receives, so the construction cost is paid once.
    extractor = model_create()
    # Overwrite the freshly loaded weights with the ones broadcast by the driver
    extractor.set_weights(broadcast_weights.value)
    yield from (featurize_series(extractor, batch) for batch in content_series_iter)

### Load the images

In [None]:
# Read every *.jpg found under PATH_DATA (sub-folders included) as binary files
images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_DATA)
)

In [None]:
# Label each image with the name of its parent directory
# (second-to-last element of the '/'-split path)
images = images.withColumn('label', F.element_at(F.split(images['path'], '/'),-2))

# Display schema and first images.
# printSchema() and show() print by themselves and return None, so they are
# called directly: wrapping them in print() emitted an extra "None" line.
images.printSchema()
images.select('path','label').show(5, False)
print("Number of images loaded : ", images.count())

In [None]:
# Select sample of dataset
# NOTE: sample() draws rows with the given probability, so the resulting
# count is only approximately SELECT_RATIO of the dataset; the seed makes
# the draw reproducible.
SELECT_RATIO = 0.1
images = images.sample(fraction=SELECT_RATIO, seed=42)
# {0} is the ratio, {1} the sampled row count (the original reused {0} and
# therefore printed the ratio twice instead of the number of images).
print("Select {0:.0%} of images : {1}".format(SELECT_RATIO, images.count()))

### Run the model inference

In [None]:
# Create the image features: keep path and label, run the model on the raw bytes
features_df = images.select(
    F.col("path"),
    F.col("label"),
    featurize_udf("content").alias('features'),
)

# Create the vectors: pyspark.ml estimators need a Vector column, not array<float>
features_df = features_df.withColumn('features_vec', array_to_vector("features"))

# show() and printSchema() print by themselves and return None; wrapping them
# in display() only rendered an extra "None".
features_df.show(5)
features_df.printSchema()

# Dimension reduction

In [None]:
# Number of principal components kept by the PCA
# NOTE(review): 166 is presumably taken from an explained-variance analysis
# that is not shown here — confirm before changing.
PCA_K = 166

In [None]:
# Create pyspark PCA model projecting 'features_vec' onto PCA_K components
pca = pyPCA(k=PCA_K, inputCol='features_vec', outputCol='features_pca')

# Fit model
pca_model = pca.fit(features_df)

# Transform data: adds the 'features_pca' column
features_df = pca_model.transform(features_df)

# show() and printSchema() print by themselves and return None; wrapping them
# in display() only rendered an extra "None".
features_df.show(5)
features_df.printSchema()

# Export results

In [None]:
# Persist the full result DataFrame as parquet, replacing any previous export
features_output_path = PATH_RESULTS + "/Features_output"
features_df.write.mode("overwrite").parquet(features_output_path)

In [None]:
# Export only the PCA projection, converted from ML vectors to plain arrays,
# through a single partition so the output holds one JSON part file.
pca_arrays = features_df.select(vector_to_array('features_pca').alias('features_pca'))
pca_arrays.repartition(1).write.mode("overwrite").json(PATH_RESULTS + "/PCA_output")