In [None]:
# Start spark application

# Set up

Python librairies imports :

In [39]:
# File system management
import os
from pathlib import Path
import io

# Data manipulation
import numpy as np
import pandas as pd

# Image manipulation
from PIL import Image

# Tensorflow
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model

# Pyspark
from pyspark.ml.feature import PCA as pyPCA
from pyspark.ml.functions import array_to_vector
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

Define work location :

In [4]:
# Bucket path
PATH_PROJ = 'gs://bucket-openclassrooms-p8'

# Define images paths
PATH_DATA = PATH_PROJ + '/data/training'
PATH_RESULTS = PATH_PROJ + '/data/results'

# Change directory
os.chdir(PATH_PROJ)

Check the **Spark** session :

In [None]:
# Spark session created by pyspark kernel
spark

In [None]:
# Create sparkContext
sc = spark.sparkContext

# Data processing

### Load images

In [9]:
# Load all images
images = spark.read.format("binaryFile").option("pathGlobFilter", "*.jpg").option("recursiveFileLookup", "true").load(PATH_DATA)

In [10]:
# Display first images
images = images.withColumn('label', F.element_at(F.split(images['path'], '/'),-2))
print(images.printSchema())
print(images.select('path','label').show(5,False))

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None


[Stage 0:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------------------------+-----------+
|path                                                                                 |label      |
+-------------------------------------------------------------------------------------+-----------+
|file:/Users/victor/Documents/OPENCLASSROOMS/projet_8/data/test/apple_hit_1/r0_115.jpg|apple_hit_1|
|file:/Users/victor/Documents/OPENCLASSROOMS/projet_8/data/test/apple_hit_1/r0_119.jpg|apple_hit_1|
|file:/Users/victor/Documents/OPENCLASSROOMS/projet_8/data/test/apple_hit_1/r0_107.jpg|apple_hit_1|
|file:/Users/victor/Documents/OPENCLASSROOMS/projet_8/data/test/apple_hit_1/r0_143.jpg|apple_hit_1|
|file:/Users/victor/Documents/OPENCLASSROOMS/projet_8/data/test/apple_hit_1/r0_111.jpg|apple_hit_1|
+-------------------------------------------------------------------------------------+-----------+
only showing top 5 rows

None


                                                                                

### MobileNetV2 model

* Create a MobileNetV2 model with top layer removed and broadcasted pretrained weights.

In [11]:
# Load default model
mn_default = MobileNetV2(weights='imagenet', include_top=True, input_shape=(224, 224, 3))

# Create model without top layer
mn_custom = Model(inputs=mn_default.input, outputs=mn_default.layers[-2].output)
display(mn_custom.summary())

# Create broadcast weights 
brodcast_weights = sc.broadcast(mn_custom.get_weights())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                             

None

In [12]:
def model_fn():
    """ Returns a MobileNetV2 model with top layer removed 
        and broadcasted pretrained weights.

    Returns:
        MobileNetV2 model
    """
    # Load default model
    mn = MobileNetV2(weights='imagenet', include_top=True, input_shape=(224, 224, 3))

    # Customize layers
    for layer in mn.layers:
        layer.trainable = False
    
    # Create model without top layer
    model = Model(inputs=mn.input, outputs=mn.layers[-2].output)

    # Broadcast weights to workers
    model.set_weights(brodcast_weights.value)

    return model

### Images features

* Preprocess and featurize the images with a MobileNetV2 model.

In [13]:
def preprocess(content):
    """Preprocesses raw image bytes.

    Args:
        content: PIL Image

    Returns:
        Numpy array
    """
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    arr = img_to_array(img)
    return preprocess_input(arr)

In [40]:
def featurize_series(model, content_series):
    """Featurize a pd.Series of raw images using the input model.

    Args:
        model: CNN model
        content_series: pd.Series of image data

    Returns:
        pd.Series of image features
    """
    input = np.stack(content_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)

In [44]:
@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    """ This method is a Scalar Iterator pandas UDF wrapping our featurization function.
        The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).

    Args:
        content_series_iter: Iterator over batches of data, where each batch
                             is a pandas Series of image data.

    Yields:
        pd.Series of image features
    """
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)



In [23]:
f_udf = F.udf(lambda r: Vectors.dense(r), VectorUDT())

In [17]:
# Select 10% of dataset
images_sample = images.sample(fraction=0.1, seed=42)
display(images_sample.show(5))
display(images_sample.printSchema())

+--------------------+-------------------+------+--------------------+-----------+
|                path|   modificationTime|length|             content|      label|
+--------------------+-------------------+------+--------------------+-----------+
|file:/Users/victo...|2021-09-12 19:23:28|121883|[FF D8 FF E0 00 1...|apple_hit_1|
|file:/Users/victo...|2021-09-12 19:23:28|117940|[FF D8 FF E0 00 1...|apple_hit_1|
|file:/Users/victo...|2021-09-12 19:23:28|117804|[FF D8 FF E0 00 1...|apple_hit_1|
|file:/Users/victo...|2021-09-12 19:23:30|112281|[FF D8 FF E0 00 1...|apple_hit_1|
|file:/Users/victo...|2021-09-12 19:23:28|110149|[FF D8 FF E0 00 1...|apple_hit_1|
+--------------------+-------------------+------+--------------------+-----------+
only showing top 5 rows



None

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)



None

In [50]:
# Create the image features
features_df = images_sample.repartition(20).select(F.col("path"), F.col("label"),
                                                   featurize_udf("content").alias('features'))

# Create the vectors
features_df = features_df.withColumn('features_vec', array_to_vector("features"))

display(features_df.show(5))
display(features_df.printSchema())

[Stage 39:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|                path|               label|            features|        features_vec|
+--------------------+--------------------+--------------------+--------------------+
|file:/Users/victo...|apple_crimson_snow_1|[0.020554474, 0.0...|[0.02055447362363...|
|file:/Users/victo...|apple_crimson_snow_1|[0.010522083, 0.2...|[0.01052208337932...|
|file:/Users/victo...|          cucumber_1|[1.9151659, 0.057...|[1.91516590118408...|
|file:/Users/victo...|          cucumber_1|[1.0218675, 0.045...|[1.02186751365661...|
|file:/Users/victo...|         apple_red_1|[1.2343181, 0.0, ...|[1.23431813716888...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

None

root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- features_vec: vector (nullable = true)



None

In [None]:
# Save results as parquet files
features_df.write.mode("overwrite").parquet(PATH_RESULTS)

# Dimension reduction

In [51]:
# Create pyspark PCA model
pca_model = pyPCA(k=34, inputCol='features_vec', outputCol='features_pca').fit(features_df)

# Create PCA dataframe
pca_df = pca_model.transform(features_df)

In [53]:
pca_data = pca_model.fit(features_df)

2023-03-06 12:44:57.254417: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-06 12:45:00.811443: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-06 12:45:04.312903: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-06 12:45:34.759703: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


23/03/06 12:45:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/03/06 12:45:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


[Stage 52:>                                                         (0 + 4) / 4]

23/03/06 12:47:16 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Connecting to /172.20.10.6:53670 failed in the last 4750 ms, fail this connection directly
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:214)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:173)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:152)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteManagedBuffer(BlockManager.scala:1159)
	at org.apache.spark.storage.BlockMana

[Stage 52:>                                                         (0 + 4) / 4]

23/03/06 12:48:44 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks (after 2 retries)
java.io.IOException: Connecting to /172.20.10.6:53670 failed in the last 4750 ms, fail this connection directly
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:214)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:173)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:206)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624

[Stage 52:>                                                         (0 + 4) / 4]

23/03/06 12:50:04 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks (after 3 retries)
java.io.IOException: Connecting to /172.20.10.6:53670 failed in the last 4750 ms, fail this connection directly
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:214)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:173)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:206)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624

Py4JJavaError: An error occurred while calling o302.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 52.0 failed 1 times, most recent failure: Lost task 0.0 in stage 52.0 (TID 1180) (172.20.10.6 executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2333)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1174)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1168)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1267)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1228)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1214)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1214)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeDenseVectorCovariance(RowMatrix.scala:171)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeCovariance(RowMatrix.scala:463)
	at org.apache.spark.mllib.linalg.distributed.RowMatrix.computePrincipalComponentsAndExplainedVariance(RowMatrix.scala:499)
	at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:65)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:93)
	at org.apache.spark.ml.feature.PCA.fit(PCA.scala:64)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
