In [None]:
import re
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, StringType

from bigdl.nn.criterion import CrossEntropyCriterion
from zoo.common.nncontext import init_nncontext
from zoo.feature.image import RowToImageFeature, ImageResize, ImageCenterCrop, ImageChannelNormalize, ImageMatToTensor, ImageFeatureToTensor

from zoo.pipeline.api.keras.layers import Dense, Input, Flatten
from zoo.pipeline.api.keras.models import Model
from zoo.pipeline.api.net import Net
from zoo.pipeline.nnframes import NNImageReader, ChainedPreprocessing, NNClassifier

In [None]:
import matplotlib.pyplot as plt

In [None]:
sc = init_nncontext("ImageTransferLearningExample")

In [None]:
sc

Download the pretrained Inception v1 model (plus the cat-dog sample archive) and load the model:

In [None]:
!wget 'https://s3-ap-southeast-1.amazonaws.com/bigdl-models/imageclassification/imagenet/bigdl_inception-v1_imagenet_0.4.0.model'

In [None]:
!wget 'https://s3.amazonaws.com/elephantscale-public/bigdl/cat-dog.zip'

In [None]:
!unzip -o cat-dog.zip

In [None]:
model_path = 'bigdl_inception-v1_imagenet_0.4.0.model' 
full_model = Net.load_bigdl(model_path)

In [None]:
full_model

## Get Flowers

In [None]:
!curl -LO http://download.tensorflow.org/example_images/flower_photos.tgz
!tar xzf flower_photos.tgz
!rm flower_photos/LICENSE.txt

In [None]:
!ls flower_photos

In [None]:
!ls flower_photos/daisy | head -4
!ls flower_photos/dandelion | head -4

In [None]:
from PIL import Image

In [None]:
plt.imshow(Image.open('flower_photos/daisy/10140303196_b88d3d6cec.jpg'))

In [None]:
plt.imshow(Image.open('flower_photos/dandelion/10437652486_aa86c14985.jpg'))

## Read Labels

In [None]:
def flowers(image_path='flower_photos'):
    """Index a directory of per-class image folders.

    Every sub-directory of ``image_path`` is treated as one class. Classes are
    numbered 1.0, 2.0, ... in alphabetical order of the directory names, so the
    mapping is identical on every run.

    Args:
        image_path: Root directory holding one sub-directory per class.
            Defaults to ``'flower_photos'``.

    Returns:
        A ``(classes, file_name_class)`` tuple. ``classes`` maps each class
        directory name to its float label; ``file_name_class`` maps each file
        name (without its directory) to the label of the class it was found in.
    """
    # Imported here so the function works on a fresh kernel, where no earlier
    # cell has imported ``os`` yet.
    import os

    classes = {}
    file_name_class = {}

    # os.listdir() returns entries in arbitrary order; sort for stable labels.
    for dir_name in sorted(os.listdir(image_path)):
        class_dir = os.path.join(image_path, dir_name)
        if not os.path.isdir(class_dir):
            continue  # skip stray files sitting next to the class folders
        print(dir_name)
        label = classes.setdefault(dir_name, float(len(classes) + 1))
        for file_name in os.listdir(class_dir):
            # Lookups are by bare file name, so a name reused across classes
            # keeps only the label of the class processed last.
            if file_name in file_name_class:
                print('Duplicate file name', file_name)
            file_name_class[file_name] = label

    return classes, file_name_class

classes, train_image_to_class = flowers()

In [None]:
classes

## Read files

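To consume less memory and shorten training, we'll only use a subsample: images whose file names start with 5 or 6 go into the training set, and those starting with 7 or 8 go into the test set.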

In [None]:
classes.keys()

In [None]:
import glob
import os
import shutil
import subprocess  # not needed by this cell any more; kept in case other cells rely on it

# Destination roots for the sampled images (one sub-directory per class below each).
sampled_train = 'sampled/train/'
sampled_test = 'sampled/test/'


def _copy_matching(pattern, destination):
    """Copy every file matching the glob ``pattern`` into directory ``destination``."""
    if not os.path.exists(destination):
        os.makedirs(destination)
    matches = glob.glob(pattern)
    if not matches:
        # Report an empty match instead of silently copying nothing.
        print('No files match', pattern)
    for path in matches:
        shutil.copy(path, destination)


# Remove any previous sample so re-running the cell starts from scratch.
shutil.rmtree('sampled', ignore_errors=True)

for class_name in classes.keys():
    class_dir = os.path.join('flower_photos', class_name)
    # File names starting with 5 or 6 form the training sample,
    # those starting with 7 or 8 form the test sample.
    _copy_matching(os.path.join(class_dir, '[56]*.jpg'), sampled_train + class_name)
    _copy_matching(os.path.join(class_dir, '[78]*.jpg'), sampled_test + class_name)

In [None]:
!ls -1 sampled/train/roses | wc -l
!ls -1 sampled/train/daisy | wc -l
!ls -1 sampled/train/sunflowers | wc -l
!ls -1 sampled/train/dandelion | wc -l
!ls -1 sampled/train/tulips | wc -l

!ls -1 sampled/test/roses | wc -l
!ls -1 sampled/test/daisy | wc -l
!ls -1 sampled/test/sunflowers | wc -l
!ls -1 sampled/test/dandelion | wc -l
!ls -1 sampled/test/tulips | wc -l

In [None]:
# Read the sampled training images, spread them over 32 partitions and cache them.
train_images = NNImageReader.readImages('sampled/train/*/*', sc)
trainingDF = train_images.repartition(32).cache()

train_partitions = trainingDF.rdd.getNumPartitions()
train_image_count = trainingDF.count()
print("partition number: ", train_partitions)
print("image number: ", train_image_count)

In [None]:
# os.listdir(data_path + '/train_img')

In [None]:
trainingDF

In [None]:
trainingDF.show(10)

#### Create array of labels

In [None]:
def _image_file_name(image_row):
    """Return the last '/'-separated component of the first field of an image row."""
    return image_row[0].split('/')[-1]


def _label_for(name):
    """Look up the class label recorded for a file name."""
    return train_image_to_class[name]


getName = udf(_image_file_name)
getLabel = udf(_label_for, DoubleType())

# Add the file name and its label as columns, then cache the labelled frame.
trainingDF = (
    trainingDF
    .withColumn("name", getName(col("image")))
    .withColumn("label", getLabel(col('name')))
    .cache()
)

In [None]:
trainingDF.rdd.getNumPartitions()

In [None]:
trainingDF.show(10)

In [None]:
# (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])

In [None]:
trainingDF.count()

In [None]:
# validationDF.count()

Compose a pipeline

In [None]:
# Image preprocessing steps, chained in the order listed:
# resize to 256x256, centre-crop to 224x224, normalize the channels, convert to tensor.
preprocessing_steps = [
    RowToImageFeature(),
    ImageResize(256, 256),
    ImageCenterCrop(224, 224),
    ImageChannelNormalize(123.0, 117.0, 104.0),
    ImageMatToTensor(),
    ImageFeatureToTensor(),
]
transformer = ChainedPreprocessing(preprocessing_steps)

List the layers of the pretrained model, then build a new graph that ends at `pool5/drop_7x7_s1`:

In [None]:
for layer in full_model.layers:
    print (layer.name())

In [None]:
model = full_model.new_graph(["pool5/drop_7x7_s1"])

In [None]:
for layer in model.layers:
    print(layer)

In [None]:
inputNode = Input(name="input", shape=(3, 224, 224))

In [None]:
# Training hyperparameters, passed to the NNClassifier built further down.
EPOCHS = 5  # handed to setMaxEpoch
LEARNING_RATE = 0.001  # handed to setLearningRate
BATCH_SIZE = 16  # handed to setBatchSize

In [None]:
# Convert the truncated pretrained graph to a Keras-style layer and apply it to the input node.
inception = model.to_keras()(inputNode)
# Flatten the pretrained output so it can feed a dense layer.
flatten = Flatten()(inception)
# One output unit per flower class; no activation is specified on this layer.
logits = Dense(len(classes))(flatten)
lrModel = Model(inputNode, logits)
# The classifier receives the model, the loss criterion and the image
# preprocessing chain, and reads its features from the "image" column.
# NOTE(review): setCachingSample(False) presumably stops preprocessed samples
# from being cached in memory during training - confirm against the zoo docs.
classifier = NNClassifier(lrModel, CrossEntropyCriterion(), transformer) \
    .setLearningRate(LEARNING_RATE) \
    .setBatchSize(BATCH_SIZE) \
    .setMaxEpoch(EPOCHS) \
    .setFeaturesCol("image") \
    .setCachingSample(False)
# Single-stage Spark ML pipeline containing only the classifier.
pipeline = Pipeline(stages=[classifier])

In [None]:
flower_model = pipeline.fit(trainingDF)

In [None]:
# Release the cached training data now that the model has been fitted.
# DataFrame.drop() with no column names only returns a new DataFrame and frees
# nothing; unpersist() is the call that evicts the cached data.
if trainingDF is not None:  # guard so the cell can be re-run safely
    trainingDF.unpersist()
trainingDF = None

## Predict Flowers

In [None]:
# Read the sampled test images, spread them over 32 partitions and cache them.
test_images = NNImageReader.readImages('sampled/test/*/*', sc)
validationDF = test_images.repartition(32).cache()

test_partitions = validationDF.rdd.getNumPartitions()
test_image_count = validationDF.count()
print("partition number: ", test_partitions)
print("image number: ", test_image_count)

In [None]:
# Attach the file name and label columns to the test images,
# using the same UDFs that labelled the training images.
validation_named = validationDF.withColumn("name", getName(col("image")))
validationDF = validation_named.withColumn("label", getLabel(col('name')))

In [None]:
predDF = flower_model.transform(validationDF).cache()

In [None]:
predDF.show(20)

### Calculate accuracy

In [None]:
results = predDF.select("name", "label", "prediction")

In [None]:
# predDF.unpersist()
# predDF.drop()

In [None]:
correct = results.filter("label = prediction").count()

In [None]:
# Accuracy = rows where label equals prediction / all predicted rows.
total = predDF.count()
accuracy = float(correct) / total
print(accuracy)

In [None]:
predDF.show()

Create Linear Classifier

In [None]:
# from bigdl.optim.optimizer import Optimizer, Adam, MaxEpoch, EveryEpoch, Top1Accuracy, \
#     TrainSummary, ValidationSummary, SeveralIteration, SGD

# from bigdl.nn.layer import Sequential, Linear, LogSoftMax

# from bigdl.nn.criterion import ClassNLLCriterion

# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# lrModel = Sequential().add(Linear(1000, len(classes))).add(LogSoftMax())

In [None]:
# classifier = NNClassifier(lrModel, ClassNLLCriterion(), [1000]) \
#         .setOptimMethod(SGD(nesterov=True, momentum=0.9, dampening=0.0)) \
#         .setLearningRate(LEARNING_RATE) \
#         .setBatchSize(BATCH_SIZE) \
#         .setMaxEpoch(EPOCHS) \
#         .setFeaturesCol("embedding")

In [None]:
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="label", 
#     predictionCol="prediction", 
#     metricName="accuracy")

In [None]:
# pipeline = Pipeline(stages=[transformer, preTrainedNNModel, classifier])
# pipeline = Pipeline(stages=[transformer, preTrainedNNModel, classifier])

In [None]:
# grocery_model = classifier.fit(trainingEmbedDF)

In [None]:
# trainPredictDF = grocery_model.transform(trainingDF)
# evaluator.evaluate(trainPredictDF)

In [None]:
# validationEmbedDF = preTrainedNNModel.transform(transformer.transform(validationDF))
# predictionDF = grocery_model.transform(validationEmbedDF).cache()
# predictionDF.show()

In [None]:
# evaluator.evaluate(predictionDF)