In [1]:
import os
import numpy as np
import tensorflow as tf
from keras import initializers
#tf.compat.v1.disable_v2_behavior()

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, ArrayType, FloatType
from tensorboard.notebook import display
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
import time
import pandas as pd
from PIL import Image
import uuid
from tensorflow.keras.applications.resnet50 import ResNet50
from pyspark.sql.functions import  col, pandas_udf, PandasUDFType
#from systemml.mllearn.estimators import Keras2DML
sess = tf.compat.v1.InteractiveSession()

Using TensorFlow backend.


In [None]:
from pyspark.sql import SparkSession
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" # Must corrispond to the current jdk used by colab
os.environ["SPARK_HOME"] = "/opt/spark/" # Must corrispond with the downloaded spark (1st line)
spark = SparkSession.builder.master("spark://192.168.1.38:7077").appName("testTrain")\
    .config("spark.driver.memory" , "2g").\
    config("spark.executor.memory" , "2g").\
    enableHiveSupport().getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("Error")

In [4]:
def get_model():
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten, BatchNormalization
    model = Sequential()

    model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=(320, 320, 1), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=(320, 320, 1), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))

    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))

    model.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))

    model.add(Flatten())
    model.add(Dense(128, activation='relu',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.add(Dropout(0.2))

    model.add(Dense(1, activation='sigmoid',kernel_initializer=initializers.RandomNormal(stddev=0.01),bias_initializer=initializers.Zeros()))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [5]:
model = get_model()

In [6]:
bc_model_weights = sc.broadcast(model.get_weights())

In [33]:
def fullpath(path, files):
    return  [(lambda x: path + x)(x) for x in files]
val_dir = "../chest_xray/chest_xray/val"
filesPneumo = fullpath(val_dir+'/PNEUMONIA/',os.listdir(os.path.join(val_dir, 'PNEUMONIA')))
del filesPneumo[0]
labelsPneumo = np.zeros(len(filesPneumo))
filesNormal = fullpath(val_dir+'/NORMAL/',os.listdir(os.path.join(val_dir, 'NORMAL')))
del filesNormal[0]
labelsNormal = np.ones(len(filesNormal))
fileData = filesPneumo + filesNormal
fileLabels = np.concatenate((labelsPneumo,labelsNormal),axis=0).astype(np.float32)

#files.

In [7]:
file_name = "image_dataTrain.parquet"
dbfs_file_path = "../chest_xray/chest_xray/dbfs/"

In [32]:
image_data = []
for (file,label) in zip(fileData, fileLabels):
  img = Image.open(file)
  img = img.resize([320,320])
  data = np.asarray( img, dtype="float32" ).reshape([320*320*1])

  image_data.append({"data": data, "label": label})

pandas_df = pd.DataFrame(image_data, columns = ['data', 'label'])
pandas_df.to_parquet(file_name)
#os.makedirs(dbfs_file_path)
shutil.copyfile(file_name, dbfs_file_path+file_name)

'../chest_xray/chest_xray/dbfs/image_dataTrain.parquet'

In [19]:
del df

In [8]:
df = spark.read.parquet(file_name)
print(df.count())

16


In [7]:
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")
assert len(df.head()) > 0, "`df` should not be empty"



In [9]:
@pandas_udf(ArrayType(FloatType()), PandasUDFType.SCALAR_ITER)
def predict_test(image_batch_iter):
    def parse_image(image_data):
        image = tf.reshape(image_data,[320,320,1])
        return image
    batch_size = 1
    model = get_model()
    model.set_weights(bc_model_weights.value)
    for image_batch in image_batch_iter:
        images = np.vstack(image_batch)
        dataset = tf.data.Dataset.from_tensor_slices(images)
        dataset = dataset.map(parse_image, num_parallel_calls=8).prefetch(5000).batch(batch_size)
        preds = model.predict(dataset)
        yield pd.Series(list(preds))




In [73]:
@pandas_udf(ArrayType(FloatType()), PandasUDFType.SCALAR_ITER)
def predict_batch_udf(image_batch_iter):
  def parse_image(image_data):
      image = tf.reshape(image_data,[320,320,1])
      return image
  batch_size = 1
  model = get_model()
  model.set_weights(bc_model_weights.value)
  for image_batch in image_batch_iter:
    images = np.vstack(image_batch)
    dataset = tf.data.Dataset.from_tensor_slices(images)
    dataset = dataset.map(parse_image, num_parallel_calls=8).prefetch(5000).batch(batch_size)
    preds = model.predict(dataset)
    yield pd.Series(list(preds))



In [77]:
def train(datasetDF, epochs=5, batch_size=1, lr=0.001):
    def np2Img(arrList):
        def parse_image(image_data):
            image = image_data.reshape([320,320,1])
            return image
        conv = []
        arrList = arrList.flatten()
        for i,el in enumerate(arrList):
            conv.append(parse_image(np.asarray(el)))
        return np.asarray(conv)
    def lists2nparray(arrList, labels=False):
        arrList = arrList.flatten()
        conv = np.zeros(len(arrList))
        for i,el in enumerate(arrList):
            if labels:
                conv[i] = el
            else:
                conv[i] = el[0]
        return tf.convert_to_tensor(conv.reshape(-1,1))
    import tensorflow as tf
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction=0.6
    tf.compat.v1.disable_eager_execution()
    sess = tf.compat.v1.InteractiveSession(config=config)
    model = get_model()

    bce = tf.keras.losses.BinaryCrossentropy()
    bc_model_weights = sc.broadcast(model.get_weights())
    weights = model.trainable_weights
    data = datasetDF[["data"]]
    labels = datasetDF[["label"]]
    for e in range(epochs):
        print("Epoch: %d" %e)
        predictions_df = data.select(predict_batch_udf(col("data")).alias("prediction"))

        #loss = bce(lists2nparray(labels.toPandas().to_numpy(), labels=True), lists2nparray(predictions_df.toPandas().to_numpy()) )
        #print(loss.eval())
        gradients = tf.keras.backend.gradients(lists2nparray(predictions_df.toPandas().to_numpy()), model.trainable_weights)
        evaluated_gradients = sess.run(gradients, feed_dict={model.input: np2Img(data.select(col("data")).toPandas().to_numpy())})
        # For every trainable layer in the network
        for i in range(len(model.trainable_weights)):
            layer = model.trainable_weights[i]  # Select the layer
            sess.run(tf.compat.v1.assign_sub(layer, lr * evaluated_gradients[i]))
        bc_model_weights = sc.broadcast(model.get_weights())


In [78]:
train(df)

Epoch: 0


TypeError: Fetch argument None has invalid type <class 'NoneType'>

In [10]:
d = df[["data"]]
#predictions_df = d.select(predict_batch_udf(col("data")).alias("prediction"))
x = d.toPandas().to_numpy()

In [11]:
predictions_df = df.select(predict_test(col("data")).alias("prediction"))
predictions_df.show()

+------------+
|  prediction|
+------------+
| [0.5000368]|
| [0.5000373]|
| [0.5000471]|
|   [0.50004]|
|   [0.50005]|
|[0.50005245]|
|[0.50005275]|
| [0.5000449]|
| [0.5000544]|
|[0.50005734]|
| [0.5000564]|
| [0.5000378]|
| [0.5000452]|
| [0.5000574]|
|[0.50004077]|
|[0.50004494]|
+------------+



In [86]:
def convert_to_tf(input_df):
    x = []
    def parse_image(image_data):
      image = tf.reshape(image_data,[320,320,1])
      return image
    input = input_df.toPandas().to_numpy()
    for image_batch in input:
        for img in image_batch:
            x.append(parse_image(img))
    return x

In [12]:
original_lab = df[["label"]]
original_lab.show()

+-----+
|label|
+-----+
|  0.0|
|  0.0|
|  0.0|
|  0.0|
|  0.0|
|  0.0|
|  0.0|
|  0.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
+-----+



In [13]:
bce = tf.keras.losses.BinaryCrossentropy()
preds = predictions_df.toPandas().to_numpy()
preds = [item for sublist in preds for item in sublist]
y_true = original_lab.toPandas().to_numpy()
y_pred = preds
loss = bce(y_true, y_pred)

In [20]:
loss.eval()

ValueError: Cannot evaluate tensor using `eval()`: No default session is registered. Use `with sess.as_default()` or pass an explicit session to `eval(session=sess)`

In [None]:
l = original_lab.toPandas().to_numpy()
#preds = predictions_df.toPandas().to_numpy()
#preds = [item for sublist in preds for item in sublist]
#np.asarray(preds).flatten()
l

In [39]:
preds

[[0.7917439341545105],
 [0.5353220701217651],
 [0.8957020044326782],
 [0.7357274293899536],
 [0.8170340061187744],
 [0.898546576499939],
 [0.8372771739959717],
 [0.7673894166946411],
 [0.896142303943634],
 [0.9164182543754578],
 [0.9064261317253113],
 [0.7997414469718933],
 [0.9225687980651855],
 [0.8096106052398682],
 [0.866844892501831],
 [0.7499334812164307]]

In [None]:
from d2l import mxnet as d2l
from mxnet import autograd, gluon, init, np, npx

with sess.as_default():
    def loss_fun(y_true, y_pred):
        bce = tf.keras.losses.BinaryCrossentropy()
        l = bce(y_true, y_pred)
        return tf.reduce_mean(l)
    def l():
        return loss_fun(y_true, y_pred)
    model = get_model()
    last_layer = model.layers[-1]
    #l = len(model.layers)
    trainable = np.array(last_layer.trainable_weights[0].numpy())
    trainable.attach_grad()
    obj = loss_fun(y_true, y_pred)
    train = model.optimizer.minimize(l, tf.convert_to_tensor(trainable))

Exception ignored in: <bound method NDArrayBase.__del__ of array([0.01102832])>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/mxnet/_ctypes/ndarray.py", line 58, in __del__
    check_call(_LIB.MXNDArrayFree(self.handle))
KeyboardInterrupt
Exception ignored in: <bound method NDArrayBase.__del__ of array([0.01102832])>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/mxnet/_ctypes/ndarray.py", line 58, in __del__
    check_call(_LIB.MXNDArrayFree(self.handle))
KeyboardInterrupt
Exception ignored in: <bound method NDArrayBase.__del__ of array([0.01102832])>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/mxnet/_ctypes/ndarray.py", line 58, in __del__
    check_call(_LIB.MXNDArrayFree(self.handle))
KeyboardInterrupt


In [24]:
def get_loss():
    return tf.constant(ev)

In [19]:
layer = model.layers[-1]

In [20]:
w = layer.trainable_weights

In [24]:
w[0].numpy()

NotImplementedError: numpy() is only available when eager execution is enabled.

In [30]:
obj = loss_fun(y_true, y_pred)

In [35]:
opt = tf.compat.v1.train.AdamOptimizer(0.01)
train = opt.minimize(obj)

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'conv2d/kernel:0' shape=(3, 3, 1, 32) dtype=float32>", "<tf.Variable 'conv2d/bias:0' shape=(32,) dtype=float32>", "<tf.Variable 'batch_normalization/gamma:0' shape=(32,) dtype=float32>", "<tf.Variable 'batch_normalization/beta:0' shape=(32,) dtype=float32>", "<tf.Variable 'conv2d_1_1/kernel:0' shape=(3, 3, 32, 32) dtype=float32>", "<tf.Variable 'conv2d_1_1/bias:0' shape=(32,) dtype=float32>", "<tf.Variable 'batch_normalization_1/gamma:0' shape=(32,) dtype=float32>", "<tf.Variable 'batch_normalization_1/beta:0' shape=(32,) dtype=float32>", "<tf.Variable 'conv2d_2/kernel:0' shape=(3, 3, 32, 64) dtype=float32>", "<tf.Variable 'conv2d_2/bias:0' shape=(64,) dtype=float32>", "<tf.Variable 'batch_normalization_2/gamma:0' shape=(64,) dtype=float32>", "<tf.Variable 'batch_normalization_2/beta:0' shape=(64,) dtype=float32>", "<tf.Variable 'conv2d_3/kernel:0' shape=(3, 3, 64, 64) dtype=float32>", "<tf.Variable 'conv2d_3/bias:0' shape=(64,) dtype=float32>", "<tf.Variable 'batch_normalization_3/gamma:0' shape=(64,) dtype=float32>", "<tf.Variable 'batch_normalization_3/beta:0' shape=(64,) dtype=float32>", "<tf.Variable 'conv2d_4/kernel:0' shape=(3, 3, 64, 128) dtype=float32>", "<tf.Variable 'conv2d_4/bias:0' shape=(128,) dtype=float32>", "<tf.Variable 'batch_normalization_4/gamma:0' shape=(128,) dtype=float32>", "<tf.Variable 'batch_normalization_4/beta:0' shape=(128,) dtype=float32>", "<tf.Variable 'conv2d_5/kernel:0' shape=(3, 3, 128, 128) dtype=float32>", "<tf.Variable 'conv2d_5/bias:0' shape=(128,) dtype=float32>", "<tf.Variable 'batch_normalization_5/gamma:0' shape=(128,) dtype=float32>", "<tf.Variable 'batch_normalization_5/beta:0' shape=(128,) dtype=float32>", "<tf.Variable 'dense/kernel:0' shape=(165888, 128) dtype=float32>", "<tf.Variable 'dense/bias:0' shape=(128,) dtype=float32>", "<tf.Variable 'dense_1/kernel:0' shape=(128, 1) dtype=float32>", "<tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32>"] and loss Tensor("Mean:0", shape=(), dtype=float32).

In [18]:
with sess.as_default():
    vars = tf.convert_to_tensor(model.trainable_weights)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    gradients = opt.minimize(loss=loss, var_list=vars )
gradients

TypeError: 'tensorflow.python.framework.ops.EagerTensor' object is not callable

In [19]:
opt = tf.compat.v1.train.AdamOptimizer()
with sess.as_default():
    x1_var = tf.Variable([1, 2, 3], dtype=tf.float32)
    x2_var = tf.Variable([3, 4, 5], dtype=tf.float32)
    combined_op = tf.concat(0, [x1_var, x2_var])
    grad_op = opt.compute_gradients(combined_op)

ValueError: Shape (2, 3) must have rank 0

In [None]:
input_data = convert_to_tf(df[["data"]])
x_tensor = tf.convert_to_tensor(input_data, dtype=tf.float32)
with tf.GradientTape() as t:
    t.watch(x_tensor)
    output = model(x_tensor)

result = output
gradients = tf.gradients(output, x_tensor)

In [92]:
v = gradients[0]

In [34]:
X_train = fileData
y_train = fileLabels

In [7]:
epochs = 5
batch_size = 1
samples = 16
#max_iter = int(epochs*math.ceil(samples/batch_size))
sysml_model = Keras2DML(spark, keras_model=model, input_shape=(1,320,320), weights='weights_dir', batch_size=batch_size,  test_interval=0, display=10)
sysml_model.fit(X_train, y_train)

TypeError: 'InputLayer' object is not iterable