In [1]:
import os
import numpy as np
import tensorflow as tf
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, ArrayType, FloatType
from tensorboard.notebook import display
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
import time
import pandas as pd
from PIL import Image
import uuid
from tensorflow.keras.applications.resnet50 import ResNet50
from pyspark.sql.functions import col, pandas_udf, PandasUDFType

In [2]:
from pyspark.sql import SparkSession
# JAVA_HOME must correspond to the JDK currently used by Colab.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# SPARK_HOME must correspond to the downloaded Spark distribution.
os.environ["SPARK_HOME"] = "/opt/spark/"

# Build (or reuse) a Hive-enabled session against the standalone master,
# giving the driver and each executor 2g of memory.
spark = (
    SparkSession.builder
    .master("spark://192.168.1.38:7077")
    .appName("testTrain")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

In [4]:
def get_model():
    """Build and compile the CNN used as a binary classifier.

    Architecture: three stages of Conv2D -> BatchNormalization -> Conv2D ->
    BatchNormalization -> MaxPool2D with 32, 64 and 128 filters, followed by
    Flatten, Dense(128, relu), Dropout(0.2) and a single sigmoid output unit.
    The model is compiled with binary cross-entropy, the Adam optimizer and
    the accuracy metric.

    Returns:
        A compiled ``Sequential`` model expecting inputs of shape (320, 320, 1).
    """
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten, BatchNormalization

    model = Sequential()

    for stage, n_filters in enumerate((32, 64, 128)):
        # Both convolutions of the first stage are given input_shape, exactly
        # as before; later stages do not receive it.
        # NOTE(review): input_shape on the second convolution looks redundant - confirm.
        shape_kwargs = {"input_shape": (320, 320, 1)} if stage == 0 else {}
        for _ in range(2):
            model.add(Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu', **shape_kwargs))
            model.add(BatchNormalization())
        model.add(MaxPool2D(pool_size=(2, 2)))

    # Classification head.
    for head_layer in (
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ):
        model.add(head_layer)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [5]:
# Instantiate the compiled CNN in the notebook (driver) process.
model = get_model()

In [6]:
# Broadcast the model's weights; the pandas UDFs read them back through
# `bc_model_weights.value` before predicting/fitting.
bc_model_weights = sc.broadcast(model.get_weights())

In [31]:
def fullpath(path, files):
    """Prefix every entry of ``files`` with ``path``.

    Plain concatenation (``path + name``) is used, so ``path`` must already
    end with the path separator.

    Args:
        path: Prefix to prepend to each entry.
        files: Iterable of file names.

    Returns:
        A new list with ``path`` prepended to each entry, in the same order.
    """
    return [path + name for name in files]
val_dir = "../chest_xray/chest_xray/val"

# os.listdir returns entries in arbitrary order, so removing index 0 could
# discard a real image while keeping an unwanted entry. Presumably the removed
# entry was a hidden file such as .DS_Store, so hidden entries are filtered by
# name instead, and the listing is sorted to make the row order reproducible.
filesPneumo = fullpath(
    val_dir + '/PNEUMONIA/',
    sorted(f for f in os.listdir(os.path.join(val_dir, 'PNEUMONIA')) if not f.startswith('.')),
)
# PNEUMONIA images are labelled 0.0 ...
labelsPneumo = np.zeros(len(filesPneumo))
filesNormal = fullpath(
    val_dir + '/NORMAL/',
    sorted(f for f in os.listdir(os.path.join(val_dir, 'NORMAL')) if not f.startswith('.')),
)
# ... and NORMAL images are labelled 1.0.
labelsNormal = np.ones(len(filesNormal))

# Paths and labels are kept aligned: all PNEUMONIA entries first, then NORMAL.
fileData = filesPneumo + filesNormal
fileLabels = np.concatenate((labelsPneumo, labelsNormal), axis=0).astype(np.float32)

#files.

In [7]:
# Name of the parquet file that stores the flattened images and their labels.
file_name = "image_dataTrain.parquet"
# Directory the parquet file is copied into after it is written.
# NOTE(review): a later cell reads "dbfs/" + file_name (relative to the working
# directory) instead of using this path - confirm both point to the same place.
dbfs_file_path = "../chest_xray/chest_xray/dbfs/"

In [32]:
# Load every listed image, flatten it to a 320*320 float32 vector and store it
# with its label in a parquet file that Spark can read.
image_data = []
for (file, label) in zip(fileData, fileLabels):
  # The context manager closes the underlying file handle once the pixels
  # have been copied out.
  with Image.open(file) as raw_img:
    # Force a single channel so the 320*320*1 reshape below cannot fail on
    # RGB/RGBA inputs; this leaves grayscale ("L") images unchanged.
    img = raw_img.convert("L").resize([320,320])
    data = np.asarray( img, dtype="float32" ).reshape([320*320*1])

  image_data.append({"data": data, "label": label})

pandas_df = pd.DataFrame(image_data, columns = ['data', 'label'])
pandas_df.to_parquet(file_name)
# Make sure the destination directory exists; exist_ok keeps re-runs idempotent.
os.makedirs(dbfs_file_path, exist_ok=True)
shutil.copyfile(file_name, dbfs_file_path+file_name)

'../chest_xray/chest_xray/dbfs/image_dataTrain.parquet'

In [19]:
# Remove the current `df` binding.
# NOTE(review): read top to bottom, this cell comes before the cell that first
# assigns `df`, so it raises NameError on a fresh kernel.
del df

In [8]:
# Load the image parquet file as a Spark DataFrame and print its row count.
# NOTE(review): the path is "dbfs/" relative to the working directory, not
# `dbfs_file_path` - confirm this is the intended location.
df = spark.read.parquet("dbfs/"+file_name)
print(df.count())

16


In [9]:
# Limit how many records go into each Arrow batch handed to the pandas UDFs.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")
# head(1) returns a list of rows, so its length is the number of rows fetched.
# head() without an argument returns a single Row (len == number of columns)
# or None for an empty frame, which made the old check either always pass or
# fail with TypeError instead of this message.
assert len(df.head(1)) > 0, "`df` should not be empty"

In [10]:
def parse_image(image_data,image_label):
  """Reshape one flat image vector to (320, 320, 1); the label passes through unchanged."""
  # Pixel rescaling is currently commented out:
  #image = tf.image.convert_image_dtype(image_data, dtype=tf.float32) * (2. / 255) - 1
  return tf.reshape(image_data, [320, 320, 1]), image_label

In [9]:
@pandas_udf(df.schema, PandasUDFType.SCALAR_ITER)
def predict_batch_udf(image_batch_iter):
  """Fit the model on each (image, label) batch and yield its accuracy history.

  Despite its name this UDF calls ``model.fit`` rather than ``model.predict``.
  It is declared with ``df.schema`` as return type and iterates over pairs of
  (image_batch, label_batch).

  NOTE(review): a function with the same name is defined again later in this
  file; when both cells are run in order the later definition replaces this one.
  NOTE(review): each yield is a DataFrame with one row per epoch (5), which
  does not obviously match ``df.schema`` or the input batch length - confirm
  this satisfies the scalar-iterator pandas UDF contract.
  """
  import tensorflow as tf
  # TF1-style session configured to grow GPU memory on demand, capped at 60%.
  config = tf.compat.v1.ConfigProto()
  config.gpu_options.allow_growth = True
  config.gpu_options.per_process_gpu_memory_fraction=0.6
  # NOTE(review): the session is never closed inside this function.
  session = tf.compat.v1.InteractiveSession(config=config)
  batch_size = 1
  # Rebuild the network and load the broadcast weights once per invocation,
  # before any batch is consumed.
  model = get_model()
  model.set_weights(bc_model_weights.value)
  for image_batch, label_batch in image_batch_iter:
    # Stack the per-row flat image vectors into one 2-D array.
    images = np.vstack(image_batch)
    labels = label_batch
    # Debug output of the dtypes.
    print(images.dtype)
    print(labels.dtype)

    dataset = tf.data.Dataset.from_tensor_slices((images,labels))
    # parse_image reshapes each flat vector to (320, 320, 1).
    dataset = dataset.map(parse_image, num_parallel_calls=12).prefetch(5000).batch(batch_size)
    history = model.fit(x=dataset, epochs=5, verbose=True)
    # One accuracy value per epoch.
    yield pd.DataFrame(list(history.history['accuracy']))



In [10]:
@pandas_udf(ArrayType(FloatType()), PandasUDFType.SCALAR_ITER)
def predict_batch_udf(image_batch_iter):
  """Score batches of flattened images with the broadcast model weights.

  The model is rebuilt and its weights loaded from ``bc_model_weights`` once,
  before the first batch is consumed. For every incoming batch the flat
  vectors are stacked, reshaped to (320, 320, 1) and passed to
  ``model.predict``; one prediction array per input row is yielded.
  """
  def to_image(flat_pixels):
      # Restore the height x width x channel layout of one flat vector.
      return tf.reshape(flat_pixels, [320, 320, 1])

  per_step = 1
  net = get_model()
  net.set_weights(bc_model_weights.value)

  for flat_batch in image_batch_iter:
    stacked = np.vstack(flat_batch)
    pipeline = (
        tf.data.Dataset.from_tensor_slices(stacked)
        .map(to_image, num_parallel_calls=8)
        .prefetch(5000)
        .batch(per_step)
    )
    yield pd.Series(list(net.predict(pipeline)))



In [175]:
def train(datasetDF, epochs=5, batch_size=1, lr=0.001):
    """Driver-side training loop that scores data with the Spark UDF each epoch.

    For every epoch the ``data`` column is scored with ``predict_batch_udf``,
    a binary cross-entropy loss is computed from the collected labels and
    predictions, gradients with respect to the model's trainable weights are
    evaluated in a TF1 session, and each weight is updated by
    ``lr * gradient``.

    Args:
        datasetDF: Spark DataFrame with ``data`` and ``label`` columns.
        epochs: Number of passes over the data.
        batch_size: Not referenced anywhere in the body.
        lr: Step size applied to every evaluated gradient.

    NOTE(review): ``bc_model_weights`` is assigned inside this function, so it
    is a local name; the module-level ``bc_model_weights`` that
    ``predict_batch_udf`` reads is not rebound here - confirm the UDF ever
    sees the updated weights.
    NOTE(review): the loss is built from NumPy arrays collected from Spark,
    not from ``model``'s output tensor - confirm that the gradients with
    respect to ``model.trainable_weights`` are defined.
    NOTE(review): the InteractiveSession created here is never closed.
    """
    def np2Img(arrList):
        """Reshape each element of ``arrList`` to (320, 320, 1) and stack the results."""
        def parse_image(image_data):
            image = tf.reshape(image_data,[320,320,1])
            return image
        conv = []
        arrList = arrList.flatten()
        # NOTE(review): `test` is never used after this assignment.
        test = arrList[0:2]
        for i,el in enumerate(arrList):
            conv.append(parse_image(np.asarray(el)))
        return np.asarray(conv)
    def lists2nparray(arrList, labels=False):
        """Flatten ``arrList`` into a column vector of shape (n, 1).

        With ``labels=True`` every element is stored as is; otherwise the
        first item of each element is taken.
        """
        arrList = arrList.flatten()
        conv = np.zeros(len(arrList))
        for i,el in enumerate(arrList):
            if labels:
                conv[i] = el
            else:
                conv[i] = el[0]
        return conv.reshape(-1,1)
    import tensorflow as tf
    # TF1-style session: GPU memory grows on demand, capped at 60%.
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction=0.6
    # Graph mode is needed for the sess.run calls below.
    tf.compat.v1.disable_eager_execution()
    sess = tf.compat.v1.InteractiveSession(config=config)
    model = get_model()
    # Local binding - see the NOTE(review) in the docstring.
    bc_model_weights = sc.broadcast(model.get_weights())
    data = datasetDF[["data"]]
    labels = datasetDF[["label"]]
    for e in range(epochs):
        print("Epoch: %d" %e)
        # Score the data column with the distributed UDF.
        predictions_df = data.select(predict_batch_udf(col("data")).alias("prediction"))
        # Labels and predictions are both collected to the driver as NumPy column vectors.
        loss = tf.keras.losses.binary_crossentropy(lists2nparray(labels.toPandas().to_numpy(), labels=True), lists2nparray(predictions_df.toPandas().to_numpy()) )
        gradients = tf.keras.backend.gradients(loss, model.trainable_weights)
        # Feed the whole collected image set as the model input when evaluating the gradients.
        evaluated_gradients = sess.run(gradients, feed_dict={model.input: np2Img(data.select(col("data")).toPandas().to_numpy())})
        # For every trainable weight tensor in the network
        for i in range(len(model.trainable_weights)):
            layer = model.trainable_weights[i]  # Select the weight tensor
            # Update in place: weight -= lr * gradient.
            sess.run(tf.compat.v1.assign_sub(layer, lr * evaluated_gradients[i]))
        # Re-broadcast the updated weights (again bound to the local name).
        bc_model_weights = sc.broadcast(model.get_weights())

In [19]:
# Path under the dbfs directory; it is not referenced again in this cell.
output_file_path = "../chest_xray/chest_xray/dbfs/history"
# Call predict_batch_udf with both the data and label columns and show the result.
# NOTE(review): the recorded output of this cell is a NameError, so it was run
# before any predict_batch_udf definition; the last definition in this file
# iterates over single-column batches - confirm which variant is intended here.
history_df = df.select(predict_batch_udf(col("data"),col("label")).alias("history"))
history_df.show()


NameError: name 'predict_batch_udf' is not defined

In [176]:
# Run the training loop with its defaults (epochs=5, batch_size=1, lr=0.001).
train(df)



Epoch: 0


KeyboardInterrupt: 

In [120]:
# Keep only the `data` column and collect it to the driver as a NumPy array.
d = df[["data"]]
#predictions_df = d.select(predict_batch_udf(col("data")).alias("prediction"))
x = d.toPandas().to_numpy()

In [35]:
# Small scratch frame: a single integer column named "data" with four rows.
v = pd.DataFrame(np.array([[0], [2], [4], [4]]), columns=["data"])

In [63]:
def foreach(x):
    """Print the first element of ``x`` as a NumPy array; returns None."""
    print(np.array(x[0]))


def convert(x):
    """Return ``x`` as a float32 NumPy array."""
    return np.asarray(x).astype(np.float32)


# Inspect the first row, then the first entry of the first column.
foreach(x)
x1 = foreach(x[:,0])

[list([0.8942124843597412])]
[0.89421248]


In [163]:
def np2Img(arrList):
    """Reshape every element of ``arrList`` to (320, 320, 1) and stack the results.

    ``arrList`` is flattened first; each element is converted to an array and
    reshaped with ``tf.reshape`` before everything is gathered with
    ``np.asarray``.
    """
    flat_items = arrList.flatten()
    images = [tf.reshape(np.asarray(item), [320, 320, 1]) for item in flat_items]
    return np.asarray(images)
# Apply np2Img to the collected array `x`.
p = np2Img(x)

array([list([0.8942124843597412]), list([0.7883509993553162]),
       list([0.9732122421264648]), list([0.9721941351890564]),
       list([0.9826411008834839]), list([0.9366998672485352]),
       list([0.9619568586349487]), list([0.7642014622688293]),
       list([0.9734371900558472]), list([0.9742845296859741]),
       list([0.9750548601150513]), list([0.928562581539154]),
       list([0.941596269607544]), list([0.9728338718414307]),
       list([0.98555588722229]), list([0.9796028137207031])], dtype=object)