In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
# Locations of the three dataset splits, relative to the notebook's working directory.
train_dir = "../chest_xray/chest_xray/train"
test_dir = "../chest_xray/chest_xray/test"
val_dir = "../chest_xray/chest_xray/val"

# The training counts are kept in named variables because a later cell derives the
# class weights from them. Every directory entry is counted, not only image files.
print("Train set:\n========================================")
num_pneumonia = len(os.listdir(os.path.join(train_dir, 'PNEUMONIA')))
num_normal = len(os.listdir(os.path.join(train_dir, 'NORMAL')))
print(f"PNEUMONIA={num_pneumonia}")
print(f"NORMAL={num_normal}")

# The remaining splits are only reported, so a loop over (heading, directory) suffices.
for heading, split_dir in (("Test set", test_dir), ("Validation set", val_dir)):
    print(f"{heading}:\n========================================")
    for class_name in ('PNEUMONIA', 'NORMAL'):
        n_entries = len(os.listdir(os.path.join(split_dir, class_name)))
        print(f"{class_name}={n_entries}")

Train set:
PNEUMONIA=3876
NORMAL=1342
Test set:
PNEUMONIA=390
NORMAL=234
Validation set:
PNEUMONIA=9
NORMAL=9


In [3]:
def fullpath(path, files):
    """Prefix every file name in `files` with `path`.

    Args:
        path: Directory prefix. It is concatenated as-is, so it must already end
            with a path separator.
        files: Iterable of file names (strings).

    Returns:
        A 1-D NumPy string array holding `path + name` for each name, in input order.
        An empty `files` yields an empty *string* array (not the float64 array NumPy
        would infer by default), so it can be concatenated with other path arrays.
    """
    return np.asarray([path + name for name in files], dtype=str)

def _jpeg_files(directory):
    """Return the sorted names of the JPEG files located directly inside `directory`."""
    # os.listdir returns entries in arbitrary order, so blindly dropping "entry 0"
    # (as this cell used to do, presumably to skip a stray non-image file) can discard
    # a real image instead. Filtering by extension removes only non-image entries, and
    # sorting makes the resulting order reproducible across runs.
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(('.jpeg', '.jpg'))
    )

# NOTE(review): these arrays are named train_* but are read from test_dir — confirm
# which split is intended.
# Label convention: 1 = PNEUMONIA, 0 = NORMAL.
train_pneumo = fullpath(test_dir+'/PNEUMONIA/', _jpeg_files(os.path.join(test_dir, 'PNEUMONIA')))
labels_pneumo = np.ones(len(train_pneumo))
train_normal = fullpath(test_dir+'/NORMAL/', _jpeg_files(os.path.join(test_dir, 'NORMAL')))
labels_normal = np.zeros(len(train_normal))

In [4]:
# Stack the two classes into single arrays: all PNEUMONIA entries first, then all
# NORMAL entries. Paths and labels are concatenated in the same order so that
# position i of fullLabels is the label of position i of fullTrain.
fullTrain = np.concatenate([train_pneumo, train_normal])
fullLabels = np.concatenate([labels_pneumo, labels_normal])

In [5]:
# Build a dataset with one element per (image path, label) pair.
path_label_arrays = (fullTrain, fullLabels)
dataset = tf.data.Dataset.from_tensor_slices(path_label_arrays)

In [6]:
def decode_and_normalize(serialized_example, serialized_class):
  """
  Load, decode and normalize one image together with its label.
  It is used as a map function for `dataset.map`.

  Args:
    serialized_example: scalar string tensor holding the path of a JPEG file.
    serialized_class: numeric class label of that image.

  Returns:
    A tuple `(image, label)`: `image` is a float32 tensor of shape
    (IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) with values in [-0.5, 0.5], and
    `label` is the class cast to int32.
  """
  IMAGE_SIZE = 320
  # The CNN defined later in this notebook declares input_shape=(320, 320, 3).
  NUM_CHANNELS = 3

  # 1. read the file and decode it. Forcing the channel count makes grayscale and
  #    colour JPEGs come out with the same, statically known, number of channels;
  #    without it the channel count follows each individual file.
  image = tf.io.decode_jpeg(tf.io.read_file(serialized_example), channels=NUM_CHANNELS)
  label = tf.cast(serialized_class, tf.int32)
  # 2. resize
  image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])
  # 3. normalize the data: scale [0, 255] to [0, 1], then centre around zero
  image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
  return image, label

In [7]:
# Turn every (path, label) element into a decoded, resized and normalized
# (image, label) element.
parsed_dataset = dataset.map(map_func=decode_and_normalize)

In [8]:
import tensorflow as tf
from pyspark.sql import SparkSession
# Must correspond to the current JDK used by Colab.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Must correspond to the downloaded Spark (1st line).
os.environ["SPARK_HOME"] = "/opt/spark/"

# Connect to the standalone Spark master and create (or reuse) the session.
spark = (
    SparkSession.builder
    .master("spark://192.168.1.38:7077")
    .appName("testTrain")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

In [9]:
# Hand the parsed (image, label) dataset to Spark so that it becomes an RDD.
# NOTE(review): parallelize presumably has to iterate the whole tf.data.Dataset on the
# driver (reading and decoding every image) before distributing it — confirm that this
# fits in driver memory and that the resulting elements can be serialized by Spark.
ds = sc.parallelize(parsed_dataset)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten, BatchNormalization


# Binary classifier: three convolutional stages (32 -> 64 -> 128 filters), each made of
# two Conv2D + BatchNormalization pairs followed by 2x2 max pooling, then a small dense
# head ending in a single sigmoid unit.
# Only the first layer declares `input_shape`; later layers infer their input from the
# preceding layer, so repeating it on the second convolution was redundant.
model = Sequential([
    # Stage 1: 32 filters
    Conv2D(filters=32, kernel_size=(3, 3), input_shape=(320, 320, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2, 2)),

    # Stage 2: 64 filters
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2, 2)),

    # Stage 3: 128 filters
    Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2, 2)),

    # Classification head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Sigmoid output + binary cross-entropy: the model predicts the probability of class 1.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])




In [68]:
# Class weights taken from the training-directory counts: each class is weighted by
# the share of the *other* class, so the less frequent class gets the larger weight.
# Label 0 = NORMAL, label 1 = PNEUMONIA.
total_train_entries = num_normal + num_pneumonia
weight_for_0 = num_pneumonia / total_train_entries
weight_for_1 = num_normal / total_train_entries
class_weight = dict(zip((0, 1), (weight_for_0, weight_for_1)))

In [19]:
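# NOTE: saving straight to an hdfs:// URI fails with the OSError shown below, apparently
# because the URI is opened like a local file path. Save the model to a local path and
# copy it into HDFS instead.
#model.save("hdfs://192.168.1.38:9000/user/andrea/BDmodel.h5")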

OSError: Unable to create file (unable to open file: name = 'hdfs://192.168.1.38:9000/user/andrea/BDmodel.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)

In [21]:
# Read the stored model file from HDFS through Spark, then bring the result back to
# the driver. Despite the "pkl" in the variable name, the file is an .h5 file.
model_rdd_pkl = sc.binaryFiles(
    "hdfs://192.168.1.38:9000/user/andrea/BDmodel.h5"
)
model_rdd_data = model_rdd_pkl.collect()

In [None]:
def predAll(dataset):
    """Run the model on every row of an iterable of `(image, label)` pairs.

    Meant to be applied once per RDD partition (via `mapPartitions`), so that
    `dataset` is the iterator over that partition's rows.

    Args:
        dataset: Iterable of `(image, label)` rows; the label is ignored here.

    Returns:
        A list with one `model.predict` output per row, in row order.
    """
    predictions = []
    for image, _label in dataset:
        # model.predict expects a batch, so add a leading batch axis of size 1.
        batch = np.expand_dims(np.asarray(image), axis=0)
        predictions.append(model.predict(batch))
    return predictions

# mapPartitions hands predAll the iterator over a whole partition. With plain map it
# received a single (image, label) row, looped over that tuple and ended up calling
# predict on the label as well.
# NOTE(review): `model` is captured by the closure and must be shipped to the
# executors — confirm that it serializes correctly in this environment.
labels = ds.mapPartitions(predAll).collect()