In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib
matplotlib.use('Agg')
%pylab inline 
import pandas
import datetime as dt

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist
from pyspark import SparkContext
import os 
import matplotlib.pyplot as plt
import cv2 
from bigdl.transform.vision.image import *

import numpy as np # needed for reshaping binary image data 
import tensorflow as tf # needed to implement classifier

# Silence TensorFlow deprecation messages so the cell outputs stay readable.
# NOTE(review): this flips a private flag (leading underscore) inside
# tensorflow.python.util.deprecation, so it may stop working in another
# TensorFlow version.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

# Reuse the running SparkContext if one exists, otherwise create one from the
# conf built here: master "local[4]" and spark.driver.memory set to "2g".
# create_spark_conf and init_engine are not defined in this notebook; they
# presumably come from one of the bigdl star imports above.
sc = SparkContext.getOrCreate(conf=create_spark_conf().setMaster("local[4]").set("spark.driver.memory","2g"))
init_engine()

Populating the interactive namespace from numpy and matplotlib




Prepending /home/dhrubanka/.local/lib/python3.7/site-packages/bigdl/share/conf/spark-bigdl.conf to sys.path


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
'''Functions to classify images'''


def load_graph(model_file):
    """Build a ``tf.Graph`` from a serialized ``GraphDef`` file.

    Args:
        model_file: Path of the serialized graph definition; it is opened in
            binary mode and parsed with ``GraphDef.ParseFromString``.

    Returns:
        A new ``tf.Graph`` into which the parsed definition was imported with
        ``tf.import_graph_def`` (no explicit name is passed to the import).
    """
    # Read and parse the serialized definition first ...
    parsed_def = tf.GraphDef()
    with open(model_file, "rb") as model_fh:
        serialized = model_fh.read()
    parsed_def.ParseFromString(serialized)

    # ... then import it into a graph of its own so that the process-wide
    # default graph is left untouched.
    imported = tf.Graph()
    with imported.as_default():
        tf.import_graph_def(parsed_def)
    return imported


def read_tensor_from_image_file(image,
                                input_height=299,
                                input_width=299,
                                input_mean=0,
                                input_std=255):
    """Resize an in-memory image and normalize it into a one-image batch.

    Despite the name, nothing is read from disk: ``image`` is an already
    decoded pixel array.

    Args:
        image: Pixel array accepted by ``cv2.resize``.
        input_height: Height, in pixels, of the resized image.
        input_width: Width, in pixels, of the resized image.
        input_mean: Value subtracted from every pixel.
        input_std: Value every pixel is divided by after the subtraction.

    Returns:
        A float32 NumPy array holding the resized image with a leading batch
        axis of size 1, computed as ``(pixel - input_mean) / input_std``.
    """
    # cv2.resize expects dsize as (width, height), not (height, width).
    resized = cv2.resize(image,
                         dsize=(input_width, input_height),
                         interpolation=cv2.INTER_CUBIC)

    # Convert to float *before* subtracting the mean: doing the arithmetic on
    # uint8 data wraps around as soon as input_mean is non-zero.
    batch = np.expand_dims(np.asarray(resized, dtype=np.float32), axis=0)

    # Plain NumPy arithmetic replaces the former TensorFlow ops, which opened
    # a new, never-closed tf.Session and added nodes to the default graph on
    # every single call.
    return ((batch - input_mean) / input_std).astype(np.float32, copy=False)


def load_labels(label_file):
    """Read the class labels, one per line, from ``label_file``.

    Args:
        label_file: Path of the text file, opened through ``tf.gfile.GFile``.

    Returns:
        A list with one string per line of the file, in file order, with
        trailing whitespace (including the newline) stripped.
    """
    # The context manager closes the handle; the previous version left the
    # GFile open after reading.
    with tf.gfile.GFile(label_file) as label_fh:
        return [line.rstrip() for line in label_fh.readlines()]

def predict(graph, input_operation, output_operation, t, labels=None):
    """Run the classifier on one input batch and rank the classes.

    Args:
        graph: ``tf.Graph`` that contains both operations.
        input_operation: Operation whose first output is fed with ``t``.
        output_operation: Operation whose first output is evaluated.
        t: Input batch fed to ``input_operation.outputs[0]``.
        labels: Optional list of class labels. When omitted, the labels are
            read from the module-level ``label_file`` on every call, as
            before; passing them in avoids that repeated disk read and the
            hidden dependency on the global.

    Returns:
        A list ``[top_k, labels, results]`` where ``results`` is the squeezed
        model output, ``top_k`` holds the indices of the (at most) five
        largest values of ``results`` in descending order of value, and
        ``labels`` is the label list.
    """
    # A session is opened per call and closed by the context manager.
    with tf.Session(graph=graph) as sess:
        results = sess.run(output_operation.outputs[0],
                           {input_operation.outputs[0]: t})
    results = np.squeeze(results)

    # argsort is ascending: take the last five indices, then reverse them.
    top_k = results.argsort()[-5:][::-1]

    if labels is None:
        labels = load_labels(label_file)
    return [top_k, labels, results]

# Model parameters: paths of the serialized graph and of its label file,
# relative to the notebook's working directory.
model_file = "brain_tumor_weights/output_graph.pb"
label_file = "brain_tumor_weights/output_labels.txt"
# Pre-processing settings passed to read_tensor_from_image_file by the
# prediction loop: target size in pixels and the normalization constants
# used as (pixel - input_mean) / input_std.
input_height = 299
input_width = 299
input_mean = 0
input_std = 255
# Names of the graph operations that are fed and evaluated by predict().
input_layer = "Mul"
output_layer = "final_result"

# Load the graph once, up front, so that it is not re-read for every image.
graph = load_graph(model_file)

# Resolve the input/output operations once as well. The "import/" prefix is
# presumably the name scope tf.import_graph_def applies when load_graph
# imports the definition without an explicit name; confirm against the
# TensorFlow docs if the lookup fails.
input_name = "import/" + input_layer
output_name = "import/" + output_layer
input_operation = graph.get_operation_by_name(input_name)
output_operation = graph.get_operation_by_name(output_name)

In [4]:
# Reach the Hadoop FileSystem API through the JVM gateway of the Spark context.
hadoop = sc._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration()

# Directory that holds the raw image files.
path = hadoop.fs.Path('/lsdp_project/brain_cancer_dataset')

# Every file is decoded as one raw uint8 image of exactly this shape; the
# reshape below raises ValueError for a file of any other size.
image_shape = (512, 512, 3)

# Stop after this many images. Set to None to process the whole directory
# (around 4682 files).
max_images = 10

# (file path, predicted label) pairs, in listing order.
file_predicitons = []

# Number of images processed so far.
count = 0

for f in fs.get(conf).listStatus(path):

    # Path of the current binary file, as a string.
    image_file_path = str(f.getPath())

    # Read the file through Spark; each collected element is a
    # (path, content) pair and only one file was requested.
    image_bytes = sc.binaryFiles(image_file_path).collect()[0][1]

    # Rebuild the image array from the raw bytes.
    recovered_image = np.frombuffer(image_bytes, dtype='uint8').reshape(image_shape)

    # Resize and normalize the image into the model's input batch.
    t = read_tensor_from_image_file(
        recovered_image,
        input_height=input_height,
        input_width=input_width,
        input_mean=input_mean,
        input_std=input_std)

    # Run the model and keep the label of the highest-scoring class.
    top_k, labels, results = predict(graph, input_operation, output_operation, t)
    prediction_label = labels[top_k[0]]

    # Record the (file, prediction) pair.
    file_predicitons.append((image_file_path, prediction_label))
    count += 1

    print('Image: ',str(image_file_path),' Tumor Prediction: ',prediction_label)

    if max_images is not None and count >= max_images:
        break

Image:  hdfs://localhost:9000/lsdp_project/brain_cancer_dataset/2.8Gb Dignosis for Cancer_DOI_R_004_1.3.6.1.4.1.14519.5.2.1.4320.5030.248552508121514040263344871813_1.3.6.1.4.1.14519.5.2.1.4320.5030.966354075876482042295761929295_000000  Tumor Prediction:  no
Image:  hdfs://localhost:9000/lsdp_project/brain_cancer_dataset/2.8Gb Dignosis for Cancer_DOI_R_004_1.3.6.1.4.1.14519.5.2.1.4320.5030.248552508121514040263344871813_1.3.6.1.4.1.14519.5.2.1.4320.5030.966354075876482042295761929295_000001  Tumor Prediction:  yes
Image:  hdfs://localhost:9000/lsdp_project/brain_cancer_dataset/2.8Gb Dignosis for Cancer_DOI_R_004_1.3.6.1.4.1.14519.5.2.1.4320.5030.248552508121514040263344871813_1.3.6.1.4.1.14519.5.2.1.4320.5030.966354075876482042295761929295_000002  Tumor Prediction:  yes
Image:  hdfs://localhost:9000/lsdp_project/brain_cancer_dataset/2.8Gb Dignosis for Cancer_DOI_R_004_1.3.6.1.4.1.14519.5.2.1.4320.5030.248552508121514040263344871813_1.3.6.1.4.1.14519.5.2.1.4320.5030.9663540758764820422

In [10]:
import pickle
 
# Persist the list of (file path, predicted label) tuples built by the
# prediction loop; the file is overwritten if it already exists.
with open('classification_results.pkl', 'wb') as results_fh:
    pickle.dump(file_predicitons, results_fh)

In [11]:
# Display how many images the prediction loop processed (it increments
# `count` once per image).
count 

500