## Image Classifier based on 3D RGB Histogram

In [1]:
# find all PNG files in the dataset
import glob

# Raw string: the Windows separators must reach glob as literal backslashes.
# Without the r-prefix, sequences such as "\U" are only left alone by accident
# on Python 2 and are a syntax error on Python 3.
IMAGE_GLOB = r'C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\*\images\*\*.png'

# list of every matching image path; shown as the cell output
path = glob.glob(IMAGE_GLOB)
path

['C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.51_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.53_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.55_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.57_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.59_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.61_999_6.png',
 'C:\\Users\\shade\\Dropbox\\Data_Science_Bowl_2018_bleu\\MMMP_Segmented_Data\\1_10_10\\images\\2_color_DAPI\\1_161_10_10_547.63_999_6.png',
 'C:\\Users\\

In [8]:
import numpy as np
import cv2
 
class RGBHistogram:
    """Image descriptor that summarises an image as a flattened 3D colour histogram."""

    def __init__(self, bins):
        """Store the histogram resolution.

        bins -- number of bins per channel, passed to cv2.calcHist as the
                histSize argument (e.g. [8, 8, 8])
        """
        self.bins = bins

    def describe(self, image):
        """Return the 3-channel histogram of `image` as a flat array.

        image -- a 3-channel image array such as the one cv2.imread returns.
                 NOTE(review): cv2.imread yields channels in BGR order; the
                 order is the same for every image described here, so
                 comparisons between descriptors are unaffected.

        Raises ValueError if `image` is None, which is what cv2.imread
        returns when a file cannot be read.
        """
        if image is None:
            raise ValueError("cannot describe image: got None "
                             "(cv2.imread returns None for unreadable files)")

        # joint histogram over channels 0, 1 and 2, no mask, full 0-255 range
        hist = cv2.calcHist([image], [0, 1, 2],
            None, self.bins, [0, 256, 0, 256, 0, 256])

        # flatten the 3D histogram into a single feature vector
        return hist.flatten()

In [9]:
# Lookup table that will hold the quantified images:
#   key   -> image filename
#   value -> feature vector computed for that image
index = dict()

In [10]:
# build the image descriptor: a 3D RGB histogram using 8 bins for each channel
desc = RGBHistogram(bins=[8, 8, 8])

In [11]:
for imagePath in path:
    # Key each entry by the full path returned by glob. The previous
    # `imagePath.rfind("/")` slice never matched these backslash-separated
    # Windows paths, so the key was already the whole path. Keeping the
    # folder in the key also keeps the stain type (e.g. 2_color_DAPI)
    # visible in search results and would avoid collisions between equal
    # filenames that live in different folders.
    k = imagePath

    # load the image; cv2.imread returns None when the file cannot be read
    image = cv2.imread(imagePath)
    if image is None:
        print("skipping unreadable image: %s" % imagePath)
        continue

    # describe it with the RGB histogram descriptor and update the index
    index[k] = desc.describe(image)

In [14]:
import cPickle
# Done indexing images, write to disk

# `with` closes the file even if pickling or writing fails.
# Text mode is kept on purpose: the cells that read "index" back open it in
# text mode too, and the default ASCII pickle protocol requires the writer
# and the readers to use the same mode.
with open("index", "w") as f:
    f.write(cPickle.dumps(index))

In [15]:
class Searcher:
    """Ranks the images of an index by chi-squared distance to a query histogram."""

    def __init__(self, index):
        """Store the index: a mapping of image key -> feature vector."""
        self.index = index

    def search(self, queryFeatures):
        """Compare `queryFeatures` against every entry of the index.

        Returns a list of (distance, key) tuples sorted in ascending order,
        so the smallest distances (most similar images) come first. An empty
        index yields an empty list.
        """
        # one (distance, key) pair per indexed image
        results = [(self.chi2_distance(features, queryFeatures), k)
                   for (k, features) in self.index.items()]

        # tuples sort on distance first; equal distances fall back to the key
        results.sort()

        return results

    def chi2_distance(self, histA, histB, eps = 1e-10):
        """Return the chi-squared distance between two histograms.

        histA, histB -- array-likes with the same number of elements
        eps          -- small constant added to the denominator so that bins
                        empty in both histograms do not divide by zero

        Raises ValueError when the histograms differ in size; comparing them
        bin by bin would be meaningless in that case.
        """
        # float64 keeps the squared differences of large bin counts accurate
        a = np.asarray(histA, dtype=np.float64).ravel()
        b = np.asarray(histB, dtype=np.float64).ravel()

        if a.shape != b.shape:
            raise ValueError("histograms differ in size: %d vs %d"
                             % (a.size, b.size))

        # vectorised form of 0.5 * sum((a_i - b_i)^2 / (a_i + b_i + eps))
        return 0.5 * np.sum((a - b) ** 2 / (a + b + eps))

In [16]:
# NOTE: unpickling can execute arbitrary code; only load an "index" file
# that was written by this notebook.
# `with` closes the file handle instead of leaving it to the garbage collector.
with open("index") as f:
    index = cPickle.loads(f.read())
searcher = Searcher(index)

## Test the search function with an image from the (indexed) dataset

In [24]:
# testing with 2-color DAPI image
queryPath = "1_161_5_2_547.77_999_28.png"

queryImage = cv2.imread(queryPath)
if queryImage is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise IOError("could not read query image: %s" % queryPath)

print("query: %s" % (queryPath))

# describe the query as a 3D RGB histogram with 8 bins per channel
desc = RGBHistogram([8, 8, 8])
queryFeatures = desc.describe(queryImage)

# load the index and perform the search
# NOTE: only unpickle an "index" file written by this notebook
with open("index") as f:
    index = cPickle.loads(f.read())
searcher = Searcher(index)
results = searcher.search(queryFeatures)

# print the top ten results (fewer if the index holds fewer than ten images)
for j, (score, imageName) in enumerate(results[:10]):
    # the index keys are the full image paths, so the key can be loaded
    # directly (the old code prefixed the literal string "path")
    result = cv2.imread(imageName)
    print("\t%d. %s : %.3f" % (j + 1, imageName, score))
 


query: 1_161_5_2_547.77_999_28.png
	1. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_2_5\images\2_color_DAPI\1_161_5_2_547.77_999_28.png : 0.000
	2. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_10\images\2_color_DAPI\1_161_10_7_547.77_999_10.png : 80926.916
	3. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_6_9\images\2_color_DAPI\1_161_9_6_547.77_999_53.png : 106376.528
	4. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_6\images\2_color_DAPI\1_161_6_7_547.77_999_36.png : 128346.728
	5. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_2\images\2_color_DAPI\1_161_2_7_547.77_999_15.png : 209300.393
	6. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_6\images\2_color_DAPI\1_161_6_7_547.79_999_36.png : 212921.615
	7. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_8\images\2_color_DAPI\1_161_8_7_547.77

## We see that the best match is the matching image in the dataset, and the best matching images come from the same type of data (2_color_DAPI)

In [25]:
# try hematoxylin eosin image
queryPath = "1_161_6_7_547.82_999_36.png"

queryImage = cv2.imread(queryPath)
if queryImage is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise IOError("could not read query image: %s" % queryPath)

print("query: %s" % (queryPath))

# describe the query the same way the index was built:
# a 3D RGB histogram with 8 bins per channel
desc = RGBHistogram([8, 8, 8])
queryFeatures = desc.describe(queryImage)

# load the index and perform the search
# NOTE: only unpickle an "index" file written by this notebook
with open("index") as f:
    index = cPickle.loads(f.read())
searcher = Searcher(index)
results = searcher.search(queryFeatures)

# print the top ten results (fewer if the index holds fewer than ten images)
for j, (score, imageName) in enumerate(results[:10]):
    # the index keys are the full image paths, so the key can be loaded
    # directly (the old code prefixed the literal string "path")
    result = cv2.imread(imageName)
    print("\t%d. %s : %.3f" % (j + 1, imageName, score))
 

query: 1_161_6_7_547.82_999_36.png
	1. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_6\images\Hematoxylin_Eosin\1_161_6_7_547.82_999_36.png : 0.000
	2. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_8\images\Hematoxylin_eosin\1_161_8_7_547.82_999_48.png : 287870.216
	3. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_10\images\Hematoxylin_eosin\1_161_10_7_547.82_999_10.png : 295486.603
	4. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_10_10\images\Hematoxylin_Eosin\1_161_10_10_547.82_999_6.png : 304433.142
	5. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_6_6\images\Hematoxylin_Eosin\1_161_6_6_547.82_999_35.png : 313284.047
	6. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\2_1_4\images\Hematoxylin_Eosin\2_161_4_1_547.82_999_77.png : 324158.867
	7. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_9\im

## We see that the best match is the matching image in the dataset, and the best matching images come from the same type of data (Hematoxylin_Eosin)

## Test foreign images from our training dataset

In [26]:

# try a foreign image, i.e. one whose filename is not among the indexed files
queryPath = "87ea72894f6534b28e740cc34cf5c9eb75d0d8902687fce5fcc08a92e9f41386.png"

queryImage = cv2.imread(queryPath)
if queryImage is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise IOError("could not read query image: %s" % queryPath)

print("query: %s" % (queryPath))

# describe the query the same way the index was built:
# a 3D RGB histogram with 8 bins per channel
desc = RGBHistogram([8, 8, 8])
queryFeatures = desc.describe(queryImage)

# load the index and perform the search
# NOTE: only unpickle an "index" file written by this notebook
with open("index") as f:
    index = cPickle.loads(f.read())
searcher = Searcher(index)
results = searcher.search(queryFeatures)

# print the top ten results (fewer if the index holds fewer than ten images)
for j, (score, imageName) in enumerate(results[:10]):
    # the index keys are the full image paths, so the key can be loaded
    # directly (the old code prefixed the literal string "path")
    result = cv2.imread(imageName)
    print("\t%d. %s : %.3f" % (j + 1, imageName, score))

query: 87ea72894f6534b28e740cc34cf5c9eb75d0d8902687fce5fcc08a92e9f41386.png
	1. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_6_9\images\PAS\1_161_9_6_547.84_999_53.png : 2923573.128
	2. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_10\images\PAS\1_161_10_7_547.84_999_10.png : 2925771.535
	3. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_3_1\images\PAS\1_161_3_1_547.84_999_17.png : 2951508.902
	4. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_6_9\images\Hematoxylin_Eosin\1_161_9_6_547.82_999_53.png : 2960154.458
	5. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_7_10\images\Hematoxylin_eosin\1_161_10_7_547.82_999_10.png : 2967203.315
	6. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data\1_3_1\images\Hematoxylin_Eosin\1_161_3_1_547.82_999_17.png : 2972724.170
	7. C:\Users\shade\Dropbox\Data_Science_Bowl_2018_bleu\MMMP_Segmented_Data

## The classification points to both PAS and Hematoxylin Eosin images - two stain types that are very similar.