In [None]:
#==================================================================
#Program: blockVerification
#Version: 1.0
#Author: David Helminiak
#Date Created: September 4, 2024
#Date Last Modified: September 6, 2024
#Changelog: 1.0 - Assembly and cropping - September 2024
#Description: Verify .tif block images were actually extracted from the associated WSI .jpg
#Operation: Move back into main program directory before running.
#Status: Confirmed that several of the sample block sets (listed below) were not derived from WSI .jpg files
# ['10', '12', '13', '14', '15', '21', '3', '32', '36', '38', '40', '42', '43', '45', '48', '49', '52', '53', '55', '56', '59', '6', '63', '66', '69', '8']
#69 did match with its color-corrected variant, but several others examined did not match with any WSI available.
#Will re-extract blocks from BaSIC-corrected (but not color corrected) variants and visually confirm matches. 
#See blockReExtration.ipynb
#==================================================================

#Have the notebook fill more of the display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:80% !important; }</style>"))

#RNG seed value to ensure run-to-run consistency (-1 to disable)
#NOTE(review): not referenced elsewhere in this notebook; presumably consumed by EXTERNAL.py - confirm
manualSeedValue = 0

#Debugging mode
#NOTE(review): not referenced elsewhere in this notebook; presumably consumed by EXTERNAL.py - confirm
debugMode = False

#TQDM progress bar visualization flag (passed as the ascii option of the sample progress bar)
asciiFlag = False

#Load external libraries
#The script is executed in this notebook's namespace, so it must run after the flags above are set
#A context manager is used so the file handle is closed as soon as the source has been read
with open("./CODE/EXTERNAL.py", encoding='utf-8') as externalFile:
    exec(externalFile.read())

#When splitting WSI images, what size should the resulting blocks be (default: 400)
#Should remain consistent with block sizes given for training
blockSize = 400

#Arbitrary block foreground threshold; 80% with values at least = 64
#Compared against the sum, over all pixels in a block, of the per-pixel channel mean
threshold = 0.80*blockSize*blockSize*64


In [None]:
#Directory layout; a trailing empty component makes os.path.join end each directory with a separator
dir_data = os.path.join('.', 'DATA', '')
dir_blocks_data = os.path.join(dir_data, 'BLOCKS', '')
dir_blocks_inputBlocks = os.path.join(dir_blocks_data, 'INPUT_BLOCKS', '')
dir_blocks_outputBlocksTIF = os.path.join(dir_blocks_data, 'OUTPUT_BLOCKS_TIF', '')
dir_blocks_outputBlocksJPG = os.path.join(dir_blocks_data, 'OUTPUT_BLOCKS_JPG', '')
dir_blocks_outputBlocksDIF = os.path.join(dir_blocks_data, 'OUTPUT_BLOCKS_DIF', '')
dir_blocks_outputWSITIF = os.path.join(dir_blocks_data, 'OUTPUT_WSI_TIF', '')
dir_blocks_outputWSIJPG = os.path.join(dir_blocks_data, 'OUTPUT_WSI_JPG', '')
file_blocks_labels = os.path.join(dir_blocks_inputBlocks, 'Patch_list.xlsx')
dir_blocks_inputWSI = os.path.join(dir_blocks_data, 'INPUT_WSI', '')

#Read the headerless block listing (name, label), keeping both columns as strings
metadata = pd.read_excel(
    file_blocks_labels,
    header=None,
    names=['name', 'label'],
    converters={'name':str,'label':str},
)
blockNamesAll_blocks = np.array(metadata['name'])
blockLabelsAll_blocks = np.array(metadata['label'])

#The sample name is the second token when a block name is split on 'PS' or '_'
blockSampleNamesAll_blocks = np.array([re.split('PS|_', blockName)[1] for blockName in blockNamesAll_blocks])

#Each block is stored as <INPUT_BLOCKS>/S<sample name>/<block name>.tif
blockFilenamesAll_blocks = [
    dir_blocks_inputBlocks + 'S' + blockSampleName + os.path.sep + blockName + '.tif'
    for blockSampleName, blockName in zip(blockSampleNamesAll_blocks, blockNamesAll_blocks)
]


In [None]:
#Specify sample names to examine
#Defaults to every sample that has listed blocks, so the cell runs on a fresh kernel; swap in the list below to re-check a subset
sampleNames = np.unique(blockSampleNamesAll_blocks).tolist()
#sampleNames = ['10', '12', '13', '14', '15', '21', '3', '32', '36', '38', '40', '42', '43', '45', '48', '49', '52', '53', '55', '56', '59', '6', '63', '66', '69', '8']

#Verify blocks for each sample were derived from the corresponding WSI
#A sample is 'good' if one content-bearing block is found pixel-for-pixel in the WSI, otherwise 'bad'
#Samples whose WSI or blocks cannot be loaded are reported and skipped (placed in neither list)
goodSamples, badSamples = [], []
for sampleName in tqdm(sampleNames, desc='Samples', leave=True, ascii=asciiFlag):
    
    #Load the sample WSI; cv2.imread returns None (rather than raising) when a file cannot be read
    imageWSI = cv2.imread(dir_blocks_inputWSI + sampleName + '.jpg', cv2.IMREAD_UNCHANGED)
    #imageWSI = cv2.imread(dir_blocks_inputWSI + sampleName + '.tif', cv2.IMREAD_UNCHANGED)
    if imageWSI is None:
        print('Error - could not load WSI for sample ' + sampleName)
        continue
    imageWSI = cv2.cvtColor(imageWSI, cv2.COLOR_BGR2RGB)
    
    #If the image is a multi-page .tif
    #ret, imageWSI = cv2.imreadmulti(dir_blocks_inputWSI + sampleName + '.tif', [], cv2.IMREAD_ANYCOLOR)
    #imageWSI = np.stack(imageWSI, -1)
    
    #Isolate blocks for the sample
    blockIndices = np.where(blockSampleNamesAll_blocks == sampleName)[0]
    blockFilenames = np.asarray(blockFilenamesAll_blocks)[blockIndices]

    #Find a block with sufficient content to conduct a search of the WSI
    #Iterating the filenames directly also covers samples with no blocks and unreadable block files
    blockImage = None
    for blockFilename in blockFilenames:
        candidateImage = cv2.imread(blockFilename, cv2.IMREAD_UNCHANGED)
        if candidateImage is None: continue
        candidateImage = cv2.cvtColor(candidateImage, cv2.COLOR_BGR2RGB)
        if np.sum(np.mean(candidateImage, -1)) > threshold:
            blockImage = candidateImage
            break

    #Skip only this sample if no block was found, so the remaining samples are still verified
    if blockImage is None:
        print('Error - no block with content found for sample ' + sampleName)
        continue
    
    #Find the location of the block image (assuming WSI was not resized before blocks were extracted)
    #Use grayscale to prevent OOM with larger samples
    grayWSI, grayBlock = cv2.cvtColor(imageWSI, cv2.COLOR_RGB2GRAY), cv2.cvtColor(blockImage, cv2.COLOR_RGB2GRAY)
    heat_map = cv2.matchTemplate(grayWSI, grayBlock, cv2.TM_CCOEFF_NORMED)
    startRow, startColumn = np.unravel_index(np.argmax(heat_map), heat_map.shape)

    #Assuming that the blocks are non-overlapping and match blockSize expectations, extrapolate crop parameters applied to the WSI before splitting
    #The block grid is anchored on the matched position: left/top are the offset of that grid from the image origin,
    #and right/bottom are the furthest grid lines that still fit inside the image
    #Modular arithmetic keeps the edge rows/columns when the match lies exactly on a block boundary (including row/column 0)
    cropLeft, cropTop = int(startColumn) % blockSize, int(startRow) % blockSize
    cropRight = imageWSI.shape[1] - ((imageWSI.shape[1] - cropLeft) % blockSize)
    cropBottom = imageWSI.shape[0] - ((imageWSI.shape[0] - cropTop) % blockSize)

    #Crop the .jpg WSI, as was done to the original WSI before splitting
    imageWSI = imageWSI[cropTop:cropBottom, cropLeft:cropRight]

    #Split the cropped WSI into blocks and flatten, creating a matched indexed locations list
    #The cropped dimensions are exact multiples of blockSize, so integer division gives the block counts
    numRows, numColumns = imageWSI.shape[0]//blockSize, imageWSI.shape[1]//blockSize
    splitWSI = imageWSI.reshape(numRows, blockSize, numColumns, blockSize, imageWSI.shape[2]).swapaxes(1,2)
    splitWSI = splitWSI.reshape(-1, splitWSI.shape[2], splitWSI.shape[3], splitWSI.shape[4])
    locations = [[rowNum, colNum] for rowNum in range(0, numRows) for colNum in range(0, numColumns)]
    
    #Compare against all blocks from the WSI (.jpg)
    scores = [compare_MSE(blockImage, candidateBlock) for candidateBlock in splitWSI]

    #Get location data for the best match (pixel coordinates are relative to the cropped WSI)
    bestIndex = int(np.argmin(scores))
    location = locations[bestIndex]
    startRow, startColumn = location[0]*blockSize, location[1]*blockSize
    
    #Compute the absolute difference in floating point; subtracting unsigned integer images directly wraps around
    #(e.g. 1-2 becomes 255 for 8-bit data), which would corrupt the displayed difference
    difference = np.abs(splitWSI[bestIndex].astype(np.float64) - blockImage.astype(np.float64))

    #Verify content matches and show images if the located block did not match the original exactly
    if not np.any(difference):
        goodSamples.append(sampleName)
    else:
        badSamples.append(sampleName)
        fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
        ax[0].imshow(blockImage)
        ax[1].imshow(splitWSI[bestIndex])
        ax[2].imshow(np.mean(difference, -1), vmin=0, cmap='gray')
        ax[0].set_title('Original')
        ax[1].set_title('Located')
        ax[2].set_title('Mean Absolute Difference')
        fig.suptitle('Block from sample ' + sampleName + ' did not match')
        plt.tight_layout()
        plt.show()
        #Close this specific figure so figures do not accumulate across samples
        plt.close(fig)

#Report the verification outcome: exact matches first, then a blank line, then the mismatches
print(
    'Samples that matched perfectly',
    goodSamples,
    '',
    'Samples that did not match',
    badSamples,
    sep='\n',
)
