In [None]:
#==================================================================
#Program: blockDeeperExtraction
#Version: 1.0
#Author: David Helminiak
#Date Created: September 10, 2024
#Date Last Modified: September 11, 2024
#Description: Extract best matching .tif block images for samples from the associated WSI .jpg files using template matching
#Operation: Move back into main program directory before running.
#==================================================================

#Have the notebook fill more of the display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:80% !important; }</style>"))

#Items otherwise covered when not running this code in a notebook
#NOTE(review): The TemporaryDirectory object is not retained here, so (at least under CPython) the directory may be
#removed again as soon as the object is finalized, leaving dir_tmp as just a unique path name.
#Confirm whether the code loaded below expects the directory to already exist before changing this.
import tempfile
dir_tmp = tempfile.TemporaryDirectory(prefix='TMP_').name
configFileName = './CONFIG_0-TEST'

#Should parallelization calls be used
parallelization = True

#If parallelization is enabled, how many CPU threads should be used? (0 will use any/all available)
#Recommend starting at half of the available system threads if using hyperthreading,
#or 1-2 less than the number of system CPU cores if not using hyperthreading.
#Can adjust down to help manage RAM overhead, but it may have limited impact.
#4 threads with 128 GB RAM doesn't cause OOM, but 8 threads does
#Could leverage shared memory for larger WSI objects, if for some reason continued development is needed in the future.
availableThreads = 4

#Which GPU(s) devices should be used; NA, but need to initialize (default: [])
gpus = []

#RNG seed value to ensure run-to-run consistency (-1 to disable)
manualSeedValue = 0

#Debugging mode
debugMode = True

#TQDM progress bar visualization flag
asciiFlag = False

#Load external libraries; each file is executed in this notebook's namespace, in the listed order
#These presumably provide the imports (os, cv2, np, pd, tqdm, ...) relied on by later cells - TODO confirm
#Read each file inside a context manager so that its handle is closed rather than left to the garbage collector
for codeFilename in ("./CODE/EXTERNAL.py", "./CODE/COMPUTE.py"):
    with open(codeFilename, encoding='utf-8') as codeFile: codeSource = codeFile.read()
    exec(codeSource)

#When splitting WSI images, what size should the resulting blocks be (default: 400)
#Should remain consistent with original block sizes
blockSize = 400
 
    

In [None]:
#Find original block location in the WSI and extract block from it
def matchBlock(blockFilenames, imageWSI, grayImageWSI, sampleFolder, sampleVisualsFolder, showProgressBars):
    """Locate each original block inside its WSI by template matching and re-extract it from the WSI.

    For every block file, the best match position (maximum of the cv2.TM_CCOEFF_NORMED match map) is taken as
    the top-left corner of a blockSize x blockSize region, which is cut from imageWSI and written as an
    uncompressed .tif into sampleFolder. When the re-extracted block is not pixel-identical to the original,
    a 3-panel comparison figure (new / original / MAE) is saved into sampleVisualsFolder.

    Relies on the notebook-level globals blockSize and asciiFlag.

    Args:
        blockFilenames: Paths of the original block images; basenames are expected to look like
            'PS<sample>_<index>.tif', as the sample name and block index are parsed from them.
        imageWSI: RGB image of the sample's WSI that blocks are extracted from.
        grayImageWSI: Grayscale version of imageWSI that the template search is run on.
        sampleFolder: Output folder (with trailing separator) for the re-extracted blocks.
        sampleVisualsFolder: Output folder (with trailing separator) for the comparison figures.
        showProgressBars: Passed directly to tqdm's `disable` argument, i.e. despite the name,
            True hides the progress bar and False shows it.

    Returns:
        List holding the sample name once for every block whose re-extraction did not exactly match the original.

    Raises:
        FileNotFoundError: If a block image cannot be read from disk.
    """
    samplesToCheck = []
    for blockFilename in tqdm(blockFilenames, total=len(blockFilenames), desc='Blocks', leave=False, ascii=asciiFlag, disable=showProgressBars):

        #Derive the sample name and block index from the filename instead of depending on a notebook global;
        #uses the same 'PS<sample>_<index>' parsing applied when building the list of block filenames
        blockName = os.path.basename(blockFilename)
        sampleName = re.split('PS|_', blockName)[1]
        blockIndex = blockName.split('_')[-1].split('.tif')[0]

        #Load the original block; cv2.imread returns None (rather than raising) when a file cannot be read
        rawBlockImage = cv2.imread(blockFilename, cv2.IMREAD_UNCHANGED)
        if rawBlockImage is None: raise FileNotFoundError('Unable to read block image: ' + str(blockFilename))
        originalBlockImage = cv2.cvtColor(rawBlockImage, cv2.COLOR_BGR2RGB)

        #Position of the maximum in the match map is the top-left corner (row, column) of the best match
        matchMap = cv2.matchTemplate(grayImageWSI, cv2.cvtColor(originalBlockImage, cv2.COLOR_RGB2GRAY), cv2.TM_CCOEFF_NORMED)
        startRow, startColumn = np.unravel_index(np.argmax(matchMap), matchMap.shape)

        #Extract the block from the WSI and store it without compression
        blockImage = imageWSI[startRow:startRow+blockSize, startColumn:startColumn+blockSize]
        filenameOutput = sampleFolder + 'PS' + sampleName + '_' + blockIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
        writeSuccess = cv2.imwrite(filenameOutput, cv2.cvtColor(blockImage, cv2.COLOR_RGB2BGR), params=(cv2.IMWRITE_TIFF_COMPRESSION, 1))
        if not writeSuccess: print('Warning - Failed to write block image: ' + filenameOutput)

        #Cast before subtracting; differences of unsigned integer images wrap around (e.g. 5-10 -> 251 for uint8),
        #which would corrupt the per-pixel mean absolute error
        maeImage = np.mean(np.abs(originalBlockImage.astype(np.float64)-blockImage.astype(np.float64)), -1)

        #Any non-zero error means the extraction is not an exact match and should be inspected by hand
        if np.sum(maeImage) != 0:
            samplesToCheck.append(sampleName)
            fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
            ax[0].imshow(blockImage)
            ax[1].imshow(originalBlockImage)
            ax[2].imshow(maeImage)
            ax[0].set_title('New')
            ax[1].set_title('Original')
            ax[2].set_title('MAE')
            fig.suptitle('Sample ' + sampleName)
            plt.tight_layout()
            filenameVisual = sampleVisualsFolder + 'PS' + sampleName + '_' + blockIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
            plt.savefig(filenameVisual)
            plt.close(fig)
    return samplesToCheck


In [None]:
#Store directory references (every directory string carries a trailing path separator)
dir_data = f'.{os.path.sep}DATA{os.path.sep}'
dir_blocks_data = f'{dir_data}BLOCKS{os.path.sep}'
dir_blocks_inputBlocks = f'{dir_blocks_data}INPUT_BLOCKS{os.path.sep}'
dir_blocks_inputWSI = f'{dir_blocks_data}INPUT_WSI{os.path.sep}'
dir_blocks_outputBlocks = f'{dir_blocks_data}OUTPUT_BLOCKS{os.path.sep}'
dir_blocks_outputVisuals = f'{dir_blocks_data}OUTPUT_BLOCKS_VISUALS{os.path.sep}'
file_blocks_labels = f'{dir_blocks_inputBlocks}Patch_list.xlsx'

#Create storage locations for new block files and visuals; any results from a prior run are discarded first
for dir_output in (dir_blocks_outputBlocks, dir_blocks_outputVisuals):
    if os.path.exists(dir_output): shutil.rmtree(dir_output)
    os.makedirs(dir_output)

#Get list of all the previous blocks that are intended for comparison with new extractions
#The spreadsheet has no header row; its two columns are read as strings and named 'name' and 'label'
metadata = pd.read_excel(
    file_blocks_labels,
    header=None,
    names=['name', 'label'],
    converters={'name':str,'label':str},
)
blockNamesAll_blocks = np.array(metadata['name'])
blockLabelsAll_blocks = np.array(metadata['label'])

#Sample name is the token between the 'PS' prefix and the first underscore of each block name
blockSampleNamesAll_blocks = np.array([re.split('PS|_', blockName)[1] for blockName in blockNamesAll_blocks])

#Original block files are stored as INPUT_BLOCKS/S<sample>/<block name>.tif
blockFilenamesAll_blocks = [
    dir_blocks_inputBlocks + 'S' + blockSample + os.path.sep + blockName + '.tif'
    for blockSample, blockName in zip(blockSampleNamesAll_blocks, blockNamesAll_blocks)
]


In [None]:
#Specify all samples that need to be processed
#This assignment must be active; otherwise sampleNames is undefined when the notebook is run from a fresh kernel
sampleNames = np.unique(blockSampleNamesAll_blocks)

#Create subfolders for the block images derived from each sample and comparisons between original and new block images
#exist_ok allows this cell to be re-run without first re-creating the parent output directories
sampleFolders = np.asarray([dir_blocks_outputBlocks + sampleName + os.path.sep for sampleName in sampleNames])
for sampleFolder in sampleFolders: os.makedirs(sampleFolder, exist_ok=True)
sampleVisualsFolders = np.asarray([dir_blocks_outputVisuals + sampleName + os.path.sep for sampleName in sampleNames])
for sampleVisualsFolder in sampleVisualsFolders: os.makedirs(sampleVisualsFolder, exist_ok=True)

#Convert the filename list once, rather than on every loop iteration
blockFilenamesArray = np.asarray(blockFilenamesAll_blocks)

#Extract blocks for each sample
#NOTE: Keep the loop variable named sampleName; it lives in the notebook's global namespace, where other cells/functions can read it
samplesToCheck = []
for sampleIndex, sampleName in tqdm(enumerate(sampleNames), total=len(sampleNames), desc='Samples', leave=True, ascii=asciiFlag):
    
    #Get filenames of the original blocks belonging to this sample, and setup indices for such
    blockFilenames = blockFilenamesArray[np.where(blockSampleNamesAll_blocks == sampleName)[0]]
    originalBlockIndices = np.arange(0, len(blockFilenames))
    
    #Load the sample WSI; cv2.imread returns None (rather than raising) when a file cannot be read
    filenameWSI = dir_blocks_inputWSI + sampleName + '.jpg'
    rawImageWSI = cv2.imread(filenameWSI, cv2.IMREAD_UNCHANGED)
    if rawImageWSI is None: raise FileNotFoundError('Unable to read WSI image: ' + filenameWSI)
    imageWSI = cv2.cvtColor(rawImageWSI, cv2.COLOR_BGR2RGB)
    grayImageWSI = cv2.cvtColor(imageWSI, cv2.COLOR_RGB2GRAY)
    
    #Find block in WSI, storing it, and a visual of the original/new blocks if they do not match exactly
    if parallelization:
        
        #Split the sample's blocks into numberCPUS chunks; one matchBlock call per chunk, with progress bars disabled (True)
        futures = [(blockFilenames[indices], imageWSI, grayImageWSI, sampleFolders[sampleIndex], sampleVisualsFolders[sampleIndex], True) for indices in np.array_split(originalBlockIndices, numberCPUS)]
        computePool = Pool(numberCPUS)
        try:
            results = computePool.starmap_async(matchBlock, futures)
            computePool.close()
            computePool.join()
            
            #Flatten the per-chunk lists directly; avoids building a NumPy array out of empty and string-valued lists
            samplesToCheckPartial = [name for partialResult in results.get() for name in partialResult]
        finally:
            
            #Ensure worker processes are released, even if the run is interrupted or a worker raised
            computePool.terminate()
    else: 
        samplesToCheckPartial = matchBlock(blockFilenames, imageWSI, grayImageWSI, sampleFolders[sampleIndex], sampleVisualsFolders[sampleIndex], False)
    samplesToCheck += samplesToCheckPartial

#Print out the samples that need to have their output blocks examined by hand
print(np.unique(samplesToCheck))


In [None]:
#The following 25 (out of 66 total) samples were identified to not have exact matches for at least some of the re-extracted blocks:
#samplesToCheck = ['10', '12', '13', '14', '15', '21', '3', '32', '36', '38', '40', '42', '43', '45', '48', '49', '52', '53', '55', '56', '59', '6', '63', '66', '69']
#All newly extracted blocks for the identified samples were compared manually and empirically confirmed/rejected as matches
#Visual observation identified the following blocks as not having been successfully matched:
#PS40_703, PS40_759, PS43_6, PS45_119, PS45_133, PS45_137, PS45_140, PS49_135, PS49_386, PS56_220, PS59_167, PS59_186
#These particular blocks all appear to have had freehand cropping applied to them, resulting in bad matches.
#Alternative selection criteria (below), using only the central data of the original blocks, was used to find better matches. 

#Identify which isolated blocks were visually observed not to match with their original counterparts
blockIDs = [
    'PS40_703', 
    'PS40_759', 
    'PS43_6', 
    'PS45_119', 
    'PS45_133', 
    'PS45_137', 
    'PS45_140', 
    'PS49_135', 
    'PS49_386', 
    'PS56_220', 
    'PS59_167', 
    'PS59_186', 
]

#Redefine directory references
dir_blocks_outputBlocks = dir_blocks_data + 'OUTPUT_BLOCKS_REVISED' + os.path.sep
dir_blocks_outputVisuals = dir_blocks_data + 'OUTPUT_BLOCKS_VISUALS_REVISED' + os.path.sep

#Create storage locations for new block files and visuals
if os.path.exists(dir_blocks_outputBlocks): shutil.rmtree(dir_blocks_outputBlocks)
os.makedirs(dir_blocks_outputBlocks)
if os.path.exists(dir_blocks_outputVisuals): shutil.rmtree(dir_blocks_outputVisuals)
os.makedirs(dir_blocks_outputVisuals)

#Perform block searching with revised criteria
samplesToCheck = []
for blockID in tqdm(blockIDs, desc='Blocks', leave=True, ascii=asciiFlag):
    
    #Load each of the noted blocks and their originating WSI
    #cv2.imread returns None (rather than raising) when a file cannot be read, so check explicitly
    sampleName, blockIndex = re.split('_|PS', blockID)[1:]
    blockFilename = dir_blocks_inputBlocks+'S'+sampleName+os.path.sep+blockID+'.tif'
    rawBlockImage = cv2.imread(blockFilename)
    if rawBlockImage is None: raise FileNotFoundError('Unable to read block image: ' + blockFilename)
    originalBlockImage = cv2.cvtColor(rawBlockImage, cv2.COLOR_BGR2RGB)
    filenameWSI = dir_blocks_inputWSI + sampleName + '.jpg'
    rawImageWSI = cv2.imread(filenameWSI, cv2.IMREAD_UNCHANGED)
    if rawImageWSI is None: raise FileNotFoundError('Unable to read WSI image: ' + filenameWSI)
    imageWSI = cv2.cvtColor(rawImageWSI, cv2.COLOR_BGR2RGB)
    grayImageWSI = cv2.cvtColor(imageWSI, cv2.COLOR_RGB2GRAY)
    
    #Create subfolders for the revised block images derived from each sample and comparisons between original and new block images
    sampleFolder = dir_blocks_outputBlocks + sampleName + os.path.sep
    os.makedirs(sampleFolder, exist_ok=True)
    sampleVisualsFolder = dir_blocks_outputVisuals + sampleName + os.path.sep
    os.makedirs(sampleVisualsFolder, exist_ok=True)
    
    #Look for just an interior patch of the block; try and avoid including the freehand cropping in search parameters...
    #The template is the dataPad x dataPad region spanning rows/columns dataPad to 2*dataPad of the original block
    dataPad = int(np.ceil(blockSize/4))
    findData = cv2.cvtColor(originalBlockImage[dataPad:dataPad+dataPad, dataPad:dataPad+dataPad], cv2.COLOR_RGB2GRAY)
    matchMap = cv2.matchTemplate(grayImageWSI, findData, cv2.TM_CCOEFF_NORMED)
    matchRow, matchColumn = np.unravel_index(np.argmax(matchMap), matchMap.shape)
    
    #Shift from the patch's corner back to the block's corner, keeping the full block inside the WSI;
    #a negative or overly large start would otherwise produce an empty/truncated slice
    startRow = int(min(max(matchRow-dataPad, 0), max(imageWSI.shape[0]-blockSize, 0)))
    startColumn = int(min(max(matchColumn-dataPad, 0), max(imageWSI.shape[1]-blockSize, 0)))
    
    #Extract block data and store updated results to disk
    blockImage = imageWSI[startRow:startRow+blockSize, startColumn:startColumn+blockSize]
    filenameOutput = sampleFolder + 'PS' + sampleName + '_' + blockIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
    writeSuccess = cv2.imwrite(filenameOutput, cv2.cvtColor(blockImage, cv2.COLOR_RGB2BGR), params=(cv2.IMWRITE_TIFF_COMPRESSION, 1))
    if not writeSuccess: print('Warning - Failed to write block image: ' + filenameOutput)
    
    #Cast before subtracting; differences of unsigned integer images wrap around (e.g. 5-10 -> 251 for uint8),
    #which would corrupt the per-pixel mean absolute error
    maeImage = np.mean(np.abs(originalBlockImage.astype(np.float64)-blockImage.astype(np.float64)), -1)
    
    #Any non-zero error means the extraction is not an exact match and should be inspected by hand
    if np.sum(maeImage) != 0: 
        samplesToCheck.append(sampleName)
        fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
        ax[0].imshow(blockImage)
        ax[1].imshow(originalBlockImage)
        ax[2].imshow(maeImage)
        ax[0].set_title('New')
        ax[1].set_title('Original')
        ax[2].set_title('MAE')
        fig.suptitle('Sample ' + sampleName)
        plt.tight_layout()
        filenameVisual = sampleVisualsFolder + 'PS' + sampleName + '_' + blockIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
        plt.savefig(filenameVisual)
        plt.close(fig)
    
#Print out the samples that need to have their output blocks examined by hand again
print(np.unique(samplesToCheck))
