In [1]:
#==================================================================
#Program: patchDeeperExtraction
#Version: 1.0
#Author: David Helminiak
#Date Created: 10 September 2024
#Date Last Modified: 15 April 2025
#Description: Extract best matching .tif patch images for samples from the associated WSI .jpg files using template matching
#Operation: Move back into main program directory before running.
#==================================================================

#Have the notebook fill more of the display width
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:80% !important; }</style>"))

#Items otherwise covered when not running this code in a notebook
#NOTE(review): Only the path string is retained below; the TemporaryDirectory object itself is not kept,
#so the directory may already be removed by the time dir_tmp is used - confirm downstream code (re)creates it
import tempfile
dir_tmp = tempfile.TemporaryDirectory(prefix='TMP_').name
configFileName = './CONFIG_0-TEST'

#Should parallelization calls be used
parallelization = True

#If parallelization is enabled, how many CPU threads should be used? (0 will use any/all available)
#Recommend starting at half of the available system threads if using hyperthreading,
#or 1-2 less than the number of system CPU cores if not using hyperthreading.
#Can adjust down to help manage RAM overhead, but it may have limited impact.
#4 threads with 128 GB RAM doesn't cause OOM, but 8 threads does
#Could leverage shared memory for larger WSI objects, if for some reason continued development is needed in the future.
availableThreads = 4

#Which GPU(s) devices should be used; NA, but need to initialize (default: [])
gpus = []

#RNG seed value to ensure run-to-run consistency (-1 to disable)
manualSeedValue = 0

#Debugging mode
debugMode = True

#TQDM progress bar visualization flag
asciiFlag = False

#Load external libraries
#The source files are executed in this notebook's namespace, so names they define become available to later cells
#Files are opened through context managers so that their handles are closed as soon as the source has been read
with open("./CODE/EXTERNAL.py", encoding='utf-8') as codeFile:
    exec(codeFile.read())
with open("./CODE/COMPUTE.py", encoding='utf-8') as codeFile:
    exec(codeFile.read())

#When splitting WSI images, what size should the resulting patches be (default: 400)
#Should remain consistent with original patch sizes
patchSize = 400
 
    



In [2]:
#Find original patch location in the WSI and extract patch from it
def matchPatch(patchFilenames, imageWSI, grayImageWSI, sampleFolder, sampleVisualsFolder, showProgressBars):
    """Locate each original patch inside a WSI, re-extract it, and flag non-exact matches.

    For every patch file, normalized cross-correlation template matching is run against the 
    grayscale WSI, and the best-scoring location is used to cut a patchSize x patchSize window 
    out of the RGB WSI. The window is written as a .tif into sampleFolder. If the window is 
    not pixel-identical to the original patch, a side-by-side figure (new/original/MAE) is 
    saved into sampleVisualsFolder and the sample is flagged for manual inspection.

    Parameters:
        patchFilenames: Paths of the original patch images to be located.
        imageWSI: RGB image of the WSI that the patches are extracted from.
        grayImageWSI: Grayscale version of imageWSI, used as the search image.
        sampleFolder: Output folder (with trailing path separator) for the extracted patches;
            its final path component is used as the sample name in output filenames.
        sampleVisualsFolder: Output folder (with trailing path separator) for comparison figures.
        showProgressBars: Passed directly to tqdm's `disable` argument, i.e. True hides the 
            progress bar and False shows it (callers in this notebook pass True for pool workers).

    Returns:
        List holding the sample name once for every patch that did not match exactly.

    Raises:
        FileNotFoundError: If a patch image cannot be read.

    Relies on the notebook-level names patchSize and asciiFlag, and on os, np, cv2, tqdm, and plt 
    being available in the executing namespace.
    """
    import warnings
    
    #Derive the sample name from the output folder instead of reading a notebook-level loop variable, 
    #so that the function does not depend on hidden global state when run inside pool workers
    sampleName = os.path.basename(os.path.normpath(sampleFolder))
    
    samplesToCheck = []
    for patchFilename in tqdm(patchFilenames, total=len(patchFilenames), desc='Patches', leave=False, ascii=asciiFlag, disable=showProgressBars): 
        
        #Load the original patch; cv2.imread signals failure by returning None rather than raising
        loadedPatchImage = cv2.imread(patchFilename, cv2.IMREAD_UNCHANGED)
        if loadedPatchImage is None: raise FileNotFoundError('Unable to read patch image: ' + str(patchFilename))
        originalPatchImage = cv2.cvtColor(loadedPatchImage, cv2.COLOR_BGR2RGB)
        
        #Best match is the location with the highest normalized correlation coefficient
        matchMap = cv2.matchTemplate(grayImageWSI, cv2.cvtColor(originalPatchImage, cv2.COLOR_RGB2GRAY), cv2.TM_CCOEFF_NORMED)
        startRow, startColumn = np.unravel_index(np.argmax(matchMap), matchMap.shape)
        patchImage = imageWSI[startRow:startRow+patchSize, startColumn:startColumn+patchSize]
        
        #Store the re-extracted patch, named by sample, original patch index, and location in the WSI
        patchIndex = os.path.basename(patchFilename).split('_')[-1].split('.tif')[0]
        outputBasename = 'PS' + sampleName + '_' + patchIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
        filenameOutput = sampleFolder + outputBasename
        writeSuccess = cv2.imwrite(filenameOutput, cv2.cvtColor(patchImage, cv2.COLOR_RGB2BGR), params=(cv2.IMWRITE_TIFF_COMPRESSION, 1))
        if not writeSuccess: warnings.warn('Failed to write patch image: ' + filenameOutput)
        
        #Images loaded by OpenCV are normally unsigned integer arrays, where a direct subtraction wraps around 
        #instead of going negative; convert to a signed type first so the absolute error is correct
        maeImage = np.mean(np.abs(originalPatchImage.astype(np.int32)-patchImage.astype(np.int32)), -1)
        
        #Any non-zero error means the match was not exact; save a visual for manual review
        if np.sum(maeImage) != 0: 
            samplesToCheck.append(sampleName)
            fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
            ax[0].imshow(patchImage)
            ax[1].imshow(originalPatchImage)
            ax[2].imshow(maeImage)
            ax[0].set_title('New')
            ax[1].set_title('Original')
            ax[2].set_title('MAE')
            fig.suptitle('Sample ' + sampleName)
            plt.tight_layout()
            plt.savefig(sampleVisualsFolder + outputBasename)
            
            #Close this specific figure to release its memory when many visuals are generated
            plt.close(fig)
    return samplesToCheck


In [3]:
#Store directory references; every directory string is kept with a trailing path separator
dir_data = os.path.join('.', 'DATA', '')
dir_patches_data = os.path.join(dir_data, 'PATCHES', '')
dir_patches_inputPatches = os.path.join(dir_patches_data, 'INPUT_PATCHES', '')
dir_patches_inputWSI = os.path.join(dir_patches_data, 'INPUT_WSI', '')
dir_patches_outputPatches = os.path.join(dir_patches_data, 'OUTPUT_PATCHES', '')
dir_patches_outputVisuals = os.path.join(dir_patches_data, 'OUTPUT_patches_VISUALS', '')
file_patches_labels = os.path.join(dir_patches_inputPatches, 'Patch_list.xlsx')

#Create storage locations for new patch files and visuals, discarding anything left from a previous run
for outputDirectory in (dir_patches_outputPatches, dir_patches_outputVisuals):
    if os.path.exists(outputDirectory): 
        shutil.rmtree(outputDirectory)
    os.makedirs(outputDirectory)

#Get list of all the previous patches that are intended for comparison with new extractions
#The spreadsheet is read without a header row; its two columns are taken as strings for the patch name and label
metadata = pd.read_excel(file_patches_labels, header=None, names=['name', 'label'], converters={'name':str,'label':str})
patchNamesAll_patches = np.array(metadata['name'])
patchLabelsAll_patches = np.array(metadata['label'])

#Dataset 1
#patchSampleNamesAll_patches = np.array([re.split('PS|_', patchName)[1] for patchName in patchNamesAll_patches])
#patchFilenamesAll_patches = [dir_patches_inputPatches + 'S' + patchSampleNamesAll_patches[patchIndex] + os.path.sep + patchNamesAll_patches[patchIndex] + '.tif' for patchIndex in range(0, len(patchNamesAll_patches))]

#Dataset 2
#The sample name is the text between the leading 'P' and the first '_' of each patch name
patchSampleNamesAll_patches = np.array([re.split('P|_', patchName)[1] for patchName in patchNamesAll_patches])

#Each patch file is located in a subfolder named after its sample
patchFilenamesAll_patches = [
    dir_patches_inputPatches + patchSampleName + os.path.sep + patchName 
    for patchSampleName, patchName in zip(patchSampleNamesAll_patches, patchNamesAll_patches)
]


In [None]:
#Specify all samples that need to be processed
sampleNames = np.unique(patchSampleNamesAll_patches)

#Create subfolders for the patch images derived from each sample and comparisons between original and new patch images
sampleFolders = np.asarray([dir_patches_outputPatches + sampleName + os.path.sep for sampleName in sampleNames])
for sampleFolder in sampleFolders: os.makedirs(sampleFolder)
sampleVisualsFolders = np.asarray([dir_patches_outputVisuals + sampleName + os.path.sep for sampleName in sampleNames])
for sampleVisualsFolder in sampleVisualsFolders: os.makedirs(sampleVisualsFolder)

#Convert the filename list to an array once, rather than on every loop iteration, to allow selection by sample
patchFilenamesAll_array = np.asarray(patchFilenamesAll_patches)

#Extract patches for each sample
samplesToCheck = []
for sampleIndex, sampleName in tqdm(enumerate(sampleNames), total=len(sampleNames), desc='Samples', leave=True, ascii=asciiFlag):
    
    #Get filenames of the original patches belonging to this sample and setup indices for such
    patchFilenames = patchFilenamesAll_array[np.where(patchSampleNamesAll_patches == sampleName)[0]]
    originalPatchIndices = np.arange(0, len(patchFilenames))    

    #Load the sample WSI; cv2.imread signals failure by returning None rather than raising
    filenameWSI = dir_patches_inputWSI + sampleName + '.jpg'
    loadedWSI = cv2.imread(filenameWSI, cv2.IMREAD_UNCHANGED)
    if loadedWSI is None: raise FileNotFoundError('Unable to read WSI image: ' + filenameWSI)
    imageWSI = cv2.cvtColor(loadedWSI, cv2.COLOR_BGR2RGB)
    grayImageWSI = cv2.cvtColor(imageWSI, cv2.COLOR_RGB2GRAY)
    
    #Find patch in WSI, storing it, and a visual of the original/new patches if they do not match exactly
    if parallelization:
        
        #Split the patches into one group per worker; each worker receives its own copy of the WSI data
        #NOTE(review): numberCPUS and Pool are not defined in this notebook; presumably provided by the executed CODE files - verify
        futures = [(patchFilenames[indices], imageWSI, grayImageWSI, sampleFolders[sampleIndex], sampleVisualsFolders[sampleIndex], True) for indices in np.array_split(originalPatchIndices, numberCPUS)]
        computePool = Pool(numberCPUS)
        
        #Always shut the pool down, even if a worker raised, so that worker processes are not left behind
        try: 
            results = computePool.starmap_async(matchPatch, futures).get()
        finally:
            computePool.close()
            computePool.join()
        
        #Flatten the per-worker lists in plain Python; avoids relying on NumPy dtype promotion 
        #between empty lists (which become float arrays) and lists of strings
        samplesToCheckPartial = [flaggedSample for workerResult in results for flaggedSample in workerResult]
    else: 
        samplesToCheckPartial = matchPatch(patchFilenames, imageWSI, grayImageWSI, sampleFolders[sampleIndex], sampleVisualsFolders[sampleIndex], False)
    samplesToCheck += samplesToCheckPartial

#Print out the samples that need to have their output patches examined by hand
print(np.unique(samplesToCheck))



In [156]:
#Alternative selection criteria (below), using only the central data of the original patches, were used to find better matches for patches with non-exact matches.

#For Dataset 1
#The following 25 (out of 66 total) samples were identified to not have exact matches for at least some of the re-extracted patches:
#samplesToCheck = ['10', '12', '13', '14', '15', '21', '3', '32', '36', '38', '40', '42', '43', '45', '48', '49', '52', '53', '55', '56', '59', '6', '63', '66', '69']
#All newly extracted patches for the identified samples were compared manually and empirically confirmed/rejected as matches
#Visual observation identified the following patches as not having been successfully matched:
#patchIDs = [PS40_703, PS40_759, PS43_6, PS45_119, PS45_133, PS45_137, PS45_140, PS49_135, PS49_386, PS56_220, PS59_167, PS59_186]
#These particular patches all appear to have had freehand cropping applied to them, resulting in bad matches.

#For Dataset 2
#The following 32 (out of 76 total) samples were identified to not have exact matches for at least some of the re-extracted patches:
#samplesToCheck = ['103', '104', '106', '109', '110', '111', '112', '113', '114', '125', '132', '135', '136', '139', '142', '160', '166', '179', '73', '76', '77', '79', '81', '82', '85', '86', '90', '92', '93', '94', '95', '97']
#All newly extracted patches for the identified samples were compared manually and empirically confirmed/rejected as matches
#Visual observation identified the following patches as not having been successfully matched (or were questionable):
#patchIDs = ['P73_88', 'P73_230', 'P73_272', 'P79_57', 'P79_237','P79_240', 'P82_147', 'P90_1', 'P92_113', 'P93_124', 'P94_162', 'P97_50', 'P104_46', 'P104_154', 'P104_158', 'P104_168', 'P106_335', 'P106_384', 'P109_28', 'P109_75', 'P111_175', 'P111_193', 'P111_194', 'P112_16', 'P112_24', 'P112_42', 'P112_51', 'P112_86', 'P112_108', 'P112_132', 'P112_165', 'P112_207', 'P112_225', 'P113_410', 'P125_16', 'P125_32', 'P125_48', 'P125_319', 'P125_359', 'P125_367', 'P135_151', 'P136_415', 'P136_420', 'P139_58', 'P142_94', 'P142_106', 'P142_118', 'P142_182', 'P160_348', 'P160_385', 'P160_466', 'P160_467', 'P160_475', 'P166_1', 'P166_101', 'P166_248', 'P166_249', 'P166_251', 'P166_252', 'P179_210', 'P179_229', 'P179_247', 'P179_279']
#Most of these patches appear to have had freehand cropping applied to them, resulting in bad matches.
#One patch was still unable to be found and was determined manually - PS112_86: (4000, 6400)

import warnings

#Identify which isolated patches were visually observed not to match with their original counterparts
patchIDs = []

#Redefine directory references
#NOTE: This overwrites the output directory variables used by the initial extraction; 
#re-run the directory setup cell before repeating the initial extraction
dir_patches_outputPatches = dir_patches_data + 'OUTPUT_patches_REVISED' + os.path.sep
dir_patches_outputVisuals = dir_patches_data + 'OUTPUT_patches_VISUALS_REVISED' + os.path.sep

#Create storage locations for new patch files and visuals
if os.path.exists(dir_patches_outputPatches): shutil.rmtree(dir_patches_outputPatches)
os.makedirs(dir_patches_outputPatches)
if os.path.exists(dir_patches_outputVisuals): shutil.rmtree(dir_patches_outputVisuals)
os.makedirs(dir_patches_outputVisuals)


#Perform patch searching with revised criteria
samplesToCheck = []

#Name of the sample whose WSI is currently held in memory; avoids reloading the WSI for consecutive patches of the same sample
loadedSampleName = None
for patchID in tqdm(patchIDs, desc='Patches', leave=True, ascii=asciiFlag):
    
    #Load each of the noted patches and their originating WSI
    
    #Dataset 1
    #sampleName, patchIndex = re.split('_|PS', patchID)[1:]
    #patchFilename = dir_patches_inputPatches+'S'+sampleName+os.path.sep+patchID+'.tif'
    
    #Dataset 2
    sampleName, patchIndex = re.split('_|P', patchID)[1:]
    patchFilename = dir_patches_inputPatches+sampleName+os.path.sep+patchID+'.tif'
    
    #cv2.imread signals failure by returning None rather than raising
    loadedPatchImage = cv2.imread(patchFilename)
    if loadedPatchImage is None: raise FileNotFoundError('Unable to read patch image: ' + patchFilename)
    originalPatchImage = cv2.cvtColor(loadedPatchImage, cv2.COLOR_BGR2RGB)
    
    #Only (re)load the WSI when the sample changes
    if sampleName != loadedSampleName:
        filenameWSI = dir_patches_inputWSI + sampleName + '.jpg'
        loadedWSI = cv2.imread(filenameWSI, cv2.IMREAD_UNCHANGED)
        if loadedWSI is None: raise FileNotFoundError('Unable to read WSI image: ' + filenameWSI)
        imageWSI = cv2.cvtColor(loadedWSI, cv2.COLOR_BGR2RGB)
        grayImageWSI = cv2.cvtColor(imageWSI, cv2.COLOR_RGB2GRAY)
        loadedSampleName = sampleName
    
    #Create subfolders for the revised patch images derived from each sample and comparisons between original and new patch images
    sampleFolder = dir_patches_outputPatches + sampleName + os.path.sep
    if not os.path.exists(sampleFolder): os.makedirs(sampleFolder)
    sampleVisualsFolder = dir_patches_outputVisuals + sampleName + os.path.sep
    if not os.path.exists(sampleVisualsFolder): os.makedirs(sampleVisualsFolder)
    
    #Look for just an interior block of the patch data; try and avoid including the freehand cropping in search parameters...
    #The template is the dataPad x dataPad block that starts dataPad pixels in from the top-left corner of the original patch
    #NOTE(review): The template is the green channel, while the search image is the grayscale WSI - confirm this is intended
    dataPad = int(np.ceil(patchSize/4))
    findData = originalPatchImage[dataPad:dataPad+dataPad, dataPad:dataPad+dataPad, 1]
    matchMap = cv2.matchTemplate(grayImageWSI, findData, cv2.TM_CCOEFF_NORMED)
    startRow, startColumn = np.unravel_index(np.argmax(matchMap), matchMap.shape)
    
    #Shift from the template origin back to the patch origin, keeping the full patch window inside the WSI; 
    #a negative start would otherwise be interpreted as an index from the end of the array and yield a wrong or empty patch
    startRow = int(max(0, min(startRow-dataPad, imageWSI.shape[0]-patchSize)))
    startColumn = int(max(0, min(startColumn-dataPad, imageWSI.shape[1]-patchSize)))
    
    #Extract patch data and store updated results to disk
    patchImage = imageWSI[startRow:startRow+patchSize, startColumn:startColumn+patchSize]
    outputBasename = 'PS' + sampleName + '_' + patchIndex + '_' + str(startRow) + '_' + str(startColumn) + '.tif'
    filenameOutput = sampleFolder + outputBasename
    writeSuccess = cv2.imwrite(filenameOutput, cv2.cvtColor(patchImage, cv2.COLOR_RGB2BGR), params=(cv2.IMWRITE_TIFF_COMPRESSION, 1))
    if not writeSuccess: warnings.warn('Failed to write patch image: ' + filenameOutput)
    
    #Images loaded by OpenCV are normally unsigned integer arrays, where a direct subtraction wraps around 
    #instead of going negative; convert to a signed type first so the absolute error is correct
    maeImage = np.mean(np.abs(originalPatchImage.astype(np.int32)-patchImage.astype(np.int32)), -1)
    
    #Any non-zero error means the match was still not exact; save a visual for manual review
    if np.sum(maeImage) != 0: 
        samplesToCheck.append(sampleName)
        fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
        ax[0].imshow(patchImage)
        ax[1].imshow(originalPatchImage)
        ax[2].imshow(maeImage)
        ax[0].set_title('New')
        ax[1].set_title('Original')
        ax[2].set_title('MAE')
        fig.suptitle('Sample ' + sampleName)
        plt.tight_layout()
        plt.savefig(sampleVisualsFolder + outputBasename)
        
        #Close this specific figure to release its memory when many visuals are generated
        plt.close(fig)
    
#Print out the samples that need to have their output patches examined by hand again
print(np.unique(samplesToCheck))

