## CD8+ phenotyping based on nature paper on immune phenotyping:

- Validating if area measurements, cell counts and density calculations match the ones from Marta 
- Plotting cells and annotations masks on CD8 IHC -- relevant for annotation transfer  
- Nature paper: https://www.nature.com/articles/s41374-021-00653-y


In [None]:
import numpy as np 
import os 
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import cv2
import openslide
import math
import matplotlib.pyplot as plt
import time 
import matplotlib.pyplot as plt
from PIL import Image
import math
from sklearn import metrics 



In [None]:
# Input locations of the CD8 quantification data.
# Coordinates of the positive cells.
base_CD8_cells = '/cluster/work/tumorp/share/st_data/usz_share/CD8_quantification/CD8_cells_coordinates'
# Global tumor/inv margin annotations.
base_CD8_regions = '/cluster/work/tumorp/share/st_data/usz_share/CD8_quantification/CD8_region_annotations'
# Annotations of the 5 regions within each global region.
base_CD8_annotations = '/cluster/work/tumorp/share/st_data/usz_share/CD8_quantification/TuPro_classifier_masks'
# Root folder under which the CD8 slide images are searched.
base_CD8_imgs = '/cluster/work/tumorp/data_repository/study/'

# Summary Excel file with the ground-truth (GT) CD8 calculations.
f_GT_CD8 = '/cluster/work/tumorp/share/st_data/usz_share/CD8_quantification/20220209_TuProcohort_CD8.xlsx'

# Sanity check: report whether each of the three annotation folders exists.
print(*[os.path.exists(folder) for folder in (base_CD8_cells, base_CD8_regions, base_CD8_annotations)])


In [None]:
def showimg(k, title, figsize = (15,15)):
    """Display image ``k`` in a new matplotlib figure.

    k: image passed to ``plt.imshow``.
    title: text used as the figure title.
    figsize: figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.imshow(k)
    plt.title(title)
    plt.show()
# ---------------------------------------------------

def get_img_from_wsi(wsi_path, downsample_factor):
    """Read a whole-slide image as an RGB array at a reduced resolution.

    Parameters
    ----------
    wsi_path : path of a slide that OpenSlide can open.
    downsample_factor : requested downsample; the pyramid level returned by
        ``get_best_level_for_downsample`` for this factor is read in full.

    Returns
    -------
    (img_for_plotting, resolution) : the RGB image of the chosen level as a
        numpy array, and the value of the slide's 'openslide.mpp-x' property
        as a float.

    Raises
    ------
    KeyError if the slide has no 'openslide.mpp-x' property.

    NOTE(review): the downsample of the chosen level is whatever the slide
    pyramid offers and is not returned to the caller; callers that scale
    coordinates by ``downsample_factor`` assume the two agree -- confirm for
    the slides in use.
    """
    # The context manager closes the slide handle even if reading fails;
    # the region is fully read into a PIL image before the handle is released.
    with openslide.OpenSlide(wsi_path) as wsi:
        resolution = float(wsi.properties['openslide.mpp-x'])
        level_for_plotting = wsi.get_best_level_for_downsample(downsample_factor)
        img_for_plotting = wsi.read_region((0, 0), level_for_plotting, wsi.level_dimensions[level_for_plotting])

    img_for_plotting = np.array(img_for_plotting.convert('RGB'))
    return img_for_plotting, resolution

# ---------------------------------------------------

In [None]:
# Cell-coordinate CSVs of the tumor-center region; one per sample, used to get
# cell count, area and density.
f_CD8_cells = glob.glob(f'{base_CD8_cells}/*tumor_center.csv')
# Notebook peek: number of files found and the first path.
len(f_CD8_cells), f_CD8_cells[0]

In [None]:
# ---------------------------------------------------
# getting mask of IM and tumor regions 
# ---------------------------------------------------
def get_region_masks(f_wsi_CD8, f_regions, downsample_factor=32):
    """Overlay the annotated "Tumor Center" and "IM" regions on a downsampled slide image.

    Parameters
    ----------
    f_wsi_CD8 : path of the CD8 whole-slide image (read via ``get_img_from_wsi``).
    f_regions : path of the XML file holding ``Annotation`` elements whose
        ``Regions/Region`` children list their outline as ``Vertices/V``
        elements with ``X``/``Y`` attributes.
    downsample_factor : factor used both to pick the image level and to scale
        the vertex coordinates down onto that image.

    Returns
    -------
    The downsampled RGB image blended (0.7 / 0.3) with the colored region mask.
    "Tumor Center" is drawn with (255, 0, 0) and "IM" with (0, 0, 255);
    annotations with any other name are ignored.

    NOTE(review): vertices are divided by ``downsample_factor`` while the image
    comes from the pyramid level OpenSlide considers best for that factor; the
    overlay is only aligned if that level's downsample equals the factor --
    confirm for the slides in use.
    """
    labels_dict = {"Tumor Center": (255, 0, 0), "IM": (0, 0, 255)}

    img_for_plotting, _ = get_img_from_wsi(f_wsi_CD8, downsample_factor)
    mask_annotated = np.zeros(img_for_plotting.shape, np.uint8)

    for annotation in ET.parse(f_regions).findall('Annotation'):
        label = annotation.get('Name')
        if label not in labels_dict:
            continue
        color = labels_dict[label]

        # Regions to keep and regions to cut out are rasterized separately
        # and merged once all regions of this annotation are drawn.
        mask_inclusion = np.zeros(img_for_plotting.shape, np.uint8)
        mask_exclusion = np.zeros(img_for_plotting.shape, np.uint8)

        for region in annotation.findall('Regions/Region'):
            vertices = region.findall('Vertices/V')
            if not vertices:
                # Nothing to draw; an empty contour cannot be rasterized.
                continue

            # Truncate to integer slide coordinates, then scale down so the
            # outline can be drawn on the downsampled image.
            contour = np.asarray(
                [[int(float(v.attrib['X'])), int(float(v.attrib['Y']))] for v in vertices]
            )
            contour = (contour / downsample_factor).astype(np.int32)

            # A region without the NegativeROA attribute is treated as a
            # normal (inclusion) region instead of crashing on int(None).
            is_exclusion = int(region.get('NegativeROA', 0)) == 1
            target_mask = mask_exclusion if is_exclusion else mask_inclusion
            cv2.drawContours(target_mask, [contour], 0, color, -1)

        # Remove every pixel covered by an exclusion region from the label mask.
        mask_inclusion[mask_exclusion.any(axis=-1)] = 0
        mask_annotated = cv2.bitwise_or(mask_annotated, mask_inclusion)

    overlay = cv2.addWeighted(img_for_plotting, 0.7, mask_annotated, 0.3, 0)
    return overlay

# ---------------------------------------------------
# getting mask of 5 regions annotated 
# ---------------------------------------------------
def get_annotation_masks(f_annotation, level=3, mpp=0.3):
    '''
    Measure the tumor and stroma areas in a color-coded annotation mask.

    GT colors:
    tumor: red [255,   0,   0], stroma: green [0,128,0], positive lymphocytes: pink [255,0,255]
    pigment : black [0,0,0], blood and necrosis : red/brown [192,64,0], white space: yellow [255,255,0]

    Parameters
    ----------
    f_annotation : path of the annotation mask (tif file) readable by OpenSlide.
    level : pyramid level that is read and on which pixels are counted.
    mpp : size of a level-0 pixel in um; defaults to the 0.3 um/pixel
        resolution shared by Marta.

    Returns
    -------
    (area_tumor, area_stroma, img) : both areas in um2 and the RGB mask of the
        requested level as a numpy array. The stroma area covers the stroma
        pixels plus the positive-lymphocyte pixels.

    Raises
    ------
    IndexError if the file has no pyramid level ``level``.
    '''
    # Read everything needed inside the context manager so the slide handle
    # is always closed.
    with openslide.OpenSlide(f_annotation) as wsi:  # tif file
        img = wsi.read_region((0, 0), level, wsi.level_dimensions[level])
        level_downsample = wsi.level_downsamples[level]
    img = np.array(img.convert('RGB'))

    # count of pixels for tumor and stroma region for iCD8 and sCD8+ calculations
    n_tumor_pixels = np.count_nonzero((img == [255, 0, 0]).all(axis=2))
    n_stroma_pixels = (
        np.count_nonzero((img == [0, 128, 0]).all(axis=2))
        + np.count_nonzero((img == [255, 0, 255]).all(axis=2))
    )

    # Edge length of one pixel at the requested level, in um.
    resolution_level = level_downsample * mpp

    area_tumor = n_tumor_pixels * (resolution_level ** 2)  # in um2
    area_stroma = n_stroma_pixels * (resolution_level ** 2)  # in um2
    return area_tumor, area_stroma, img


In [None]:
# Ground-truth (GT) CD8 calculations from Marta: read the "Tabelle1" sheet of
# the summary Excel file and keep only the columns used for the validation.
xl = pd.ExcelFile(f_GT_CD8)
df = xl.parse("Tabelle1").loc[:, [
    'Case_ID',
    'Analysis_Region',
    'Revised immune diagnosis',
    'Density Tumor',
    'Density Stroma total',
    'Tumor Area (um²)',
    'Positive Lymphocytes Area (um²)',
    'Stroma Area (um²)',
    'Tumor:_AP_Positive_Cells',
    'Stroma:_AP_Positive_Cells',
    'Positive_Lymphocytes:_AP_Positive_Cells',
]]
df
   

### Validating if area measurements, cell count and density calculation match 
- comparing whether the individual sample calculations match the ones in the global summary excel file 
- Comments from Marta: 
    - the cell counts might be a bit off. Why: the summary excel file was generated, then the HALO software got updated, and only then were the individual sample files generated
- the area measure might also be slightly different as I am using a low resolution image for counting 
- Therefore, if the measures are off by say 5%, it is ok 
- In the end need to see if it has any effect on the immune phenotyping of samples 

In [None]:
def get_cell_count(df, cell_types):
    """Count the rows of ``df`` per requested cell type.

    Returns a dict mapping every entry of ``cell_types`` to the number of rows
    whose 'Classifier Label' equals that entry (0 when no row matches).
    """
    return {
        cell_type: len(df[df['Classifier Label'] == cell_type])
        for cell_type in cell_types
    }

def get_immune_type(iCD8_density, sCD8_density):
    """Map the CD8+ densities of a sample to an immune phenotype.

    Parameters
    ----------
    iCD8_density : density of CD8+ cells in the tumor.
    sCD8_density : density of CD8+ cells in the stroma.

    Returns
    -------
    'desert', 'excluded' or 'inflamed'.

    Raises
    ------
    ValueError if the densities match none of the three rules. The rules
    leave gaps: iCD8 below 0.00004 with sCD8 above 0.0006, sCD8 exactly
    0.002 with iCD8 at or above 0.00004, and NaN inputs. Previously these
    inputs failed with an UnboundLocalError.

    thresholds obtained from nature paper: https://www.nature.com/articles/s41374-021-00653-y
    """
    if iCD8_density < 0.00004 and sCD8_density <= 0.0006:
        return 'desert'
    if iCD8_density >= 0.00004 and sCD8_density < 0.002:
        return 'excluded'
    if iCD8_density >= 0.00004 and sCD8_density > 0.002:
        return 'inflamed'

    # No rule applies: fail with an explicit message rather than guessing a
    # phenotype the thresholds do not define.
    raise ValueError(
        'no immune phenotype defined for iCD8_density=%r, sCD8_density=%r'
        % (iCD8_density, sCD8_density)
    )

In [None]:
def _first_gt_value(df_gt, column):
    """Return the value of ``column`` in the first row of ``df_gt`` (IndexError if ``df_gt`` is empty)."""
    return df_gt[column].to_numpy()[0]


def _within_threshold(value, reference, threshold_pct):
    """Return whether ``value`` deviates from ``reference`` by less than ``threshold_pct`` percent of ``reference``."""
    return (abs(value - reference) / reference * 100) < threshold_pct


# One row per successfully processed sample; the value order must match the
# column list used when the rows are turned into a DataFrame.
df_array = []
cell_types = ['Stroma', 'Tumor', 'Positive Lymphocytes']
plot_annotations = False
error_threshold = 5  # in percent

for f_cells in glob.glob(base_CD8_cells + '/*tumor_center.csv'):
    file_name = f_cells.split('/')[-1]
    sample = file_name.split('-')[0].split('_')[-1]
    region = file_name.split('Data_')[-1].split('.')[0]

    if region != 'tumor_center':  # as cell count only from tumor region relevant
        continue
    print(sample)

    # Any failure for one sample (missing file, no GT row, unclassifiable
    # densities, ...) is reported and the sample is skipped. The file lookups
    # are inside the try so that a missing file no longer aborts the whole loop.
    try:
        f_regions = glob.glob(base_CD8_regions + '/' + sample + '*')[0]
        f_annotation = glob.glob(base_CD8_annotations + '/' + sample + '*_tumor_center.tif')[0]
        f_wsi_CD8 = glob.glob(base_CD8_imgs + '/*/' + sample + '/digpath_zurich/raw/*/' + '*CD8_*.tif')[0]

        # get GT for sample from the summary excel file
        df_sample_GT = df.loc[
            (df['Case_ID'] == sample)
            & df['Analysis_Region'].isin(['Tumor Center', 'Layer 1'])
        ]

        # ---- get region maps ----
        if plot_annotations:
            regions_overlay = get_region_masks(f_wsi_CD8, f_regions, downsample_factor=32)

        # ---- get area of stroma and tumor regions ----
        area_tumor, area_stroma, img_annots_tumor = get_annotation_masks(f_annotation)  # from annotation mask
        area_tumor_gt = _first_gt_value(df_sample_GT, 'Tumor Area (um²)')  # Ground truth (GT) from HALO excel file
        area_stroma_gt = (
            _first_gt_value(df_sample_GT, 'Positive Lymphocytes Area (um²)')
            + _first_gt_value(df_sample_GT, 'Stroma Area (um²)')
        )

        # NOTE: the error_* flags are True when the deviation from the GT is
        # BELOW error_threshold, i.e. True means "matches".
        error_area_tumor = _within_threshold(area_tumor, area_tumor_gt, error_threshold)
        error_area_stroma = _within_threshold(area_stroma, area_stroma_gt, error_threshold)

        # ---- get cell counts ----
        df_cells = pd.read_csv(f_cells)
        df_cells = df_cells[df_cells['AP Positive'] == 1]

        cell_type_count = get_cell_count(df_cells, cell_types)
        iCD8 = cell_type_count['Tumor']
        sCD8 = cell_type_count['Stroma'] + cell_type_count['Positive Lymphocytes']

        iCD8_gt = _first_gt_value(df_sample_GT, 'Tumor:_AP_Positive_Cells')
        sCD8_gt = (
            _first_gt_value(df_sample_GT, 'Stroma:_AP_Positive_Cells')
            + _first_gt_value(df_sample_GT, 'Positive_Lymphocytes:_AP_Positive_Cells')
        )

        error_iCD8 = _within_threshold(iCD8, iCD8_gt, error_threshold)
        error_sCD8 = _within_threshold(sCD8, sCD8_gt, error_threshold)

        # ---- get density of CD8+ cells in stroma and tumor within tumor compartment ----
        iCD8_density = iCD8 / area_tumor
        sCD8_density = sCD8 / area_stroma

        iCD8_density_gt = _first_gt_value(df_sample_GT, 'Density Tumor')
        sCD8_density_gt = _first_gt_value(df_sample_GT, 'Density Stroma total')

        error_iCD8_density = _within_threshold(iCD8_density, iCD8_density_gt, error_threshold)
        error_sCD8_density = _within_threshold(sCD8_density, sCD8_density_gt, error_threshold)

        # ---- Immune phenotyping ----
        immune_type_reported = _first_gt_value(df_sample_GT, 'Revised immune diagnosis')
        immune_type_gt = get_immune_type(iCD8_density_gt, sCD8_density_gt)
        immune_type = get_immune_type(iCD8_density, sCD8_density)

        df_array.append([
            sample, immune_type_reported, immune_type_gt, immune_type,
            iCD8_density_gt, iCD8_density, error_iCD8_density,
            sCD8_density_gt, sCD8_density, error_sCD8_density,
            area_tumor_gt, area_tumor, error_area_tumor,
            area_stroma_gt, area_stroma, error_area_stroma,
            iCD8_gt, iCD8, error_iCD8, sCD8_gt, sCD8, error_sCD8,
        ])

        if plot_annotations:  # if want to visualise the annotation masks
            f, ax = plt.subplots(1, 2, figsize=(40, 20))
            ax[0].imshow(regions_overlay)
            ax[0].set_title(sample + ' regions', fontsize=30)
            ax[1].imshow(img_annots_tumor)
            ax[1].set_title(sample + ' annotation Tumor', fontsize=30)
            plt.show()

    except Exception as e:
        # Best effort: report why this sample was skipped and continue.
        print(f'{type(e).__name__}: {e}')


In [None]:
# Column names in the same order as the values collected per sample in df_array.
final_columns = [
    'sample', 'immune_type_reported', 'immune_type_gt', 'immune_type',
    'iCD8_density_gt', 'iCD8_density', 'error_iCD8_density',
    'sCD8_density_gt', 'sCD8_density', 'error_sCD8_density',
    'area_tumor_gt', 'area_tumor', 'error_area_tumor',
    'area_stroma_gt', 'area_stroma', 'error_area_stroma',
    'iCD8_gt', 'iCD8', 'error_iCD8',
    'sCD8_gt', 'sCD8', 'error_sCD8',
]
df_final = pd.DataFrame(data=df_array, columns=final_columns)

df_final

In [None]:
# Persist the per-sample comparison table as a tab-separated file without the index.
df_final.to_csv('df_final.csv', index=False, encoding='utf-8', sep='\t')

In [None]:
# Reload the saved tab-separated comparison table.
df_final = pd.read_csv("df_final.csv", delimiter='\t')
df_final

In [None]:
# Compare the phenotype derived from the GT densities (immune_type_gt) with the
# one derived from the per-sample calculations (immune_type).
# If they match, the immune phenotype status is the same for both, i.e. the HALO
# version and the downsampled annotation mask have no impact on the immune phenotype.
gt_phenotype, sample_phenotype = df_final['immune_type_gt'], df_final['immune_type']
matched = len(df_final[gt_phenotype == sample_phenotype])
un_matched = len(df_final[gt_phenotype != sample_phenotype])
matched, un_matched



In [None]:
# Check how often the digitally derived immune phenotype matches immune_type_reported.
# accuracy is 64.2 % -- seems low -- excluded cases get confused with inflamed -- mailed Marta and Viktor to ask if GT is correct
# response from Viktor:
    # Important to note that these are two different methods - the immune phenotype reported is the "in study" reported phenotype by light-microscopic pathologist assessment.
    # This is a moderately reproducible and semi-quantitative approach, which was a core motivation for the development of strictly quantitative, digital assessment methods.
    # immune_type_CD8+ is the digital immune phenotype and should serve as GT for comparison with Multi-V-stain.

reported_phenotype = df_final['immune_type_reported']
matched = df_final[reported_phenotype == df_final['immune_type']].shape[0]
un_matched = df_final[reported_phenotype != df_final['immune_type']].shape[0]
n_samples = matched + un_matched
# An empty table has no defined accuracy; avoid a ZeroDivisionError.
accuracy = matched / n_samples * 100 if n_samples else float('nan')

labels = ['excluded', 'inflamed', 'desert']
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
# NOTE(review): the accuracy compares against 'immune_type' while the confusion
# matrix compares against 'immune_type_gt' -- confirm this is intended.
confusion_matrix = metrics.confusion_matrix(df_final['immune_type_reported'], df_final['immune_type_gt'], labels=labels)

matched, un_matched, accuracy, confusion_matrix



### Plotting cells and annotations masks on CD8 IHC -- relevant for annotation transfer  
- for annotation transfer, will have to first align the images, obtain the transformation matrix and then apply it to the cell coordinates to get location of CD8+ cells in H&E image 
- Note: the 5 regions annotated inside the tumor compartment in "img_annots_tumor" would need additional x and y shift in the transformation matrix  


In [None]:
downsample_factor = 32

# cell annotations: color used to draw the CD8+ cells of each classifier label
celltype_colors = {'Stroma':(0,255,0), 'Tumor':(0,0,255), 'Positive Lymphocytes':(255,0,0)}
radius = 8  # radius in pixels of the dot drawn per cell on the downsampled image

for f_sample_cells in glob.glob(base_CD8_cells + '/*tumor_center.csv'):
    # Same sample-name parsing as in the validation loop, so the same
    # region/annotation/slide files are found for a given CSV.
    sample = f_sample_cells.split('/')[-1].split('-')[0].split('_')[-1]
    f_wsi_CD8 = glob.glob(base_CD8_imgs + '/*/' + sample + '/digpath_zurich/raw/*/' + '*CD8_*.tif')[0]
    f_regions = glob.glob(base_CD8_regions + '/' + sample + '*')[0]
    f_annotation = glob.glob(base_CD8_annotations + '/' + sample + '*_tumor_center.tif')[0]

    # Use a dedicated name: the global `df` holds the GT summary table and
    # must not be overwritten by the per-sample cell table.
    df_sample_cells = pd.read_csv(f_sample_cells)
    df_cells = df_sample_cells[['XMin', 'XMax', 'YMin', 'YMax', 'Classifier Label', 'AP Positive']]
    df_cells = df_cells[df_cells['AP Positive'] == 1].reset_index()

    # ---- plotting regions from region annotations file ----
    regions_overlay = get_region_masks(f_wsi_CD8, f_regions, downsample_factor=downsample_factor)
    _, _, img_annots_tumor = get_annotation_masks(f_annotation)  # from annotation mask
    print(sample)

    fig_regions, (ax_regions, ax_annots) = plt.subplots(1, 2, figsize=(6, 4))
    ax_regions.imshow(regions_overlay)
    ax_regions.set_title(sample + ' regions')
    ax_annots.imshow(img_annots_tumor)
    ax_annots.set_title(sample + ' annotation Tumor')
    plt.show()
    plt.close(fig_regions)

    # ---- plotting in positive cells on low resolution CD8 img ----
    df_cells_downsample = df_cells.copy()
    df_cells_downsample[['XMin', 'XMax', 'YMin', 'YMax']] = df_cells[['XMin', 'XMax', 'YMin', 'YMax']].div(downsample_factor).astype('int32')

    # just plotting xmin, ymin as rectangle mostly a dot in low resolution image
    img_for_plotting = get_img_from_wsi(f_wsi_CD8, downsample_factor)[0]

    # One panel for the plain image plus one panel per cell type.
    fig_cells, axes = plt.subplots(1, 1 + len(celltype_colors), figsize=(12, 8))
    axes[0].imshow(img_for_plotting)
    axes[0].set_title('CD8 img at downsample ' + str(downsample_factor), fontsize=9)

    for ax, (cell_type, color) in zip(axes[1:], celltype_colors.items()):
        is_cell_type = df_cells_downsample['Classifier Label'] == cell_type
        coords = df_cells_downsample.loc[is_cell_type, ['XMin', 'YMin']].to_numpy()
        print('CD8+ cells in ', cell_type, len(coords))

        mask_cells = img_for_plotting.copy()
        for x, y in coords:
            # Plain Python ints: not every OpenCV build accepts numpy scalars as a point.
            cv2.circle(mask_cells, (int(x), int(y)), radius, color, -1)

        ax.imshow(mask_cells)
        ax.set_title('CD8+ cells in ' + cell_type, fontsize=9)

    plt.show()
    plt.close(fig_cells)

    