# Making ST patches

In [80]:
import matplotlib.pyplot as plt
import matplotlib
import skimage
import os
from skimage import io
from skimage.color import rgb2gray
from PIL import Image
import pandas as pd
from skimage import data, color
from skimage.transform import rescale, resize, downscale_local_mean
import numpy as np
import csv
import glob
from itertools import chain
# Default font size (in points) for all matplotlib text
matplotlib.rcParams.update({'font.size': 12})

# Turn off Pillow's decompression-bomb pixel limit so that very large
# images are not rejected when they are opened
Image.MAX_IMAGE_PIXELS = None


In [3]:
# Directories holding the H&E images and the matching spot annotations
he_path = '/home/sanjavickovic/data/st_data/images'
ann_path = '/home/sanjavickovic/data/st_data/annotations'

# Names of every file currently present in each directory
he_filenames, ann_filenames = (os.listdir(folder) for folder in (he_path, ann_path))


In [4]:
# Load the sample sheet and keep only the mouse colon sections
metadata = pd.read_excel('/home/sanjavickovic/data/st_data/Metadata_ST.xlsx')

# preprocess colons
is_colon = metadata['Tissue Type'].isin(['Colon.Normal'])
is_mouse = metadata['Organism'].isin(['Mouse'])
# .copy() gives an independent frame, so adding the 'Filename' column below
# does not write into a slice of the full sheet (SettingWithCopyWarning)
metadata = metadata[is_colon & is_mouse].copy()

# preprocess bacs
#metadata = metadata[metadata['Genotype'].isin(['Bacs.Distal']) & metadata['Organism'].isin(['Mouse'])]

# Build the file-name stem "<Array Batch>CN<Chip number>_<Well Position>"
# that the image and annotation files are matched against
metadata['Filename'] = [
    str(batch) + 'CN' + str(chip) + '_' + str(well)
    for batch, chip, well in zip(
        metadata['Array Batch'],
        metadata['Chip number'],
        metadata['ArrayIT Well Position'],
    )
]
metadata = metadata[['Age', 'Sex', 'Genotype', 'Specimen #ID', 'Filename']]

# select if needed otherwise comment
#metadata =  metadata[metadata['Filename'].isin(['10015CN80_C1','10015CN80_C2' ])]

In [5]:
metadata

Unnamed: 0,Age,Sex,Genotype,Specimen #ID,Filename
67,6w,F,BL6WT.Proximal,M1,L9CN12_C1
68,6w,F,BL6WT.Middle,M1,L9CN12_D1
69,6w,F,BL6WT.Distal,M1,L9CN12_E1
70,6w,F,BL6WT.Proximal,M1,L9CN12_C2
71,6w,F,BL6WT.Middle,M1,L9CN12_D2
...,...,...,...,...,...
579,6w,M,Bacs.Distal,M1 6w M (2) 10/05,10015CN67_D1
580,6w,M,Bacs.Distal,M1 6w M (2) 10/05,10015CN67_E1
581,6w,M,Bacs.Distal,M1 6w M (2) 10/05,10015CN67_C2
582,6w,M,Bacs.Distal,M1 6w M (2) 10/05,10015CN67_D2


In [6]:
# Keep only the image / annotation files that belong to the selected samples.
# The expected names are built once as sets, so each directory entry is
# checked with a constant-time lookup instead of rebuilding a list per entry.
sample_ids = metadata['Filename'].tolist()
expected_he = {sample_id + '_HE.jpg' for sample_id in sample_ids}
expected_ann = {sample_id + '_annotations.txt' for sample_id in sample_ids}

he_files = [name for name in he_filenames if name in expected_he]
ann_files = [name for name in ann_filenames if name in expected_ann]

In [69]:
def preprocess_st_images(he_path, ann_path, train_path, he_file):
    """Cut one H&E image into per-spot patches and save them as JPEG files.

    The annotation table ``<sample>_annotations.txt`` (tab-separated, read
    from ``ann_path``) must provide the columns ``x``, ``y``, ``x_y``,
    ``image`` and ``value``, one row per array spot. Spots labelled
    ``Unknown`` and spots on the outer frame of the array are dropped. For
    every remaining spot a rectangular patch centred on the spot is cropped
    from the image and written to ``train_path`` as ``<image>_<x_y>.jpg``,
    unless a file with that name already exists there.

    Parameters
    ----------
    he_path : str
        Directory containing the H&E images.
    ann_path : str
        Directory containing the annotation files.
    train_path : str
        Output directory for the patches of this image. It is created
        (including missing parents) if needed, and the process working
        directory is changed to it, as in the original implementation.
    he_file : str
        Image file name ending in ``_HE.jpg``. It is joined onto
        ``he_path``; if it is an absolute path, ``os.path.join`` uses it
        as is.

    Returns
    -------
    list of tuple
        ``(patch_id, annotation)`` for every retained spot, where
        ``patch_id`` is ``<image>_<x_y>``. Spots whose patch file already
        existed are included as well.
    """
    print("Processing path: ", he_file)

    # Array geometry: the frame ids ('33_*', '*_35') and the divisors 32/34
    # used by the original code correspond to a 33 x 35 spot grid.
    n_spots_x, n_spots_y = 33, 35

    # Read the image once; the same array is used for geometry and cropping.
    filename = os.path.join(he_path, he_file)
    image = plt.imread(filename)
    xdim, ydim, _ = image.shape  # xdim = rows (shape[0]), ydim = columns (shape[1])

    # Annotation file belonging to this image (same sample prefix).
    sample_id = os.path.basename(he_file).split("_HE.jpg")[0]
    ann = pd.read_csv(os.path.join(ann_path, sample_id + '_annotations.txt'), sep="\t")

    # Pixel distance between neighbouring spots along each image axis.
    x_step = ydim / (n_spots_x - 1)
    y_step = xdim / (n_spots_y - 1)

    # Transform 1-based array coordinates into pixel coordinates.
    spots = ann.assign(
        new_x=(ann['x'] - 1) * x_step,
        new_y=(ann['y'] - 1) * y_step,
    )

    # Drop spots without a usable label. The explicit copy lets the label
    # column be rewritten below without touching a slice of another frame.
    spots = spots[~spots['value'].isin(['Unknown'])].copy()

    # Rename regions to make sure they survive tf handling.
    spots['value'] = (
        spots['value']
        .str.lower()
        .str.replace(" - ", " ", regex=False)
        .str.replace(";", " and ", regex=False)
    )

    # Remove the frame spots: their patches would be cut off by the image
    # border. All four edges are covered, including the corner '33_35',
    # which the previous range-based construction left out.
    frame = set()
    for x in range(1, n_spots_x + 1):
        frame.add(str(x) + '_1')
        frame.add(str(x) + '_' + str(n_spots_y))
    for y in range(1, n_spots_y + 1):
        frame.add('1_' + str(y))
        frame.add(str(n_spots_x) + '_' + str(y))
    spots = spots[~spots['x_y'].isin(frame)]

    # Prepare the output directory. The absolute path is resolved before
    # changing directory so that a relative train_path keeps working.
    out_dir = os.path.abspath(train_path)
    os.makedirs(out_dir, exist_ok=True)
    os.chdir(out_dir)  # kept for backward compatibility with the original
    print(train_path)

    # Optional sanity check: overlay the spots on the tissue image.
    # fig, ax = plt.subplots(figsize=(25, 25))
    # ax.imshow(image, interpolation='none', alpha=1, cmap='gray')
    # ax.scatter(spots['new_x'], spots['new_y'], alpha=0.2, s=xdim / 15)
    # ax.set_aspect('equal')

    # Half extents of a patch in pixels, using the original expressions:
    # about 100 px horizontally, and 120 px scaled by columns/rows vertically.
    half_width = xdim / (image.shape[0] / 100)
    half_height = ydim / (image.shape[0] / 120)

    # List the directory once instead of once per spot.
    existing = set(os.listdir(out_dir))

    labels = []
    spot_columns = zip(
        spots['image'], spots['x_y'], spots['value'], spots['new_x'], spots['new_y']
    )
    for image_id, x_y, value, centre_x, centre_y in spot_columns:
        patch_id = image_id + '_' + x_y
        labels.append((patch_id, value))

        # Skip patches that were already written in an earlier run.
        patch_file = str(patch_id + '.jpg')
        if patch_file in existing:
            continue

        # Clamp the lower bounds at 0: a negative slice start would wrap
        # around and give an empty or wrong crop. Upper bounds beyond the
        # image are truncated by numpy slicing itself.
        left = max(0, int(round(float(centre_x) - half_width)))
        right = int(round(float(centre_x) + half_width))
        top = max(0, int(round(float(centre_y) - half_height)))
        bottom = int(round(float(centre_y) + half_height))

        patch = image[top:bottom, left:right, :]
        Image.fromarray(patch, 'RGB').save(os.path.join(out_dir, patch_file), quality=95)
        existing.add(patch_file)
        print(patch_file)

    return labels
#         # sets names to our variables
#         npa = np.array(labels).T.tolist()

#         # write out a csv files acting as labels for all images
#         labels_path = "/Users/sanjavickovic/Desktop/colons_comp/labels"
#         file_labels = os.path.join(labels_path, he_file.split("_HE.jpg")[0].split("/")[-1] + '_labels.csv')
#         with open(file_labels,'w') as out:
#             file_writer = csv.writer(out)
#             file_writer.writerow(('id', 'annotation'))
#             for i in range(len(npa[0])):
#                 file_writer.writerow([x[i] for x in npa])

In [70]:
# Root directory that receives one sub-directory of patches per image
train_path = '/home/sanjavickovic/data/st_data/patches'

# Images to cut into patches. To process every image instead, use:
# he_files = glob.glob(os.path.join(he_path, '*HE.jpg'))
selected_images = [
    '10015CN93_D2_HE.jpg',
    '10015CN103_D2_HE.jpg',
    '10015CN92_D2_HE.jpg',
    '10015CN103_D1_HE.jpg',
    '10015CN93_D1_HE.jpg',
    'L9CN40_D1_HE.jpg',
    '10005CN88_E1_HE.jpg',
]
he_files = [os.path.join(he_path, name) for name in selected_images]

# Optional: list the samples that already have a patch directory
# train_ready = os.listdir(train_path)

for he_file in he_files:
    he_tmp = os.path.basename(he_file)

    # Sample prefix, derived the same way preprocess_st_images derives the
    # annotation file name (everything before '_HE.jpg')
    sample_id = he_tmp.split("_HE.jpg")[0]

    # Optional: skip images that have already been processed
    # if sample_id in train_ready:
    #     continue

    # Each image gets its own output directory named after the sample
    he_dir = os.path.join(train_path, sample_id)

    preprocess_st_images(he_path, ann_path, he_dir, he_file)


Processing path:  /home/sanjavickovic/data/st_data/images/10015CN93_D2_HE.jpg
/home/sanjavickovic/data/st_data/patches/10015CN93_D2
Processing path:  /home/sanjavickovic/data/st_data/images/10015CN103_D2_HE.jpg
/home/sanjavickovic/data/st_data/patches/10015CN103_D2
10015CN103_D2_4_20.jpg
10015CN103_D2_6_14.jpg
10015CN103_D2_26_13.jpg
10015CN103_D2_5_15.jpg
10015CN103_D2_10_32.jpg
10015CN103_D2_11_32.jpg
10015CN103_D2_7_14.jpg
10015CN103_D2_21_20.jpg
10015CN103_D2_6_15.jpg
10015CN103_D2_6_13.jpg
10015CN103_D2_11_31.jpg
10015CN103_D2_11_33.jpg
10015CN103_D2_23_18.jpg
10015CN103_D2_5_14.jpg
10015CN103_D2_7_13.jpg
10015CN103_D2_21_19.jpg
10015CN103_D2_22_18.jpg
10015CN103_D2_20_20.jpg
10015CN103_D2_21_18.jpg
10015CN103_D2_25_13.jpg
10015CN103_D2_22_19.jpg
Processing path:  /home/sanjavickovic/data/st_data/images/10015CN92_D2_HE.jpg
/home/sanjavickovic/data/st_data/patches/10015CN92_D2
10015CN92_D2_6_23.jpg
10015CN92_D2_15_15.jpg
10015CN92_D2_21_30.jpg
Processing path:  /home/sanjavickovic/