### This notebook does the following:
1. Visualises the newly scanned H&E images next to the old H&E images, as a visual QC.
2. Generates the paths CSV file needed to run Flavia's annotation transfer pipeline, which is used to produce new paired H&E / IMC data for MVS.


In [None]:
import numpy as np 
import os
import glob
import json
import openslide
import pandas as pd
import matplotlib.pyplot as plt



### Getting the old H&E scan file paths for all aligned samples

In [None]:
# Collect the names of all samples for which aligned data from Simon exists.
# The sample name is the part of each ROI file name before the first underscore.

path_old_he = '/cluster/work/grlab/projects/projects2021-multivstain/data/tupro/binary_he_rois'
old_he_rois = glob.glob(path_old_he + '/*npy')
aligned_samples = {
    roi_path.rsplit('/', 1)[-1].partition('_')[0]
    for roi_path in old_he_rois
}
len(aligned_samples)

In [None]:
# Getting the old H&E image names for all samples that have IMC data.

# Alternative json location (currently unused):
# base_json_path = '/cluster/work/grlab/projects/projects2021-multivstain/data/tupro/json_files/'
base_json_path = '/cluster/work/grlab/projects/projects2022-he2ihc/AnnotationTransfer/json_files'


def he_name_from_json(json_file):
    """Return the H&E image name encoded in an annotation file path, or None.

    A file counts as an H&E annotation when the third-from-last dot-separated
    component of its file name starts with ``HE`` (text before the first
    underscore). The returned name is the file name up to the first ``.json``.

    Files whose name has fewer than three dot-separated components cannot be
    H&E annotations and yield None instead of raising IndexError.
    """
    file_name = json_file.split('/')[-1]
    components = file_name.split('.')
    if len(components) < 3 or components[-3].split('_')[0] != 'HE':
        return None
    return file_name.split('.json')[0]


he_filenames_old = [
    he_name
    for he_name in map(he_name_from_json, glob.glob(base_json_path + '/*/*'))
    if he_name is not None
]

# Lookup keyed by the text before the first '-' of the image name.
# NOTE: if several names share the same key, the one listed last wins.
he_filenames_old_dict = {he_name.split('-')[0]: he_name for he_name in he_filenames_old}
he_filenames_old_dict


#### Finding the full path of the old H&E image for each aligned sample

In [None]:
# Resolve, for every aligned sample, the full path of its old H&E scan.
he_paths_old = []
samples_without_old_he = []  # samples skipped because no old H&E file could be resolved

# Sorted so that the order of he_paths_old is the same on every run;
# the iteration order of a set of strings is not stable between interpreter runs.
for sample_name in sorted(aligned_samples):
    he_filename_old = he_filenames_old_dict.get(sample_name)
    if he_filename_old is None:
        # No H&E image name is known for this sample.
        samples_without_old_he.append(sample_name)
        continue

    matches = glob.glob('/cluster/work/tumorp/data_repository/study/' + '/*/' + sample_name + '/digpath_zurich/raw/pass_1/' + he_filename_old)
    # Only the first match is used; anything that is not a regular file is skipped.
    if matches and os.path.isfile(matches[0]):
        he_paths_old.append(matches[0])
    else:
        samples_without_old_he.append(sample_name)

print(len(he_paths_old))
if samples_without_old_he:
    print(len(samples_without_old_he), 'aligned samples without an old H&E scan:', samples_without_old_he)
he_paths_old[0:4]


### Plotting old and new scans next to each other 

In [None]:
base_new_scans_he = '/cluster/work/tumorp/share/st_data/usz_share/HE_highres_imgs'


def read_slide_overview(slide_path, downsample=128):
    """Return a low-resolution image of the whole slide stored at slide_path.

    The slide is read in full at the pyramid level OpenSlide reports as best
    for the requested downsample factor. The slide handle is closed before
    returning, so looping over many slides does not accumulate open handles.
    """
    with openslide.open_slide(slide_path) as slide:
        level = slide.get_best_level_for_downsample(downsample)
        return slide.read_region((0, 0), level, slide.level_dimensions[level])


# One row per new scan: [tupro_id, new scan path, old scan path, json path]
paths = []

for he_path_old in he_paths_old:
    he_imgname_old = he_path_old.split('/')[-1]
    tupro_id = he_imgname_old.split('-')[0]
    # Sorted so that subplot order and row order are the same on every run.
    he_path_new = sorted(glob.glob(base_new_scans_he + '/' + tupro_id + '*.ndpi'))

    if not he_path_new:
        print(tupro_id, ' : No new scanned image found')
        continue

    he_imgname_new = [new_path.split('/')[-1] for new_path in he_path_new]
    print(tupro_id, ': ', len(he_path_new),'  scanned image found')

    # The annotation file belongs to the old image, so it is the same for every new scan.
    json_path = os.path.join(base_json_path, tupro_id, he_imgname_old + '.json')

    # Plot images side by side: the old scan first, then every new scan.
    n_imgs = len(he_path_new) + 1
    plt.figure(figsize=(20, 10))
    plt.subplot(1, n_imgs, 1)
    plt.title(he_imgname_old)
    plt.imshow(read_slide_overview(he_path_old))

    for i, new_path in enumerate(he_path_new):
        paths.append([tupro_id, new_path, he_path_old, json_path])

        plt.subplot(1, n_imgs, i + 2)
        plt.title(he_imgname_new[i])
        plt.imshow(read_slide_overview(new_path))

    plt.show()
    plt.close()
    

In [None]:
# Number of rows collected in paths (one row is appended per new scan).
len(paths)

In [None]:
# Tabulate the collected rows. Each row is
# [tupro_id, new scan path, old scan path, json path]; the columns are left unnamed.
df = pd.DataFrame(paths)
df

In [None]:
# Write the table to 'paths_updated.csv' (relative to the current working directory),
# without a header row and without the index column.
df.to_csv('paths_updated.csv', header=False, index=False)