In [1]:
from collections import defaultdict, Counter 
import colorsys
import itertools
import pickle 
import os
import shutil
import subprocess

import fcsparser
import fcswrite
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.backends.backend_pdf import PdfPages
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from IPython.display import display, HTML

import ot

from scipy.spatial import distance, ConvexHull
from scipy.stats import spearmanr

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score
from sklearn.manifold import TSNE
import fcsparser 
from ete3 import Tree, TreeNode, TreeStyle, TextFace

from utils import *

# Base directory setting; '.' is the current working directory.
# NOTE(review): none of the cells visible here read `data_dir` (paths below are hardcoded) — confirm it is still needed.
data_dir = '.'

## Split Supplementary Data into FCS Files 

In [3]:
# Load the per-cell expression table and keep the 29 marker columns
# (positions 1..29) together with the cluster label of each cell.
supplementary_data = pd.read_csv('../Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv')
marker_cols = supplementary_data.columns[1:30].tolist()
supplementary_data = supplementary_data[marker_cols + ['Imaging phenotype cluster ID']]

# Cluster-ID lookup: `cell_lines` holds the IDs in file order and
# `ids_to_names` maps each ID to its name.
id_name_table = pd.read_csv('ClusterIDtoName.txt', sep='\t')
cell_lines = list(id_name_table['ID'].values)
ids_to_names = dict(zip(id_name_table['ID'].values, id_name_table['Name'].values))

# remove dirt from supplementary data:
# collect the cluster IDs annotated as 'dirt' and drop every cell carrying one.
supplementary_annotations = pd.read_excel('../Suppl.Table2.cluster annotations and cell counts.xlsx')
is_dirt = supplementary_annotations['Imaging phenotype (cell type)'] == 'dirt'
exclude = supplementary_annotations.loc[is_dirt, 'X-shift cluster ID']
keep_rows = ~supplementary_data['Imaging phenotype cluster ID'].isin(exclude)
supplementary_data = supplementary_data[keep_rows]

# One FCS file per remaining cluster, written under scaffold_landmarks/.
suppl_groupby = supplementary_data.groupby('Imaging phenotype cluster ID')
if not os.path.isdir('scaffold_landmarks'):
    os.mkdir('scaffold_landmarks')
for cell_id, data in suppl_groupby:
    print('Writing data for cell type', cell_id)
    fcs_path = 'scaffold_landmarks/cell_{}.fcs'.format(cell_id)
    fcswrite.write_fcs(filename=fcs_path,
                       chn_names=list(data.columns),
                       data=data.values)

Writing data for cell type 9587
Writing data for cell type 9589
Writing data for cell type 9590
Writing data for cell type 9591
Writing data for cell type 9592
Writing data for cell type 9593
Writing data for cell type 9595
Writing data for cell type 9596
Writing data for cell type 9597
Writing data for cell type 9600
Writing data for cell type 9601
Writing data for cell type 9602
Writing data for cell type 9604
Writing data for cell type 9605
Writing data for cell type 9606
Writing data for cell type 9607
Writing data for cell type 9608
Writing data for cell type 9609
Writing data for cell type 9611
Writing data for cell type 9613
Writing data for cell type 9614
Writing data for cell type 9615
Writing data for cell type 9617
Writing data for cell type 9618
Writing data for cell type 9619
Writing data for cell type 9620
Writing data for cell type 9626
Writing data for cell type 9628
Writing data for cell type 9629
Writing data for cell type 9632
Writing data for cell type 9635
Writing 

## Run Vite on Cell Data 

In [5]:
# Load the cluster tree and build `suppl_converted`, the per-cell table
# (coordinates, sample, marker intensities) that later cells join against.
# NOTE: pickle.load can execute arbitrary code stored in the file — only load
# a tree pickle that was produced by this project.
with open('tree_combined_for_html.pkl', 'rb') as tree_file:
    tree = pickle.load(tree_file)

supplementary_data = pd.read_csv('../Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv')
# Marker names are taken by position, before the coordinate columns are renamed.
marker_cols = list(supplementary_data.columns[1:30])
# Non-inplace rename so re-running the cell always starts from the freshly read frame.
supplementary_data = supplementary_data.rename(columns={'X.X': 'X', 'Y.Y': 'Y', 'Z.Z': 'Z'})
supplementary_data['CD45_int'] = supplementary_data['CD45'].astype(int)

ids_to_names = pd.read_csv('ClusterIDtoName.txt', sep='\t')
cell_lines = list(ids_to_names['ID'].values)
ids_to_names = dict(zip(ids_to_names['ID'].values, ids_to_names['Name'].values))

# remove dirt from supplementary data
supplementary_annotations = pd.read_excel('../Suppl.Table2.cluster annotations and cell counts.xlsx')
dirt = supplementary_annotations.loc[supplementary_annotations['Imaging phenotype (cell type)'] == 'dirt',
                                     'X-shift cluster ID']
# .copy() makes the filtered frame independent, so the column assignment below
# writes to a real frame instead of a slice (avoids SettingWithCopyWarning).
supplementary_data = supplementary_data[~supplementary_data['Imaging phenotype cluster ID'].isin(dirt)].copy()
# The sample name is the text before the first underscore of 'sample_Xtile_Ytile'.
supplementary_data['sample'] = supplementary_data['sample_Xtile_Ytile'].apply(lambda x: x.split('_')[0])

# NOTE(review): this selection drops 'Imaging phenotype cluster ID'; any later
# cell that reads that column from a merge with `suppl_converted` must get it
# from the other side of the join — confirm.
suppl_converted = convert_coordinates(supplementary_data)[['X', 'Y', 'Z', 'sample'] + marker_cols]
print(marker_cols)

['CD45', 'Ly6C', 'TCR', 'Ly6G', 'CD19', 'CD169', 'CD106', 'CD3', 'CD1632', 'CD8a', 'CD90', 'F480', 'CD11c', 'Ter119', 'CD11b', 'IgD', 'CD27', 'CD5', 'CD79b', 'CD71', 'CD31', 'CD4', 'IgM', 'B220', 'ERTR7', 'CD35', 'CD2135', 'CD44', 'NKp46']


In [4]:
def write_layers(tree):
    """Write one table of per-cluster mean marker values for every tree layer.

    For each layer returned by ``get_layers(tree)``, every node's ``coords``
    frame is inner-joined with the module-level ``suppl_converted`` table on
    X, Y, Z and sample, and the mean of each column in ``marker_cols`` is taken
    over the matched rows. The means (one row per node, in layer order) are
    saved as a tab-separated file at
    ``scaffold_analysis/Layer_<index>/clusters_avg_markers.txt``.

    Reads the globals ``suppl_converted`` and ``marker_cols``; returns nothing.
    """
    join_keys = ['X', 'Y', 'Z', 'sample']
    for layer_ind, layer in enumerate(get_layers(tree)):
        print('Writing layer', layer_ind)
        layer_dir = 'scaffold_analysis/Layer_' + str(layer_ind)
        if not os.path.exists(layer_dir):
            os.makedirs(layer_dir)
        # One Series of marker means per node; rows keep the node order of the layer.
        cluster_means = [
            pd.merge(node.coords, suppl_converted, how='inner', on=join_keys)[marker_cols].mean()
            for node in layer
        ]
        layer_table = pd.DataFrame(cluster_means)
        layer_table.to_csv(layer_dir + '/clusters_avg_markers.txt', sep='\t', index=False)


write_layers(tree)

Writing layer 0
Writing layer 1
Writing layer 2
Writing layer 3
Writing layer 4
Writing layer 5
Writing layer 6
Writing layer 7
Writing layer 8
Writing layer 9
Writing layer 10
Writing layer 11
Writing layer 12
Writing layer 13
Writing layer 14
Writing layer 15
Writing layer 16
Writing layer 17
Writing layer 18
Writing layer 19
Writing layer 20
Writing layer 21
Writing layer 22
Writing layer 23
Writing layer 24
Writing layer 25
Writing layer 26
Writing layer 27
Writing layer 28
Writing layer 29
Writing layer 30
Writing layer 31
Writing layer 32
Writing layer 33
Writing layer 34
Writing layer 35
Writing layer 36
Writing layer 37
Writing layer 38
Writing layer 39
Writing layer 40
Writing layer 41


In [4]:
def get_layer_intersection(layer, reference=None, verbose=True):
    """Return, for each node of a tree layer, the cluster ID it overlaps most.

    Each node's ``coords`` frame is inner-joined with ``reference`` on the
    columns X, Y, Z and sample. The most frequent value of
    'Imaging phenotype cluster ID' among the matched rows is the node's result.

    Parameters
    ----------
    layer : iterable
        Nodes exposing a ``coords`` DataFrame that contains the join columns.
    reference : pandas.DataFrame, optional
        Table to join against. When omitted, the module-level
        ``suppl_converted`` is used (looked up at call time). The joined
        result must contain an 'Imaging phenotype cluster ID' column.
    verbose : bool, optional
        When true (the default), print the row counts of the node, the
        reference table and their overlap for every node.

    Returns
    -------
    list
        One entry per node, in layer order: the most frequent cluster ID,
        or ``None`` when the node shares no rows with ``reference``.
    """
    if reference is None:
        # Fall back to the notebook-level table; raises NameError if the cell
        # that builds `suppl_converted` has not been run yet.
        reference = suppl_converted
    join_keys = ['X', 'Y', 'Z', 'sample']

    most_intersect = []
    for node in layer:
        overlap = pd.merge(node.coords, reference, on=join_keys, how='inner')
        if verbose:
            print(node.coords.shape[0], reference.shape[0], overlap.shape[0])
        counts = overlap['Imaging phenotype cluster ID'].value_counts().to_dict()
        # max() on an empty dict raises ValueError, so a node without any
        # overlapping rows is reported as None instead of aborting the layer.
        most_intersect.append(max(counts, key=counts.get) if counts else None)

    return most_intersect

# Print, layer by layer, the cluster ID each node overlaps most.
# This cell needs `suppl_converted` from the cell that loads the supplementary
# data; on a fresh kernel run that cell first, otherwise this raises NameError.
# NOTE: pickle.load can execute arbitrary code stored in the file — only load
# a tree pickle that was produced by this project.
with open('tree_combined_for_html.pkl', 'rb') as tree_file:
    tree = pickle.load(tree_file)
for layer in get_layers(tree):
    print(get_layer_intersection(layer))

NameError: name 'suppl_converted' is not defined

In [1]:
# Load the IPython extension shipped as `rpy2.ipython`.
# NOTE(review): presumably needed for R cells further down — confirm; if so, consider moving it next to the imports.
%load_ext rpy2.ipython

  "but at least we found 'numpy'.")))
