## Visualizing single cell examples for pooled data
- Inputs are:
  - REQUIRED:
    -  **Input_gene_or_guide** - (str), it can be a string of gene symbol or guide (20 chars)
        - e.g., 'KRT28' or 'GTTGAAGAGCTAGATCAACG'
    -  **batch_name** - (str) batch name
        - e.g., '20200805_A549_WG_Screen' 
    - **rootDir** - (str) root directory; the image, metadata and overlay paths follow the directory structure of the S3 bucket, so only the root directory needs to be set and everything else is built according to that structure 
        - e.g., '/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/'
  - OPTIONAL:
    -  **n_cells** - (int), number of single cells to be visualized, it is ignored if cell_selection_method is geometric_median
    -  **box_size** - (int), size of box in pixels for cropping a cell    
    -  **channels** - (list), list of channels you want to plot
        - e.g., ['DNA','Mito','Phalloidin','WGA','ER','Outline'] 
    -  **cell_selection_method** - (str) can be one of the following methods
        - random - generate n randomly selected cells
        - representative - clusters the data and sample from the "closest to mean cluster"
        - geometric_median - plots the single sample that is the geometric median of the samples
        
        
###  Run time:

- For random cells (n_cells=6) -> ~ 3-4 mins
- For representative cells -> ~ 3-4 mins
- For geometric median (the geometric median is calculated from a random subset of 1500 of the total cells) -> ~ 3 mins
  

In [30]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2

import black
import jupyter_black

jupyter_black.load(
    lab=False,
    line_length=79,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)

import pandas as pd
import numpy as np
import seaborn as sns
sns.set(color_codes=True)
from sklearn import preprocessing
import matplotlib.pyplot as plt
from singlecell.read import read_single_cell_sql
from singlecell.read import read_from_gallery

from singlecell.process import extract_single_cell_samples
# from singlecell.visualize import visualize_n_SingleCell
from singlecell.visualize import viz_pooled
from singlecell.preprocess.filter_out_edge_single_cells import edgeCellFilter

# import pooled_cell_painting_single_cell_visualization
import time
import gc
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
??jupyter_black.load

### Set paths and params

In [40]:
# ---------------------------------------------------------------------------
# Query: the gene whose guides will be visualized.
# ---------------------------------------------------------------------------
input_gene = "KRT28"
# input_gene_or_guide='GTTGAAGAGCTAGATCAACG'

# ---------------------------------------------------------------------------
# Batch to use.
#   CP186 -> '20200805_A549_WG_Screen'
#   CP257 -> '20210422_6W_CP257'
# ---------------------------------------------------------------------------
batch = "20210422_6W_CP257"

# Folder the generated images are saved to.
resultsDir = "./results/"

# ---------------------------------------------------------------------------
# Directories/params when reading directly from the cp gallery.
# When reading from a mounted pooled-cp bucket use instead:
#   rootDir = '/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/'
#   batch_multi_name_dict = {'20210422_6W_CP257': 'CP257-HeLa-WG',
#                            '20200805_A549_WG_Screen': 'CP186-A549-WG'}
# ---------------------------------------------------------------------------
rootDir = "cpg0021-periscope/broad/"

batch_multi_name_dict = dict(
    [
        ("20210422_6W_CP257", "HeLa"),
        ("20200805_A549_WG_Screen", "A549"),
    ]
)

# Alternative name used in the path to the data of the batch selected above.
batch_alter_name = batch_multi_name_dict[batch]

# ---------------------------------------------------------------------------
# Path to the single cell profiles.  The second assignment overrides the
# first, so the cp gallery layout is the one in effect; swap the order (or
# comment one out) to switch to the pooled-cp-bucket layout.
# ---------------------------------------------------------------------------
sc_files_dir_formatter = (
    "{0}/workspace/software/{1}/data/1.profiles/{2}"
    "/single_cell/single_cell_by_guide/"
)  # for pooled-cp-bucket

sc_files_dir_formatter = (
    "{0}workspace/profiles/{1}/single_cell_by_guide/"
)  # for cp gallery

# The gallery layout has no "{2}" field; the extra ``batch`` argument is
# simply ignored by str.format in that case.
sc_files_dir = sc_files_dir_formatter.format(rootDir, batch_alter_name, batch)

# Hardcoded for now.
# TODO: create a dictionary if this number is different for 257 vs 186
im_size = 5500

# ---------------------------------------------------------------------------
# Options for parameters to set.
# ---------------------------------------------------------------------------
n_cells = 6
box_size = 100

# How cells are selected: 'random', 'representative' or 'geometric_median'.
cell_selection_method = "representative"

channels = ["DNA", "Mito", "Phalloidin", "WGA", "ER", "Outline"]

# ---------------------------------------------------------------------------
# Read the barcode metadata used to map the input gene to its guides.
# NOTE: this path points at the mounted bucket regardless of ``rootDir``.
# ---------------------------------------------------------------------------
metadata_dir = (
    "/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/workspace/metadata/"
    f"{batch}/"
)
metadata_orig = pd.read_csv(metadata_dir + "Barcodes.csv")

# ---------------------------------------------------------------------------
# Configuration handed to custom_create_image_path_cols.
# ---------------------------------------------------------------------------
paths = dict(root_dir=rootDir, batch_folder=batch)

meta_cols = dict(
    site="Metadata_Foci_site_location",
    well="Metadata_Foci_well",
    plate="Metadata_Foci_plate",
)

viz_cols = dict(
    center_indicator_columns_x_y=[
        "Cells_AreaShape_Center_X",
        "Cells_AreaShape_Center_Y",
    ],
    image_path_column_prefix="PathName_Corr",
    image_name_column_prefix="FileName_Corr",
)

fix_paths_params = dict(paths=paths, meta_cols=meta_cols, viz_cols=viz_cols)

In [41]:
def custom_create_image_path_cols(params, df_p_s):
    """Add the image and outline path columns needed to draw single cells.

    For every channel two columns are written into ``df_p_s``:
    ``<image_path_column_prefix><channel>`` (folder of the corrected,
    cropped images of the cell's plate/well) and
    ``<image_name_column_prefix><channel>`` (tiff file name of the cell's
    site).  A ``Path_Outlines`` column with the path of the overlay png of
    the cell's plate/well/site is added as well.

    Parameters
    ----------
    params : dict
        Must contain ``"paths"`` (with ``"root_dir"`` and
        ``"batch_folder"``), ``"meta_cols"`` (with ``"plate"``, ``"well"``
        and ``"site"`` column names) and ``"viz_cols"``.  ``"viz_cols"`` may
        provide ``"image_path_column_prefix"`` /
        ``"image_name_column_prefix"``; they default to ``"PathName_Corr"``
        / ``"FileName_Corr"``.  An optional ``"channels"`` list selects the
        channels; when it is absent the notebook-level ``channels`` variable
        is used.
    df_p_s : pandas.DataFrame
        Single cell table holding the plate, well and site columns.  It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_p_s`` object, with the new columns added.
    """
    paths = params["paths"]
    meta_cols = params["meta_cols"]
    viz_cols = params["viz_cols"]

    # Take the configuration from ``params`` instead of from notebook
    # globals; only the channel list falls back to the global so existing
    # callers keep working unchanged.
    channel_names = params["channels"] if "channels" in params else channels
    batch_folder = paths["batch_folder"]
    path_prefix = viz_cols.get("image_path_column_prefix", "PathName_Corr")
    name_prefix = viz_cols.get("image_name_column_prefix", "FileName_Corr")

    plate = df_p_s[meta_cols["plate"]]
    well = df_p_s[meta_cols["well"]]
    site = df_p_s[meta_cols["site"]].astype(str)

    images_root = (
        paths["root_dir"]
        + "images/"
        + batch_folder
        + "/images_corrected_cropped/"
    )

    for ch in channel_names:
        df_p_s[path_prefix + ch] = images_root + plate + "_" + well + "/Corr" + ch
        df_p_s[name_prefix + ch] = "Corr" + ch + "_" + "Site_" + site + ".tiff"

    df_p_s["Path_Outlines"] = (
        paths["root_dir"]
        + "workspace/analysis/"
        + batch_folder
        + "/"
        + plate
        + "-"
        + well
        + "-"
        + site
        + "/"
        + "CorrDNA_Site_"
        + site
        + "_Overlay.png"
    )

    return df_p_s

In [44]:
resdir

'CACCCTTCACCAAAACGTAG'

## Visualize single cells for the input gene or guide
- Read from gallery

In [45]:
# Get the guides corresponding to the input gene by checking the barcode
# library reference table.
gene_guids_ls = metadata_orig[
    metadata_orig["gene_symbol"] == input_gene
].sgRNA.tolist()

for gi in gene_guids_ls:
    # One profile file per guide; the file name encodes batch, guide and gene.
    df_p_s = read_from_gallery.read_csv_gzip(
        sc_files_dir
        + batch
        + "_single_cell_normalized_ALLBATCHES__"
        + gi
        + "_"
        + input_gene
        + ".csv.gz"
    )

    df_p_s = custom_create_image_path_cols(fix_paths_params, df_p_s)

    # The cell centers are exposed under the Nuclei_Location_Center_* names,
    # which extract_neighbor_cells_center reads further down.
    df_p_s["Nuclei_Location_Center_X"] = df_p_s["Cells_AreaShape_Center_X"]
    df_p_s["Nuclei_Location_Center_Y"] = df_p_s["Cells_AreaShape_Center_Y"]

    # Filter with half the crop box size as margin; only the first element of
    # the returned pair is used here.
    df_p_s, _ = edgeCellFilter(df_p_s, im_size, box_size / 2)

    if df_p_s.shape[0] == 0:
        # Nothing left to plot for this guide.
        continue

    (
        df_samples,
        cp_features_analysis,
    ) = extract_single_cell_samples.extract_single_cell_samples(
        df_p_s, n_cells, cell_selection_method
    )

    # Create the output folder without going through a shell.
    resdir = resultsDir + input_gene
    os.makedirs(resdir, exist_ok=True)

    # Build the label from the sampled rows themselves so it does not depend
    # on df_samples and df_p_s sharing the same index.
    df_samples["label"] = (
        df_samples["Metadata_Foci_well"]
        + "-"
        + df_samples["Metadata_Foci_site_location"].astype(str)
    )

    max_dist_of_neigh = 200
    neigh_center_ls = extract_neighbor_cells_center(
        df_samples, max_dist_of_neigh
    )

    fig = viz_pooled.visualize_n_SingleCell_pooled(
        channels,
        df_samples,
        box_size,
        im_size,
        title=input_gene + "_" + cell_selection_method,
        neigh_center_ls=neigh_center_ls,
    )
    fig.savefig(resdir + "/" + gi + ".png", dpi=500)

cp_features: 3733
cols2remove_manyNulls ['Cells_AreaShape_NormalizedMoment_0_0', 'Cytoplasm_AreaShape_NormalizedMoment_0_0', 'Nuclei_AreaShape_NormalizedMoment_1_0', 'Cells_Intensity_MassDisplacement_WellEdgeDistance', 'Cytoplasm_AreaShape_NormalizedMoment_1_0', 'Cells_AreaShape_NormalizedMoment_0_1', 'Nuclei_AreaShape_NormalizedMoment_0_1', 'Cells_Intensity_MeanIntensityEdge_MaskedScores_IntValues', 'Cells_Intensity_StdIntensityEdge_MaskedScores_IntValues', 'Cytoplasm_AreaShape_NormalizedMoment_0_1', 'Cells_AreaShape_NormalizedMoment_1_0', 'Nuclei_AreaShape_NormalizedMoment_0_0']
cols2remove_lowVars ['Cytoplasm_RadialDistribution_FracAtD_mito_tubeness_3of20', 'Cytoplasm_RadialDistribution_RadialCV_mito_tubeness_1of16', 'Cytoplasm_RadialDistribution_RadialCV_mito_tubeness_2of20', 'Nuclei_AreaShape_EulerNumber', 'Cytoplasm_RadialDistribution_FracAtD_mito_tubeness_Overflow', 'Cytoplasm_RadialDistribution_RadialCV_mito_tubeness_2of16', 'Cytoplasm_RadialDistribution_MeanFrac_mito_tubeness_

<IPython.core.display.Javascript object>

KRT28_representative
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257A_Well2/CorrDNA/CorrDNA_Site_44.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257A_Well2/CorrMito/CorrMito_Site_44.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257A_Well2/CorrPhalloidin/CorrPhalloidin_Site_44.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257A_Well2/CorrWGA/CorrWGA_Site_44.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257A_Well2/CorrER/CorrER_Site_44.tiff
cpg0021-periscope/broad/workspace/analysis/20210422_6W_CP257/CP257A-Well2-44/CorrDNA_Site_44_Overlay.png
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257D_Well5/CorrDNA/CorrDNA_Site_78.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257D_Well5/CorrMito/CorrMito_Site_78.tiff
cpg0021-periscope/broad/images/20210422_6W_CP2

<IPython.core.display.Javascript object>

KRT28_representative
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257K_Well2/CorrDNA/CorrDNA_Site_30.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257K_Well2/CorrMito/CorrMito_Site_30.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257K_Well2/CorrPhalloidin/CorrPhalloidin_Site_30.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257K_Well2/CorrWGA/CorrWGA_Site_30.tiff
cpg0021-periscope/broad/images/20210422_6W_CP257/images_corrected_cropped/CP257K_Well2/CorrER/CorrER_Site_30.tiff
cpg0021-periscope/broad/workspace/analysis/20210422_6W_CP257/CP257K-Well2-30/CorrDNA_Site_30_Overlay.png


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [43]:
def extract_neighbor_cells_center(df_samples, max_dist_of_neigh, df_all=None):
    """Collect the centers of the cells neighboring each sampled cell.

    For every row of ``df_samples`` the candidate cells are the rows of
    ``df_all`` that come from the same ER image (same ``PathName_CorrER``
    and, when the column is available, same ``FileName_CorrER``) and carry
    the same ``Metadata_Foci_Barcode_MatchedTo_Barcode``.  A candidate is
    kept when both its X and its Y center lie strictly less than
    ``max_dist_of_neigh`` away from the sampled cell, i.e. inside a square
    window around it.  A sampled cell that is itself a row of ``df_all`` is
    part of its own neighbor list.

    Parameters
    ----------
    df_samples : pandas.DataFrame
        Sampled cells.  Rows are fetched with ``.loc[0..n-1]``, so the frame
        is expected to carry a default 0..n-1 index.
    max_dist_of_neigh : int or float
        Half side length of the window, in the units of the
        ``Nuclei_Location_Center_X`` / ``_Y`` columns.
    df_all : pandas.DataFrame, optional
        Table of all cells to search.  Defaults to the notebook-level
        ``df_p_s`` so existing two-argument calls keep working.

    Returns
    -------
    list
        One ``[y_centers, x_centers]`` pair (two lists of ints) per sampled
        cell, in the order of ``df_samples``.
    """
    if df_all is None:
        # Backward compatible fallback to the table of the current guide.
        df_all = df_p_s

    neigh_center_ls = []
    for i in range(df_samples.shape[0]):
        sample = df_samples.loc[i]

        same_image = df_all["PathName_CorrER"] == sample["PathName_CorrER"]
        # PathName_CorrER is only the per plate/well folder; the file name
        # carries the site, so it is needed to restrict to one image.
        if "FileName_CorrER" in df_all.columns and "FileName_CorrER" in sample.index:
            same_image &= df_all["FileName_CorrER"] == sample["FileName_CorrER"]

        same_barcode = (
            df_all["Metadata_Foci_Barcode_MatchedTo_Barcode"]
            == sample["Metadata_Foci_Barcode_MatchedTo_Barcode"]
        )
        candidates = df_all[same_image & same_barcode]

        # Absolute offsets: without abs() every cell far to the left of /
        # above the sample (negative offset) would count as a neighbor.
        dx = (
            candidates["Nuclei_Location_Center_X"]
            - sample["Nuclei_Location_Center_X"]
        ).abs()
        dy = (
            candidates["Nuclei_Location_Center_Y"]
            - sample["Nuclei_Location_Center_Y"]
        ).abs()
        neighbors = candidates[
            (dx < max_dist_of_neigh) & (dy < max_dist_of_neigh)
        ]

        x_cent_ls = neighbors["Nuclei_Location_Center_X"].astype(int).tolist()
        y_cent_ls = neighbors["Nuclei_Location_Center_Y"].astype(int).tolist()

        neigh_center_ls.append([y_cent_ls, x_cent_ls])
    return neigh_center_ls

## Visualize single cells for the input gene or guide
- Read from mounted pooled-cell-painting bucket

In [None]:
%%time


# NOTE: this cell reads plain files with pd.read_csv, so ``rootDir`` and
# ``sc_files_dir`` must be set to the mounted pooled-cp bucket variants in the
# params cell, and ``input_gene_or_guide`` must be defined before running it.

# Rows of the barcode table matching the input, either by gene symbol or by
# guide sequence.
input_matched_rows = metadata_orig[
    (metadata_orig["gene_symbol"] == input_gene_or_guide)
    | (metadata_orig["sgRNA"] == input_gene_or_guide)
]
if input_matched_rows.empty:
    raise ValueError(
        "No gene_symbol or sgRNA in the barcode table matches "
        + repr(input_gene_or_guide)
    )
gene = input_matched_rows.gene_symbol.unique().tolist()[0]
guides_ls = input_matched_rows.sgRNA.unique().tolist()

all_guides_gms_ls = []
# One single cell profile file per guide of the gene.
gene_guids_ls = [
    sc_files_dir
    + batch
    + "_single_cell_normalized_ALLBATCHES__"
    + gi
    + "_"
    + gene
    + ".csv.gz"
    for gi in guides_ls
]

for gi in gene_guids_ls:
    df_p_s = pd.read_csv(gi)

    site_str = df_p_s["Metadata_Foci_site_location"].astype(str)

    for ch in channels:
        df_p_s["PathName_Corr" + ch] = (
            rootDir
            + batch
            + "/images_corrected_cropped/"
            + df_p_s["Metadata_Foci_plate"]
            + "_"
            + df_p_s["Metadata_Foci_well"]
            + "/Corr"
            + ch
        )
        df_p_s["FileName_Corr" + ch] = (
            "Corr" + ch + "_" + "Site_" + site_str + ".tiff"
        )

    # Single "/" between the site folder and the file name (the previous
    # version produced "//").
    df_p_s["Path_Outlines"] = (
        rootDir
        + "workspace/analysis/"
        + batch
        + "/"
        + df_p_s["Metadata_Foci_plate"]
        + "-"
        + df_p_s["Metadata_Foci_well"]
        + "-"
        + site_str
        + "/CorrDNA_Site_"
        + site_str
        + "_Overlay.png"
    )

    df_p_s["Nuclei_Location_Center_X"] = df_p_s["Cells_AreaShape_Center_X"]
    df_p_s["Nuclei_Location_Center_Y"] = df_p_s["Cells_AreaShape_Center_Y"]

    # edgeCellFilter is the imported filter; it returns a pair of which only
    # the first element is used (same call as in the gallery cell).
    df_p_s, _ = edgeCellFilter(df_p_s, im_size, box_size / 2)

    if df_p_s.shape[0] > 0:
        # Work on a random subset of at most 1500 cells.
        df_subset = df_p_s.sample(
            n=min(1500, df_p_s.shape[0]), replace=False
        ).reset_index(drop=True)
        # ``extract_single_cell_samples`` is imported as a module; the
        # function of the same name lives inside it.
        (
            df_samples,
            cp_features_analysis,
        ) = extract_single_cell_samples.extract_single_cell_samples(
            df_subset, n_cells, cell_selection_method
        )
        all_guides_gms_ls.append(df_samples)

if not all_guides_gms_ls:
    raise ValueError(
        "No cells left after edge filtering for " + repr(input_gene_or_guide)
    )

df_p_s_all = pd.concat(all_guides_gms_ls, ignore_index=True).drop_duplicates(
    ignore_index=True
)

# With several guides the per-guide samples add up to more than n_cells, so
# draw n_cells of them at random.
if len(gene_guids_ls) > 1 and df_p_s_all.shape[0] > n_cells:
    df_samples = df_p_s_all.sample(n=n_cells, replace=False).reset_index(
        drop=True
    )
else:
    df_samples = df_p_s_all.copy()

################  visualize cells

f = viz_pooled.visualize_n_SingleCell_pooled(
    channels,
    df_samples,
    box_size,
    im_size,
    title=input_gene_or_guide + "_" + cell_selection_method,
)

In [None]:
# df_samples
# f=visualize_n_SingleCell_pooled(channels,df_samples,box_size,im_size,title=input_gene_or_guide+'_'+cell_selection_method);

In [None]:
np.__version__

In [None]:
# visualize_n_SingleCell_pooled(channels,df_samples,box_size,im_size,title=cell_selection_method);
# df_samples
# df_p_s

In [None]:
df_p_s[df_p_s.columns[df_p_s.columns.str.contains('Alig')]]

In [None]:
def pooled_cell_painting_single_cell_visualization(
    input_gene_or_guide,
    batch,
    rootDir,
    n_cells=6,
    box_size=100,
    channels=None,
    cell_selection_method="random",
):
    """Visualize single cells of a gene or guide from a mounted bucket.

    Reads ``Barcodes.csv`` of ``batch`` under ``rootDir``, resolves the input
    to a gene and its guides, loads the per-guide single cell profile files,
    adds the image/outline path columns, filters the cells with
    ``edgeCellFilter``, samples cells per guide and hands the selected cells
    to ``viz_pooled.visualize_n_SingleCell_pooled``.

    Parameters
    ----------
    input_gene_or_guide : str
        Gene symbol or guide sequence, matched against the ``gene_symbol``
        and ``sgRNA`` columns of the barcode table.
    batch : str
        Batch name; one of ``'20200805_A549_WG_Screen'`` or
        ``'20210422_6W_CP257'``.
    rootDir : str
        Root directory of the mounted bucket, ending with ``/``.
    n_cells : int
        Number of single cells to select.
    box_size : int
        Crop box size in pixels; half of it is passed to ``edgeCellFilter``.
    channels : list of str, optional
        Channels to plot.  Defaults to
        ``['DNA', 'Mito', 'Phalloidin', 'WGA', 'ER', 'Outline']``.
    cell_selection_method : str
        Passed through to ``extract_single_cell_samples``.

    Returns
    -------
    The value returned by ``viz_pooled.visualize_n_SingleCell_pooled``.

    Raises
    ------
    ValueError
        If ``batch`` is unknown, if nothing in the barcode table matches the
        input, or if no cell is left after filtering.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if channels is None:
        channels = ["DNA", "Mito", "Phalloidin", "WGA", "ER", "Outline"]

    ########################## set directories
    # Name of the software folder of each batch in the mounted bucket; the
    # profile directory is derived from ``batch`` instead of being fixed to
    # the A549 screen.
    bucket_batch_names = {
        "20210422_6W_CP257": "CP257-HeLa-WG",
        "20200805_A549_WG_Screen": "CP186-A549-WG",
    }
    if batch not in bucket_batch_names:
        raise ValueError(
            "Unknown batch "
            + repr(batch)
            + "; expected one of "
            + repr(sorted(bucket_batch_names))
        )
    sc_files_dir = (
        rootDir
        + "workspace/software/"
        + bucket_batch_names[batch]
        + "/data/1.profiles/"
        + batch
        + "/single_cell/single_cell_by_guide/"
    )

    # hardcoded for now, TODO: create a dictionary if this number is
    # different for 257
    im_size = 5500

    ##################### read metadata and resolve the input guide or gene
    metadata_dir = rootDir + "workspace/metadata/" + batch + "/"
    metadata_orig = pd.read_csv(metadata_dir + "Barcodes.csv")

    input_matched_rows = metadata_orig[
        (metadata_orig["gene_symbol"] == input_gene_or_guide)
        | (metadata_orig["sgRNA"] == input_gene_or_guide)
    ]
    if input_matched_rows.empty:
        raise ValueError(
            "No gene_symbol or sgRNA in the barcode table matches "
            + repr(input_gene_or_guide)
        )
    gene = input_matched_rows.gene_symbol.unique().tolist()[0]
    guides_ls = input_matched_rows.sgRNA.unique().tolist()

    all_guides_gms_ls = []
    # One single cell profile file per guide of the gene.
    gene_guids_ls = [
        sc_files_dir
        + batch
        + "_single_cell_normalized_ALLBATCHES__"
        + gi
        + "_"
        + gene
        + ".csv.gz"
        for gi in guides_ls
    ]

    for gi in gene_guids_ls:
        df_p_s = pd.read_csv(gi)

        site_str = df_p_s["Metadata_Foci_site_location"].astype(str)

        for ch in channels:
            df_p_s["PathName_Corr" + ch] = (
                rootDir
                + batch
                + "/images_corrected_cropped/"
                + df_p_s["Metadata_Foci_plate"]
                + "_"
                + df_p_s["Metadata_Foci_well"]
                + "/Corr"
                + ch
            )
            df_p_s["FileName_Corr" + ch] = (
                "Corr" + ch + "_" + "Site_" + site_str + ".tiff"
            )

        # Single "/" between the site folder and the file name (the previous
        # version produced "//").
        df_p_s["Path_Outlines"] = (
            rootDir
            + "workspace/analysis/"
            + batch
            + "/"
            + df_p_s["Metadata_Foci_plate"]
            + "-"
            + df_p_s["Metadata_Foci_well"]
            + "-"
            + site_str
            + "/CorrDNA_Site_"
            + site_str
            + "_Overlay.png"
        )

        df_p_s["Nuclei_Location_Center_X"] = df_p_s["Cells_AreaShape_Center_X"]
        df_p_s["Nuclei_Location_Center_Y"] = df_p_s["Cells_AreaShape_Center_Y"]

        # edgeCellFilter is the filter imported by the notebook; it returns a
        # pair of which only the first element is used.
        df_p_s, _ = edgeCellFilter(df_p_s, im_size, box_size / 2)

        if df_p_s.shape[0] > 0:
            # Work on a random subset of at most 1500 cells.
            df_subset = df_p_s.sample(
                n=min(1500, df_p_s.shape[0]), replace=False
            ).reset_index(drop=True)
            # ``extract_single_cell_samples`` is imported as a module; the
            # function of the same name lives inside it.
            (
                df_samples,
                cp_features_analysis,
            ) = extract_single_cell_samples.extract_single_cell_samples(
                df_subset, n_cells, cell_selection_method
            )
            all_guides_gms_ls.append(df_samples)

    if not all_guides_gms_ls:
        raise ValueError(
            "No cells left after edge filtering for "
            + repr(input_gene_or_guide)
        )

    df_p_s_all = pd.concat(
        all_guides_gms_ls, ignore_index=True
    ).drop_duplicates(ignore_index=True)

    # With several guides the per-guide samples add up to more than n_cells,
    # so draw n_cells of them at random.
    if len(gene_guids_ls) > 1 and df_p_s_all.shape[0] > n_cells:
        df_samples = df_p_s_all.sample(n=n_cells, replace=False).reset_index(
            drop=True
        )
    else:
        df_samples = df_p_s_all.copy()

    ################  visualize cells

    return viz_pooled.visualize_n_SingleCell_pooled(
        channels,
        df_samples,
        box_size,
        im_size,
        title=input_gene_or_guide + "_" + cell_selection_method,
    )


# Example run against the mounted bucket for the A549 screen.
# Note that this rebinds the notebook-level ``batch`` and ``rootDir``.
input_gene_or_guide = "KRT28"
# input_gene_or_guide='GTTGAAGAGCTAGATCAACG'

batch = "20200805_A549_WG_Screen"
rootDir = "/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/"

# The trailing semicolon keeps the notebook from echoing the return value.
pooled_cell_painting_single_cell_visualization(
    input_gene_or_guide=input_gene_or_guide,
    batch=batch,
    rootDir=rootDir,
);

In [None]:
# metadaDataFiles
# metadata_orig
# df_p_s.columns
# df_p_s[df_p_s.eq(5500).any(1)]