## Visualizing n random or representative cell in a given well
- Inputs are:
    -  **file_name** - .sql file path for the target plate(or a dataframe)
    -  **well_name** - Well name
    -  **n_cells** - number of single cells you want to visualize
    -  **box_size** - size of box in pixels for cropping a cell
    -  **channels** - list of channels you want to plot
    -  **cell_selection_method** - can be one of the following methods
        - random - generate n randomly selected cells
        - representative - clusters the data and sample from the "closest to mean cluster"
        - geometric_median - plots single sample than is the geometric median of samples
        
        
#### Steps:

- Reading the target well from the input plate 
- Sampling using cell_selection_method 
  

In [1]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(color_codes=True)
from sklearn import preprocessing
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# from utils import read_data, visualize_data
from utils.read_data import *
from utils.visualize_data import *
from sklearn.cluster import KMeans
import time

### Inputs

In [4]:
file_name='/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/workspace/\
software/CP186-A549-WG/data/1.profiles/20200805_A549_WG_Screen/single_cell/\
single_cell_by_guide/20200805_A549_WG_Screen_single_cell_normalized_ALLBATCHES__GTTGAAGAGCTAGATCAACG_KRT28.csv.gz'

n_cells=6

box_size=100

well_name=["A10"]

# repEnabled= True
# how we select cells which can be 'random','representative','geometric_median'
cell_selection_method='geometric_median'

channels=["Mito","AGP","DNA","RNA","ER","Outline"]

In [None]:
batch='20210422_6W_CP257';

In [23]:
# os.listdir("/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/workspace/software/CP257-HeLa-WG/data/1.profiles/20210422_6W_CP257/single_cell/")

In [7]:
# python restore_intelligent.py imaging-platform "projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace/backend/2016_04_01_a549_48hr_batch1/SQ00015195/"

### Read single cell data

In [5]:
%%time
df_p_s0=pd.read_csv(file_name);
df_p_s0.head()

CPU times: user 1min 4s, sys: 3.43 s, total: 1min 7s
Wall time: 1min 16s


Unnamed: 0,Metadata_Foci_Parent_Cells,Metadata_Foci_Cell_Quality_Index,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Metadata_Foci_Barcode_MatchedTo_Score_mean,Metadata_Foci_Barcode_MatchedTo_Score_count,Metadata_Foci_cell_quality_method,Metadata_Foci_ImageNumber,Metadata_Foci_site,Metadata_Foci_plate,...,Align_Xshift_WGA,Align_Yshift_AlignedRed,Align_Yshift_ConA,Align_Yshift_DAPI_Painting,Align_Yshift_Mito,Align_Yshift_Phalloidin,Align_Yshift_WGA,Metadata_site,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y
0,93,2,KRT28,GTTGAAGAGCTAGATCAACG,0.83333,1,simple,441,CP186A-Well3-75,CP186A,...,-7.0,0.0,56.0,56.0,56.0,56.0,56.0,CP186A-Well3-75,5126.0,84.667
1,198,2,KRT28,GTTGAAGAGCTAGATCAACG,0.83333,2,simple,441,CP186A-Well3-75,CP186A,...,-7.0,0.0,56.0,56.0,56.0,56.0,56.0,CP186A-Well3-75,672.33,129.98
2,1428,2,KRT28,GTTGAAGAGCTAGATCAACG,0.83333,1,simple,441,CP186A-Well3-75,CP186A,...,-7.0,0.0,56.0,56.0,56.0,56.0,56.0,CP186A-Well3-75,2576.2,1012.9
3,1584,2,KRT28,GTTGAAGAGCTAGATCAACG,0.83333,2,simple,441,CP186A-Well3-75,CP186A,...,-7.0,0.0,56.0,56.0,56.0,56.0,56.0,CP186A-Well3-75,2423.6,1101.0
4,1586,2,KRT28,GTTGAAGAGCTAGATCAACG,0.83333,9,simple,441,CP186A-Well3-75,CP186A,...,-7.0,0.0,56.0,56.0,56.0,56.0,56.0,CP186A-Well3-75,2600.8,1115.7


In [13]:
# df_p_s0[df_p_s0.columns[df_p_s0.columns.str.contains('utline')]]
df_p_s0.columns[df_p_s0.columns.str.contains('uid')]

Index([], dtype='object')

In [8]:
# df_p_s0['Cells_AreaShape_Center_X'].describe()

### Visualize

In [9]:
df_p_s=df_p_s0.copy()
rootDir='/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/'
batch='20200805_A549_WG_Screen';
im_size=5500

channels=['DNA','Mito','Phalloidin','WGA','ER','Outline']
for ch in channels:
    df_p_s["PathName_Corr"+ch]=rootDir+batch+'/images_corrected_cropped/'+df_p_s["Metadata_Foci_plate"]+'_'+df_p_s["Metadata_Foci_well"]+'/Corr'+ch
    df_p_s["FileName_Corr"+ch]="Corr"+ch+"_"+"Site_"+df_p_s["Metadata_Foci_site_location"].astype(str)+".tiff"

df_p_s["Path_Outlines"]=rootDir+'workspace/analysis/'+batch+'/'+df_p_s["Metadata_Foci_plate"]+'-'+df_p_s["Metadata_Foci_well"]+'-'+df_p_s["Metadata_Foci_site_location"].astype(str)+'/'\
+'/CorrDNA_Site_'+df_p_s["Metadata_Foci_site_location"].astype(str)+'_Overlay.png'

df_p_s["Nuclei_Location_Center_X"]=df_p_s["Cells_AreaShape_Center_X"];
df_p_s["Nuclei_Location_Center_Y"]=df_p_s["Cells_AreaShape_Center_Y"];

df_p_s=edgeCellFilter2(df_p_s,im_size,box_size/2);

df_samples = extract_single_cell_samples(df_p_s.sample(n = 1500, replace = False).reset_index(drop=True),n_cells,cell_selection_method);


visualize_n_SingleCell_pooled(channels,df_samples,box_size,im_size,title=cell_selection_method);

NameError: name 'df_p_s0' is not defined

In [13]:
# visualize_n_SingleCell_pooled(channels,df_samples,box_size,im_size,title=cell_selection_method);

In [None]:
# def unarchive_file(f2unarchive,bucket_name,user_profile):
#     """
#     unarchive input file

#     Inputs:
#     bucket_name: eg, pooled-cell-painting, imaging-plateform
#     user_profile: profile name that is used for accessing the above bucket (check in sudo vim ~/.aws/credentials)
#     proj_path: path to the project eg,2018_11_20_Periscope_Calico/workspace/analysis/

#     output:
#     command to run
#     """

#     cmd='aws s3api  restore-object  --profile '+user_profile+\
#             ' --bucket '+bucket_name+'  --key projects/2018_11_20_Periscope_Calico/workspace/analysis/'+f2unarchive+'   --restore-request GlacierJobParameters={"Tier"="Standard"}'
#     return cmd

user_profile='calico'
bucket_name='pooled-cell-painting'

# for p in plates:
#     for w in wells:
#         for s in sites:
# plate_well_dirs_ls=os.listdir(rootDir+'/workspace/analysis/'+batch)
for di in plate_well_dirs_ls:
    f2unarchive='workspace/analysis/'+batch+'/'+di+'/CorrDNA_Site_'+di.split('-')[-1]+'_Overlay.png'
    if os.path.exists(rootDir+f2unarchive):
        cmd='aws s3api  restore-object  --profile '+user_profile+\
        ' --bucket '+bucket_name+'  --key projects/2018_11_20_Periscope_Calico/'+\
        f2unarchive+'   --restore-request GlacierJobParameters={"Tier"="Standard"}'
        print(cmd)
        os.system(cmd)

In [46]:
plate_well_dirs_ls[0].split('-')[-1]

'1'

## Plot geometric median of all guides of an input gene

In [None]:
import glob
rootDir='/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/'
sc_files_dir=rootDir+'workspace/software/CP186-A549-WG/data/1.profiles/20200805_A549_WG_Screen/single_cell/\
single_cell_by_guide/'
batch='20200805_A549_WG_Screen';

### metadata 
metadata_dir=rootDir+'workspace/metadata/'+batch+'/'
metadaDataFiles=os.listdir(metadata_dir)
metadata_orig= pd.read_csv(metadata_dir+metadaDataFiles[0])

# input_gene='KRT28'
box_size=100
im_size=5500
n_cells=0
cell_selection_method='geometric_median'
channels=["Mito","AGP","DNA","RNA","ER","Outline"]

genes_ls=metadata_orig.gene_symbol.unique().tolist()

for igi in range(1224,len(genes_ls)):
    input_gene=genes_ls[igi]
    all_guides_gms_ls=[]
    gene_guids_ls=glob.glob(sc_files_dir+'*_'+input_gene+'.csv.gz')
    for gi in gene_guids_ls:
        df_p_s=pd.read_csv(gi);
        channels=['DNA','Mito','Phalloidin','WGA','ER','Outline']
        for ch in channels:
            df_p_s["PathName_Corr"+ch]=rootDir+batch+'/images_corrected_cropped/'+df_p_s["Metadata_Foci_plate"]+'_'+df_p_s["Metadata_Foci_well"]+'/Corr'+ch
            df_p_s["FileName_Corr"+ch]="Corr"+ch+"_"+"Site_"+df_p_s["Metadata_Foci_site_location"].astype(str)+".tiff"

        df_p_s["Path_Outlines"]=rootDir+'workspace/analysis/'+batch+'/'+df_p_s["Metadata_Foci_plate"]+'-'+df_p_s["Metadata_Foci_well"]+'-'+df_p_s["Metadata_Foci_site_location"].astype(str)+'/'\
        +'/CorrDNA_Site_'+df_p_s["Metadata_Foci_site_location"].astype(str)+'_Overlay.png'

        df_p_s["Nuclei_Location_Center_X"]=df_p_s["Cells_AreaShape_Center_X"];
        df_p_s["Nuclei_Location_Center_Y"]=df_p_s["Cells_AreaShape_Center_Y"];
        
        df_p_s=edgeCellFilter2(df_p_s,im_size,box_size/2);

        if df_p_s.shape[0]>0:
            df_samples = extract_single_cell_samples(df_p_s.sample(n = np.min([1500,df_p_s.shape[0]]),\
                            replace = False).reset_index(drop=True),n_cells,cell_selection_method);
            all_guides_gms_ls.append(df_samples)

    all_guides_gms_df=pd.concat(all_guides_gms_ls,ignore_index=True).drop_duplicates(ignore_index=True)
    
    if all_guides_gms_df.shape[0]==1:
        all_guides_gms_df=pd.concat([all_guides_gms_df]*2, ignore_index=True)    
    
    fig=visualize_n_SingleCell_pooled(channels,all_guides_gms_df,box_size,im_size,title=input_gene+'_'+cell_selection_method);

    resultsDir='/home/ubuntu/bucket/projects/2018_11_20_Periscope_Calico/workspace/visualizations/20200805_A549_WG_Screen/geometric_median_guide_level/'
    fig.savefig(resultsDir+input_gene+'.png')  
    plt.ioff()

In [41]:
df_p_s=pd.read_csv(gi);
df_p_s["Nuclei_Location_Center_X"]=df_p_s["Cells_AreaShape_Center_X"];
df_p_s["Nuclei_Location_Center_Y"]=df_p_s["Cells_AreaShape_Center_Y"];
df_p_s=edgeCellFilter2(df_p_s,im_size,box_size/2);
# df_p_s.shape

In [12]:
# genes_ls.index('LOC653602')
# pd.concat([all_guides_gms_df]*2, ignore_index=True)
len(genes_ls)

20393

In [7]:
igi,all_guides_gms_df.shape,gene_guids_ls

(101,
 (1, 3794),
 ['/home/ubuntu/calbucket/projects/2018_11_20_Periscope_Calico/workspace/software/CP186-A549-WG/data/1.profiles/20200805_A549_WG_Screen/single_cell/single_cell_by_guide/20200805_A549_WG_Screen_single_cell_normalized_ALLBATCHES__TTCATGGCTCCAGGCATCAG_DEFB107B.csv.gz'])

In [27]:
os.system('mkdir -p /home/ubuntu/bucket/projects/2018_11_20_Periscope_Calico/workspace/visualizations/20200805_A549_WG_Screen/geometric_median_guide_level')

0

In [29]:
ls /home/ubuntu/bucket/projects/2018_11_20_Periscope_Calico/workspace/visualizations

[0m[01;34m20200805_A549_WG_Screen[0m/


In [None]:
# metadata_orig.gene_symbol.unique().tolist()