In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from pycytominer import aggregate

%matplotlib inline  

In [2]:
# Set data input folder
input_folder = "inputs"

# Set output folder, subfolder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)
if not os.path.exists(os.path.join(output_folder,"figure_panels")):
    os.makedirs(os.path.join(output_folder,"figure_panels"), exist_ok=True)

In [3]:
# Load barcodes used in experiment
barcodes_df = pd.read_csv('../common_files/Barcodes.csv')
barcodes_df = barcodes_df[['gene_symbol','sgRNA']]
barcodes_df = barcodes_df.rename(columns={'gene_symbol':'gene'})
#guide_list = list(guide_df['sgRNA'])
guide_list = list(barcodes_df.sgRNA.unique())
len(guide_list)

80862

In [4]:
screens = ['A549','HeLa_DMEM','HeLa_HPLM']
guide_profiles_dics = {}
guide_profiles_dics['A549'] = pd.read_csv('../Profile_Aggregation/outputs/20200805_A549_WG_Screen_guide_normalized_feature_select_median_merged_ALLBATCHES___CP186___ALLWELLS.csv.gz')
guide_profiles_dics['A549'] = guide_profiles_dics['A549'][guide_profiles_dics['A549']['Metadata_Foci_Barcode_MatchedTo_Barcode'].isin(guide_list)].reset_index(drop=True)
guide_profiles_dics['HeLa_DMEM'] = pd.read_csv('../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz')
guide_profiles_dics['HeLa_DMEM'] = guide_profiles_dics['HeLa_DMEM'][guide_profiles_dics['HeLa_DMEM']['Metadata_Foci_Barcode_MatchedTo_Barcode'].isin(guide_list)].reset_index(drop=True)
guide_profiles_dics['HeLa_HPLM'] = pd.read_csv('../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz')
guide_profiles_dics['HeLa_HPLM'] = guide_profiles_dics['HeLa_HPLM'][guide_profiles_dics['HeLa_HPLM']['Metadata_Foci_Barcode_MatchedTo_Barcode'].isin(guide_list)].reset_index(drop=True)
guide_profiles_dics['HeLa_DMEM']

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,...,Nuclei_Texture_SumVariance_DAPI_Painting_10_02_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_01_256
0,A1BG,CAAGAGAAAGACCACGAGCA,0.016441,-0.000100,-0.138460,0.003857,0.001038,0.980080,0.788680,-0.392810,...,-0.68377,-0.48894,-0.048677,-0.028407,-0.105700,-0.240800,-0.181300,-0.28261,-0.219630,-0.428860
1,A1BG,CATCTTCTTTCACCTGAACG,0.008520,0.171140,0.000794,0.006531,0.000064,0.293300,0.521390,0.188500,...,0.31212,0.35947,-0.154080,-0.017084,-0.076653,-0.022525,-0.009099,-0.29626,-0.163430,-0.488400
2,A1BG,CTCCGGGGAGAACTCCGGCG,-0.002752,-0.267570,-0.000020,-0.001949,-0.009853,0.188720,0.085372,0.260530,...,-0.11342,-0.32553,-0.008285,-0.032883,-0.006471,-0.175300,0.008365,-0.38693,-0.139000,-0.308780
3,A1BG,TGGAAGTCCACTCCACTCAG,-0.002541,-0.010418,-0.000956,0.011877,0.000071,-0.128240,-0.334610,0.072320,...,-0.44236,-0.22552,-0.048856,-0.050878,-0.022580,-0.050792,-0.038298,-0.14036,0.252750,0.008285
4,A1CF,AGTTATGTTAGGTATACCCG,0.004307,0.037445,-0.032165,-0.002427,0.000261,-0.362320,0.478180,-0.123290,...,-0.22953,-0.43204,-0.003427,-0.064101,0.136110,0.061535,-0.134130,-0.26887,0.023397,-0.261620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80797,nontargeting,TTTATGCATTTAATACGCCG,0.001273,0.018101,0.000890,0.001313,-0.000374,-0.055722,0.099351,0.069365,...,-0.23705,-0.20256,-0.031417,-0.037462,-0.114560,-0.132490,-0.107730,-0.17507,-0.217230,-0.070118
80798,nontargeting,TTTCTAGTTACTACTGGACG,0.000851,0.007969,-0.002324,0.001188,-0.000311,-0.069128,-0.055467,0.052259,...,-0.22518,-0.24198,-0.070424,-0.033117,-0.123770,-0.146650,-0.123340,-0.19341,-0.172650,-0.156660
80799,nontargeting,TTTGGCAGTACCTTTTATTA,-0.002488,-0.020554,-0.002961,0.001481,-0.000375,-0.145710,0.007123,0.134060,...,-0.24169,-0.25947,-0.097440,-0.045132,-0.129210,-0.142610,-0.131210,-0.13088,-0.176260,-0.191660
80800,nontargeting,TTTTACCTTGTTCACATGGA,-0.001427,0.018769,-0.001960,-0.001710,-0.000271,-0.124700,0.070634,0.011164,...,-0.16827,-0.20667,-0.107190,-0.053795,-0.105610,-0.130490,-0.109310,-0.24267,-0.220700,-0.218470


In [5]:
profile_pca_dics = {}
for screen in screens:
    gene_list = list(guide_profiles_dics[screen].Metadata_Foci_Barcode_MatchedTo_GeneCode)
    df_temp = guide_profiles_dics[screen].drop('Metadata_Foci_Barcode_MatchedTo_GeneCode',axis=1).set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')
    # Perform principal component analysis on hit list
    pca = PCA()
    pca.fit(df_temp)
    x = list(pca.explained_variance_ratio_)
    # Find principal component that represents 90% variation
    PCA_lookup = {}
    for i in range(len(x)):
        distance = abs(.9-sum(x[:i+1]))
        PCA_lookup[distance] = i 
    component = PCA_lookup[min(PCA_lookup.keys())]+1
    print (f'Principal component representing closest to 90% variation is {component} for {screen} screen')
    # Perform principal component analysis and select components representing 90% of variation in data
    pca = PCA(n_components=component)
    profile_pca_dics[screen] = pd.DataFrame(pca.fit_transform(df_temp),index=df_temp.index)
    profile_pca_dics[screen] = profile_pca_dics[screen].reset_index()
    pca_feat_list = list(profile_pca_dics[screen].columns)
    feat_list = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
    feat_list.extend(pca_feat_list)
    profile_pca_dics[screen]["Metadata_Foci_Barcode_MatchedTo_GeneCode"] = gene_list
    profile_pca_dics[screen] = profile_pca_dics[screen][feat_list]

Principal component representing closest to 90% variation is 700 for A549 screen
Principal component representing closest to 90% variation is 721 for HeLa_DMEM screen
Principal component representing closest to 90% variation is 617 for HeLa_HPLM screen


In [7]:
whole_cell_hits_dict = {}
comp_spec_hits_dict = {}

whole_cell_hits_dict['A549'] = pd.read_csv('../4_A549_Screen_Summary/outputs/A549_plate_level_median_per_feat_sig_genes_1_FDR_whole_cell_hits.csv')
comp_spec_hits_dict['A549'] = pd.read_csv('../4_A549_Screen_Summary/outputs/A549_plate_level_median_per_feat_sig_genes_1_FDR_compartment_specific_hits.csv')

whole_cell_hits_dict['HeLa_DMEM'] = pd.read_csv('../2_HeLa_Screens_Summary/outputs/HeLa_DMEM_plate_level_median_per_feat_sig_genes_1_FDR_whole_cell_hits.csv')
comp_spec_hits_dict['HeLa_DMEM'] = pd.read_csv('../2_HeLa_Screens_Summary/outputs/HeLa_DMEM_plate_level_median_per_feat_sig_genes_1_FDR_compartment_specific_hits.csv')

whole_cell_hits_dict['HeLa_HPLM'] = pd.read_csv('../2_HeLa_Screens_Summary/outputs/HeLa_HPLM_plate_level_median_per_feat_sig_genes_1_FDR_whole_cell_hits.csv')
comp_spec_hits_dict['HeLa_HPLM'] = pd.read_csv('../2_HeLa_Screens_Summary/outputs/HeLa_HPLM_plate_level_median_per_feat_sig_genes_1_FDR_compartment_specific_hits.csv')

In [8]:
whole_cell_hits_profile_dict = {}
comp_spec_hits_profile_dict = {}
for screen in screens:
    whole_cell_hit_list = list(whole_cell_hits_dict[screen].Gene)
    comp_spec_hit_list = list(comp_spec_hits_dict[screen].Gene)
    whole_cell_hits_profile_dict[screen] = profile_pca_dics[screen][profile_pca_dics[screen]['Metadata_Foci_Barcode_MatchedTo_GeneCode'].isin(whole_cell_hit_list+['nontargeting'])]
    comp_spec_hits_profile_dict[screen] = profile_pca_dics[screen][profile_pca_dics[screen]['Metadata_Foci_Barcode_MatchedTo_GeneCode'].isin(comp_spec_hit_list+['nontargeting'])]
    

In [9]:
def cosine_to_df(df_temp, cosine_array, i):
    cosine_list = cosine_array[i]
    gene_list = list(df_temp.index)
    cosine_df = pd.DataFrame(index=gene_list)
    cosine_df['cosine'] = cosine_list
    cosine_df = cosine_df.sort_values('cosine',ascending=False)   
    return cosine_df

def ap_from_cosine_df(cosine_df,gene,n=10):    
    #print(cosine_df.iloc[:20])
    index_list = list(cosine_df.index)
    boolean = [1 if  i == gene else 0 for i in index_list ]
    grades_list=[]
    for i in range(2,n+2):
        pre_grade = sum(boolean[1:i])/(i-1)
        grades_list.append(pre_grade*boolean[i-1])
    return sum(grades_list)/3

def calculate_map(df_guide, gene):
    df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting' | Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    #print(df_temp)
    ap_list = []
    cosine_array = cosine_similarity(df_temp)
    for guide in range(4):
        cosine_df = cosine_to_df(df_temp, cosine_array, guide)
        #print(cosine_df[:10])
        guide_ap = ap_from_cosine_df(cosine_df,gene,10)
        ap_list.append(guide_ap)
    return np.mean(ap_list)

In [10]:
# calculate the mAP values
rep_results_whole = {}
for screen in screens:
    genes_list = list(whole_cell_hits_profile_dict[screen].Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())
    map_list = []
    for i in range(len(genes_list)):
        gene = genes_list[i]
        gene_map = calculate_map(whole_cell_hits_profile_dict[screen], gene)
        map_list.append([gene, gene_map])
    rep_results_whole[screen] = map_list

ys = []
for screen in screens:
    y = [i[1] for i in rep_results_whole[screen] if i[1]]
    ys.append(np.mean(y))
print('mAP values (whole cell hits) for A549: ',ys[0],'HeLa DMEM: ',ys[1],'HeLa HPLM: ',ys[2])

mAP values (whole cell hits) for A549:  0.42085786556337934 HeLa DMEM:  0.45192489883330506 HeLa HPLM:  0.4939820959352209


In [11]:
# calculate the mAP values
rep_results_comp = {}
for screen in screens:
    genes_list = list(comp_spec_hits_profile_dict[screen].Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())
    map_list = []
    for i in range(len(genes_list)):
        gene = genes_list[i]
        gene_map = calculate_map(comp_spec_hits_profile_dict[screen], gene)
        map_list.append([gene, gene_map])
    rep_results_comp[screen] = map_list

xs = []
for screen in screens:
    x = [i[1] for i in rep_results_comp[screen] if i[1]]
    xs.append(np.mean(x))
print('mAP values (compartment hits) for A549: ',xs[0],'HeLa DMEM: ',xs[1],'HeLa HPLM: ',xs[2])

mAP values (compartment hits) for A549:  0.19149255371300097 HeLa DMEM:  0.15855886986504963 HeLa HPLM:  0.18372510919152707
