In [1]:
### Figure 4A for the PERISCOPE manuscript. ###
### Script by Meraj Ramezani(mramezan@broadinstitute.org) ###
# Import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as font_manager

%matplotlib inline
# Global seaborn styling applied to every figure drawn after this cell:
# the 'poster' context and the 'white' style.
sns.set_context('poster')
sns.set_style('white')
# Make matplotlib's single-letter colour shorthands resolve to seaborn's palette.
sns.set_color_codes()
# Shared keyword arguments for plotting calls. Not used in the cells visible here;
# presumably passed to scatter plots further down (alpha / marker size / edge width) - TODO confirm.
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

In [2]:
# Load the HeLa HPLM guide-level profiles together with the whole-genome guide
# list, keep only the profiles whose barcode is in the selected guide set, and
# record the feature column names.
df = pd.read_csv("../20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz")
guide_df = pd.read_csv('../wgs_guide_list.csv')

# Guides whose `dialout` value is 1 or 3 form the set named cp257 in this notebook.
cp257_guide_df = guide_df.query('dialout == 1 | dialout ==3')
cp257_guide_list = list(cp257_guide_df['sgRNA'])

# Boolean mask over the profile rows: True where the matched barcode is a selected guide.
barcode_is_selected = df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(cp257_guide_list)
df = df.loc[barcode_is_selected]

# Every column after the first two is treated as a feature.
features = list(df.columns[2:])

In [3]:
# Extract the HeLa expression data from the Broad Institute Dependency Map data set
# and split the screened genes into non-expressed (zero in HELA_CERVIX) and expressed.
express = pd.read_csv("../CCLE_expression_hela.csv")

# Keep only the first whitespace-separated token of every label in the first column
# (presumably 'SYMBOL (id)'-style gene labels - TODO confirm). Done with the
# vectorised string accessor instead of assigning row by row through .iloc.
label_column = express.columns[0]
express[label_column] = express[label_column].str.split().str[0]

express_min, express_max = express['HELA_CERVIX'].min(), express['HELA_CERVIX'].max()

express = express.sort_values(by=['Unnamed: 0']).set_index('Unnamed: 0')

# Genes present in the screen, in order of first appearance in the profiles.
all_genes_list = list(df.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())

# Genes whose HELA_CERVIX expression value is exactly zero.
zero_express = express.query('HELA_CERVIX == 0')
zero_express_list = list(zero_express.index)

# Screened genes with zero expression, de-duplicated and sorted.
# Set operations replace list-membership tests that scanned the whole list per gene.
zero_tpm = sorted(set(zero_express_list).intersection(all_genes_list))

# Screened genes that are not in the zero-expression set; original order is preserved.
zero_tpm_lookup = set(zero_tpm)
expressed_gene_list = [gene for gene in all_genes_list if gene not in zero_tpm_lookup]
# The non-targeting control is not a gene; .remove raises ValueError if it is absent.
expressed_gene_list.remove('nontargeting')

print(len(zero_express_list),len(all_genes_list),len(zero_tpm),len(expressed_gene_list))

33120 20393 3665 16727


In [4]:
# Load the p-values written by the hit-calling process for the HeLa HPLM WGS.
# The first CSV column becomes the (empty-named) index and the remaining column
# labels are converted to integers.
df_p_values = (
    pd.read_csv('../cp257_hplm_sig_features_mann_whitney_whole_genome_median_p_values.csv')
    .rename(columns={'Unnamed: 0': ''})
    .set_index('')
    .rename(columns=int)
)

# Integer names back to string feature names for downstream analysis:
# forward maps feature name -> position, reverse maps position -> feature name.
features_dic_forward = {name: position for position, name in enumerate(features)}
features_dic_reverse = dict(enumerate(features))
df_int_feats = df.rename(columns=features_dic_forward)
df_str_feats = df_int_feats.rename(columns=features_dic_reverse)

# Relabel the p-value columns with feature names and drop the final row (by position).
p_values = df_p_values.rename(columns=features_dic_reverse).iloc[:-1]

p_values

Unnamed: 0,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,...,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_02_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_WGA_10_00_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_02_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_03_256,Nuclei_Texture_Variance_DAPI_Painting_10_03_256
,,,,,,,,,,,,,,,,,,,,,
A1BG,0.086287,0.925936,0.551426,0.213209,4.421232e-09,0.742758,0.103179,0.107254,1.039936e-01,0.989404,...,0.100794,0.000728,0.472134,0.013438,0.103984,0.319254,0.085997,0.922922,0.083260,0.089521
A1CF,0.024309,0.910878,0.635356,0.284026,6.537873e-01,0.000759,0.883859,0.074847,8.591084e-01,0.103179,...,0.007864,0.182309,0.132473,0.565407,0.535020,0.338038,1.000000,0.537521,0.431102,0.001465
A2M,0.844763,0.085306,0.109259,0.020245,1.978586e-01,0.998486,0.904864,0.088098,9.343601e-01,0.106429,...,0.255798,0.971246,0.002417,0.000944,0.113171,0.604517,0.934984,0.995459,0.407079,0.097683
A2ML1,0.867739,0.000769,0.677780,0.025791,1.375126e-01,0.013581,0.000769,0.152049,9.251428e-07,0.998486,...,0.953103,0.989404,0.083938,0.139445,0.062733,0.042946,0.950081,0.079286,0.105609,0.005512
A3GALT2,0.645863,0.123908,0.914012,0.873501,1.119651e-01,0.093902,0.000708,0.000634,1.713227e-07,0.125774,...,0.103984,0.977298,0.557733,0.086693,0.907871,1.000000,0.108084,0.096156,0.182309,0.009716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.992698,0.874881,0.594029,0.406832,1.417864e-01,0.986377,0.172557,0.128612,1.978586e-01,0.085997,...,0.111455,0.547581,0.433326,0.381805,0.683360,0.596598,0.109759,0.255798,0.205734,0.355538
ZYG11B,0.566302,0.604516,0.000220,0.096451,8.418995e-01,0.965196,0.003769,0.081918,1.064657e-03,0.789088,...,0.196377,0.247947,0.000679,0.326684,0.048278,0.562843,0.111454,0.683360,0.907871,0.134437
ZYX,0.000005,0.702963,0.003290,0.995619,1.333319e-01,0.085306,0.258986,0.109759,5.940290e-01,0.095400,...,0.008795,0.093160,0.088098,0.088098,0.085306,0.031599,0.237241,0.019132,0.001251,0.702964


In [None]:
# Calculate the morphological signal score for each perturbation and sort.
# The score of a gene is the sum of -log10(p) over the features whose p-value
# is at or below the significance cutoff.
significance_cutoff = 0.001

# NOTE(review): the last entry of expressed_gene_list is dropped by position here;
# confirm which gene that is and that excluding it is intended.
p_values = p_values.loc[expressed_gene_list[:-1]]
p_values_neg_log = -np.log10(p_values)

# Multiplying by the boolean mask zeroes out the non-significant features.
is_significant = p_values <= significance_cutoff
signal_scores = (is_significant * p_values_neg_log).sum(axis=1)
ranked_list = pd.DataFrame(signal_scores.sort_values(ascending=False))

ranked_list

Unnamed: 0,0
,
ATP2A2,7208.687082
ILK,6955.854113
RAC1,6368.172435
EIF4E,6367.152512
ITGB1,6258.312566
...,...
HIAT1,229.225314
ZNF788,189.001641
LOC101929983,158.197188


In [62]:
# Reshape the ranked scores into a two-column table (gene id, score) and write it
# as a tab-separated .rnk file.
#
# The reshaping is guarded so that the cell can be re-run safely: calling
# reset_index() again on the already-converted table would add an extra 'index'
# column, and the rename would then turn the score column '' into a second 'GID'
# column, corrupting the exported file.
if 'GID' not in ranked_list.columns:
    # The gene index is named '' upstream, so reset_index() yields a column called '';
    # it becomes 'GID', and the score column (0) gets an empty header.
    ranked_list = ranked_list.reset_index().rename(columns={'': 'GID', 0: ''})
ranked_list.to_csv('hela_hplm_all_genes.rnk', sep='\t', index=False)
ranked_list

Unnamed: 0,GID,Unnamed: 2
0,ATP2A2,7208.687082
1,ILK,6955.854113
2,RAC1,6368.172435
3,EIF4E,6367.152512
4,ITGB1,6258.312566
...,...,...
16721,HIAT1,229.225314
16722,ZNF788,189.001641
16723,LOC101929983,158.197188
16724,SGOL2,139.377820
