In [1]:
### Figure 4A for the PERISCOPE manuscript. ###
### Script by Meraj Ramezani(mramezan@broadinstitute.org) ###
# Import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as font_manager

%matplotlib inline
# Global seaborn look for every figure drawn after this cell: the large
# 'poster' scaling context on a plain 'white' background style.
sns.set_context('poster')
sns.set_style('white')
# Remap matplotlib's single-letter colour codes ('b', 'g', 'r', ...) to the
# seaborn palette.
sns.set_color_codes()
# Shared plotting keyword arguments (transparency, marker size, no marker edge).
# NOTE(review): presumably passed to scatter plots in later cells — it is not
# used in the cells visible here; confirm before removing.
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

In [2]:
# Load the guide-level HeLa DMEM profiles and the whole-genome guide list.
df = pd.read_csv("../20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz")
guide_df = pd.read_csv('../wgs_guide_list.csv')

# Keep only the guides whose `dialout` value is 1 or 3.
# NOTE(review): presumably these dialout values identify the CP257 library — confirm.
in_cp257 = (guide_df['dialout'] == 1) | (guide_df['dialout'] == 3)
cp257_guide_df = guide_df[in_cp257]
cp257_guide_list = cp257_guide_df['sgRNA'].tolist()

# Restrict the profiles to barcodes that match one of the selected guides.
barcode_in_library = df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(cp257_guide_list)
df = df[barcode_in_library]

# Every column after the first two is treated as a feature column.
features = df.columns[2:].tolist()

In [3]:
# Extract the expression data from the Broad Institute Dependency Map data set
express = pd.read_csv("../CCLE_expression_hela.csv")

# Keep only the first whitespace-separated token of each label in the first
# column. This is done with one vectorised string operation instead of a
# per-row `iloc` assignment loop; labels that are missing or blank become NaN
# rather than raising.
gene_label_col = express.columns[0]
express[gene_label_col] = express[gene_label_col].str.split().str[0]

# Range of the HELA_CERVIX expression values.
# NOTE(review): not used in the cells visible here — presumably consumed by a later cell.
express_min , express_max = express['HELA_CERVIX'].min(), express['HELA_CERVIX'].max()

# Index the expression table by the (sorted) gene label.
express = express.sort_values(by = ['Unnamed: 0']).set_index('Unnamed: 0')

# Genes present in the screen, in order of first appearance in `df`.
all_genes_list = list(df.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())
# Genes whose HELA_CERVIX expression value is exactly 0.
zero_express = express.query('HELA_CERVIX == 0')
zero_express_list = list(zero_express.index)

# Screened genes with zero expression, de-duplicated and sorted.
# Set membership replaces the list-in-list scans, which were O(n * m).
screened_genes = set(all_genes_list)
zero_tpm = sorted({gene for gene in zero_express_list if gene in screened_genes})

# Remaining screened genes, keeping the original order of `all_genes_list`
# (a later cell slices this list by position). The 'nontargeting' control label
# is filtered out here, so the cell no longer raises if that label is absent.
zero_tpm_lookup = set(zero_tpm)
expressed_gene_list = [
    gene for gene in all_genes_list
    if gene not in zero_tpm_lookup and gene != 'nontargeting'
]

print(len(zero_express_list),len(all_genes_list),len(zero_tpm),len(expressed_gene_list))

33120 20394 3665 16728


In [4]:
# Load the p-values for the HeLa DMEM whole-genome screen produced by the hit-calling step.
df_p_values = pd.read_csv('../cp257_dmem_sig_features_mann_whitney_whole_genome_median_p_values_mito.csv')
# The unnamed first column becomes the index (named with an empty string).
df_p_values = df_p_values.rename(columns={'Unnamed: 0': ''}).set_index('')
# Convert the remaining column labels to integers so they can be looked up by position.
df_p_values = df_p_values.rename(columns=int)

# Map integer column labels back to string feature names for downstream analysis.
# NOTE(review): this assumes the integer labels in the p-value file follow the same
# order as `features` — confirm against the hit-calling output.
features_dic_forward = {name: position for position, name in enumerate(features)}
features_dic_reverse = dict(enumerate(features))
df_int_feats = df.rename(columns=features_dic_forward)
df_str_feats = df_int_feats.rename(columns=features_dic_reverse)

# Rename the p-value columns to feature names and drop the final row by position.
# NOTE(review): the dropped last row is presumably the non-targeting control — confirm.
p_values = df_p_values.rename(columns=features_dic_reverse).iloc[:-1]

p_values

Unnamed: 0,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,...,Nuclei_Texture_SumVariance_DAPI_Painting_10_02_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_01_256
,,,,,,,,,,,,,,,,,,,,,
A1BG,0.697342,0.775861,0.570550,1.213405e-01,0.109759,0.004844,0.150967,0.575716,0.005910,0.115784,...,0.824342,0.184809,0.207096,0.073007,0.001713,0.925936,0.086693,0.058566,0.164348,0.088807
A1CF,0.974272,0.036127,0.919909,1.517474e-01,0.280380,0.222520,0.001427,0.290648,0.222520,0.080593,...,0.066577,0.162058,0.100009,0.938001,0.000679,0.002542,0.548845,0.604517,0.136423,0.717090
A2M,0.515231,0.505260,0.694537,4.436528e-01,0.085997,0.070023,0.252637,0.037420,0.069439,0.117552,...,0.012471,0.012405,0.308318,0.113171,0.091691,0.449077,0.650251,0.992432,0.515231,0.977298
A2ML1,0.127660,0.206865,0.562843,1.819787e-01,0.085306,0.117552,0.237241,0.102379,0.109759,0.038120,...,0.980324,0.947060,0.034404,0.263820,0.768716,0.089521,0.965196,0.922922,0.532526,0.983351
A3GALT2,0.728460,0.005055,0.623175,2.859803e-03,0.000643,0.292384,0.083937,0.620494,0.453637,0.084620,...,0.000583,0.071802,0.732740,0.275340,0.108918,0.002606,0.015898,0.998486,0.001669,0.028165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.575717,0.986371,0.500642,7.377111e-01,0.086693,0.085997,0.257388,0.010898,0.085306,0.812547,...,0.020127,0.013016,0.714256,0.580904,0.786169,0.246397,0.207096,0.103984,0.088098,0.111454
ZYG11B,0.000699,0.148080,0.109759,9.198202e-01,0.088098,0.008179,0.216819,0.000575,0.006190,0.343809,...,0.079286,0.005384,0.469799,0.131499,0.977298,0.238750,0.815492,0.119342,0.134437,0.071205
ZYX,0.800795,0.096367,0.440037,9.839367e-01,0.100009,0.989404,1.000000,0.493430,0.989404,0.959148,...,0.085306,0.109759,0.500641,0.104794,0.527554,0.647522,0.143553,0.000575,0.748503,0.091691


In [5]:
# Calculate the morphological signal score for each perturbation and sort.
# NOTE(review): the last entry of `expressed_gene_list` is dropped by position
# before the lookup — presumably it has no row in `p_values`; confirm.
genes_to_score = expressed_gene_list[:-1]
p_values = p_values.loc[genes_to_score]
p_values_neg_log = -np.log10(p_values)

# Score per gene = sum of -log10(p) over the features with p <= 0.001;
# features above the cutoff contribute 0.
is_significant = p_values <= 0.001
signal_scores = (is_significant * p_values_neg_log).sum(axis=1)
ranked_list = pd.DataFrame(signal_scores.sort_values(ascending=False))

ranked_list

Unnamed: 0,0
,
MED12,3505.619834
ARPC4,3159.619834
PGGT1B,3092.080485
ARPC2,2994.974043
TRRAP,2966.084552
...,...
TMEM267,122.629088
PJVK,88.806326
ERBB2IP,87.040534


In [6]:
# Turn the gene index into a 'GID' column and give the score column an empty
# header, then write the ranking as a tab-separated .rnk file.
# The reshaping is guarded so that re-running this cell is safe: without the
# guard a second run would call reset_index() again, add an extra 'index'
# column, and rename the score column to a duplicate 'GID', corrupting the
# exported file.
if 'GID' not in ranked_list.columns:
    ranked_list = ranked_list.reset_index().rename(columns={'': 'GID', 0: ''})
ranked_list.to_csv('hela_dmem_all_genes.rnk', sep='\t', index=False)
ranked_list

Unnamed: 0,GID,Unnamed: 2
0,MED12,3505.619834
1,ARPC4,3159.619834
2,PGGT1B,3092.080485
3,ARPC2,2994.974043
4,TRRAP,2966.084552
...,...,...
16722,TMEM267,122.629088
16723,PJVK,88.806326
16724,ERBB2IP,87.040534
16725,NPY4R2,71.500876
