In [1]:
### Figure 2C for the PERISCOPE manuscript. ###
### Script by Meraj Ramezani(mramezan@broadinstitute.org) ###
# Import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations
import matplotlib.font_manager as font_manager


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn defaults for the session. NOTE(review): the plotting cell further down
# calls sns.set_theme(style="white", rc=...) again, which presumably replaces the
# 'poster' context set here before the figure is drawn -- confirm this is intended.
sns.set_context('poster')
sns.set_style('white')
# Remap matplotlib's single-letter colour codes ('b', 'r', ...) to the seaborn palette;
# the plotting cell passes color='b' / color='r'.
sns.set_color_codes()
# Scatter-style keyword arguments; not referenced by any other cell visible in this notebook.
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

In [None]:
# Load the A549 gene-level profiles and the hit tables written by the hit-calling step,
# then keep only the profile rows that belong to a hit gene.
def _load_hit_table(csv_path):
    """Read a hit CSV, name its unnamed first column 'Gene' and use it as the index."""
    hit_table = pd.read_csv(csv_path)
    hit_table = hit_table.rename(columns={'Unnamed: 0':'Gene'})
    return hit_table.set_index('Gene')


df = pd.read_csv(
    '../20200805_A549_WG_Screen_guide_normalized_feature_select_merged_median_ALLBATCHES___CP186___ALLWELLS_gene_aggregated.csv.gz'
)
df = df.set_index('Metadata_Foci_Barcode_MatchedTo_GeneCode')

comp_spef_hits = _load_hit_table('../Fig2_A_B/a549_plate_level_median_per_feat_sig_genes_5_fdr_compartment_specific_hits.csv')
whole_cell_hits = _load_hit_table('../Fig2_A_B/a549_plate_level_median_per_feat_sig_genes_5_fdr_whole_cell_hits.csv')

# Compartment-specific hits first, whole-cell hits second (same order as the row selection below)
hit_list = [*comp_spef_hits.index, *whole_cell_hits.index]
df_hits = df.loc[hit_list]
df_hits

Unnamed: 0_level_0,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,Cells_AreaShape_HuMoment_6,Cells_AreaShape_NormalizedMoment_0_2,...,Nuclei_Texture_SumVariance_ConA_5_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_03_256
Metadata_Foci_Barcode_MatchedTo_GeneCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AARS,-0.045047,-0.017224,-0.072438,0.005527,-0.053934,0.153853,0.393330,-0.232846,0.002481,-0.314468,...,0.042180,-0.265725,-0.159218,0.071226,-0.074440,-0.002175,-0.083249,0.008703,-0.023501,-0.042191
AARS2,-0.051243,0.009448,0.026790,-0.043807,-0.012882,-0.232335,0.147636,-0.219370,0.002847,-0.347326,...,-0.353250,-0.279907,-0.332005,-0.075201,-0.003135,-0.015622,-0.047501,-0.237655,-0.188850,-0.149119
ABCB1,0.047199,0.006450,0.074548,0.003445,0.176262,0.375105,0.083015,0.048952,0.002811,-0.372090,...,-0.179690,-0.034004,0.049832,-0.030342,-0.129509,-0.141190,-0.115632,-0.113522,-0.112155,-0.135680
ABCC12,0.032234,-0.008989,0.022659,0.010388,0.075094,0.304862,0.353020,-0.173005,0.002669,-0.150201,...,-0.282898,-0.055039,-0.240565,-0.116965,-0.119747,-0.107041,-0.149995,-0.197220,-0.220073,-0.195365
ABCC6,0.081149,0.009756,0.008137,0.028262,0.419525,-0.055530,-0.133479,0.019405,0.002637,-0.106360,...,-0.406095,-0.039135,0.079254,-0.077007,-0.195710,-0.167745,-0.162812,-0.326115,-0.198072,-0.158630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF839,-0.066134,-0.018271,-0.024306,-0.062648,0.163565,-0.156702,0.393775,-0.087600,0.003056,-0.029900,...,-0.231338,-0.421132,-0.386149,-0.036153,-0.143302,-0.199990,-0.153215,-0.101859,0.028422,-0.097757
ZNHIT3,-0.107170,-0.002141,0.033646,0.004076,-0.076012,0.125035,0.180921,-0.289795,0.002928,0.136635,...,-0.466995,-0.572875,-0.293130,-0.097884,-0.082042,-0.021018,-0.049476,-0.225265,-0.273350,-0.190145
ZSCAN2,0.009568,-0.001156,0.010852,-0.013950,0.402135,-0.088981,0.046101,-0.057136,0.002899,0.062550,...,-0.444465,-0.378850,-0.389340,-0.060537,-0.205370,-0.236340,-0.165260,-0.277555,-0.208030,-0.232805
ZSCAN21,0.191183,-0.020706,0.069739,0.015839,-0.295015,-0.047918,-0.423525,-0.122584,0.002997,-0.133551,...,0.116691,-0.257145,-0.214258,-0.056209,-0.013722,-0.108214,-0.055857,-0.030136,-0.009387,-0.045435


In [None]:
# Calculate the correlation between the profiles of all hit gene pairs and create a
# dictionary mapping each unordered gene pair (a frozenset of the two gene names) to
# its correlation.
df_hits_corr = df_hits.T.corr()
genes = list(df_hits_corr.index)

# Pull the values out once: indexing a NumPy array is far cheaper than a scalar
# DataFrame .loc lookup repeated for every pair.
corr_matrix = df_hits_corr.to_numpy()

corr_dic = {}
# The correlation matrix is symmetric, so each pair is visited once (upper triangle,
# i < j). This gives the same keys, values and insertion order as looping over both
# (gene1, gene2) and (gene2, gene1), with half the iterations.
for i, gene1 in enumerate(genes):
    for j in range(i + 1, len(genes)):
        gene2 = genes[j]
        if gene1 == gene2:
            # Identical labels would collapse into a one-element key; skip them,
            # as the original self-pair check did.
            continue
        corr_dic[frozenset((gene1, gene2))] = corr_matrix[i, j]

print(len(corr_dic))

In [None]:
# Read the CORUM3.0 protein complex table (tab-separated) and keep only the rows
# whose 'Organism' column is "Human".
ppi_data = pd.read_csv('../allComplexes 2.txt',sep = '\t')
is_human = ppi_data['Organism'] == "Human"
ppi_data_h = ppi_data.loc[is_human]
ppi_data_h

In [None]:
# Create a list of protein clusters: for every human CORUM complex, collect the subunits
# that are A549 WGS hits and keep the complex when at least 66% of its subunits are hits.
#
# Results:
#   hit_set               - every hit gene that appears in any complex (regardless of threshold)
#   cluster_count         - number of complexes passing the threshold
#   hit_cluster_list_list - for each passing complex, its hit subunits in their original order
MIN_HIT_FRACTION = 0.66  # minimum fraction of a complex's subunits that must be hits

# Set membership is O(1); testing against the `genes` list scans it for every subunit.
hit_gene_lookup = set(genes)

cluster_count = 0
hit_cluster_list_list = []
hit_set = set()
# Iterate the subunit column directly instead of building a full row with .iloc[i] each time.
for subunits in ppi_data_h['subunits(Gene name)']:
    cluster = subunits.split(';')
    hit_cluster_list = [g for g in cluster if g in hit_gene_lookup]
    hit_set.update(hit_cluster_list)
    # A complex at or above the threshold always has at least one hit subunit, so the
    # counter and the list grow together (the original checked this twice).
    if len(hit_cluster_list) / len(cluster) >= MIN_HIT_FRACTION:
        cluster_count += 1
        hit_cluster_list_list.append(hit_cluster_list)
print(len(hit_set),cluster_count,len(hit_cluster_list_list))

In [None]:
# Assign correlations to hit gene pairs that share a CORUM complex.
# hit_pair_set holds each unordered within-complex pair once (as a frozenset), so a pair
# that occurs in several complexes is only counted a single time.
hit_pair_set = set()
for complex_genes in hit_cluster_list_list:
    for first_gene, second_gene in permutations(complex_genes, 2):
        # A complex that lists the same gene twice would produce a (g, g) pair; that
        # collapses to a one-element frozenset which corr_dic never contains and would
        # raise KeyError in the lookup below, so skip it.
        if first_gene == second_gene:
            continue
        hit_pair_set.add(frozenset((first_gene, second_gene)))

# Look up the profile correlation for every within-complex pair.
hit_corr_dic = {pair: corr_dic[pair] for pair in hit_pair_set}

print('Number of hit pairs',len(hit_pair_set),'\n',
      'Number of hit pairs with correlation',len(hit_corr_dic))

In [None]:
# Fig 2C: distribution of profile correlations for all hit gene pairs, overlaid with
# the distribution for hit gene pairs that belong to the same CORUM complex.
sns.set_theme(style="white",rc = {'axes.linewidth': 0.7,'xtick.labelsize':17,'ytick.labelsize':17})

fig, ax = plt.subplots(figsize=(7,4))

all_pair_corrs = list(corr_dic.values())        # every hit gene pair
corum_pair_corrs = list(hit_corr_dic.values())  # pairs sharing a CORUM complex

# Draw both histograms with identical settings; all pairs (blue) first, CORUM pairs (red) on top.
for corr_values, colour in ((all_pair_corrs, 'b'), (corum_pair_corrs, 'r')):
    sns.histplot(
        corr_values,
        color=colour,
        stat='percent',
        bins=100,
        kde=True,
        line_kws={'lw':1.4},
        alpha=0.5,
        ax=ax,
    )

ax.set_xlabel('Correlation of profiles',size=17)
ax.set_ylabel('Gene pair frequency',size=17)

# Legend labels are given positionally, in the order the two distributions were drawn.
font = font_manager.FontProperties(size=14)
ax.legend(['Hit gene pairs','Hit gene pairs in CORUM complex'],loc='upper left',prop=font )

sns.despine(top = True)

fig.savefig('Fig2_C_A549_CORUM_gene_pair_correlation_distribution.png', dpi=300,facecolor='w', edgecolor='w', bbox_inches='tight')
plt.show()