In [5]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from sklearn.decomposition import PCA
import umap.umap_ as umap
import hdbscan

%matplotlib inline  
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

In [2]:
# Set data input folder
# Download files described in README.md to this folder
input_folder = "inputs"

# Set output folder, subfolder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)
if not os.path.exists(os.path.join(output_folder,"figure_panels")):
    os.makedirs(os.path.join(output_folder,"figure_panels"), exist_ok=True)

In [8]:
# Import the HeLa DMEM guide-level profiles
df_guide_DMEM = pd.read_csv("../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz")
# Import the HeLa HPLM guide-level profiles
df_guide_HPLM = pd.read_csv("../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz")

# Import the HeLa DMEM gene-level profiles
df_gene_DMEM = pd.read_csv("../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS_gene_aggregated.csv.gz")
# Import the HeLa HPLM gene-level profiles
df_gene_HPLM = pd.read_csv("../Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS_gene_aggregated.csv.gz")

# Import the GO annotations
with open("../common_files/GO_gene_sets.json") as f:
    GO_dict = json.load(f)

# Load p_values from the hit calling process
df_p_values_DMEM = pd.read_csv('../Hit_Calling/outputs/HeLa_DMEM_significant_features_mann_whitney_p_values.csv.gz')
df_p_values_DMEM = df_p_values_DMEM.set_index('Gene')
df_p_values_HPLM = pd.read_csv('../Hit_Calling/outputs/HeLa_HPLM_significant_features_mann_whitney_p_values.csv.gz')
df_p_values_HPLM = df_p_values_HPLM.set_index('Gene')

# Load expression data 
with open("../Hit_Calling/outputs/HeLa_CCLE_expression_summary.json") as f:
    express = json.load(f)
    zero_tpm_list = express['zero_tpm']
    expressed_gene_list = express['expressed_genes']

# Input the protein clusters from the CORUM and STRING databases
CORUM_data = pd.read_csv(os.path.join(input_folder,'CORUM_humanComplexes.txt',sep = '\t'))
STRING_data = pd.read_csv(os.path.join(input_folder,'STRING_data.csv'))

# Figure 3A and 3B

In [12]:
# Extract the features from the guide level profiles
feature_dict = {}
for (df_p_value, condition) in [(df_p_values_DMEM, 'DMEM'), (df_p_values_HPLM, 'HPLM')]:
      features = list(df_p_value.columns)[2:]
      mito_list = [feature for feature in df_p_value.columns if 'mito'  in feature.lower()]
      cona_list = [feature for feature in df_p_value.columns if 'cona'  in feature.lower()]
      dapi_list = [feature for feature in df_p_value.columns if 'dapi'  in feature.lower()]
      wga_list = [feature for feature in df_p_value.columns if 'wga'  in feature.lower()]
      phalloidin_list = [feature for feature in df_p_value.columns if 'phalloidin'  in feature.lower()]
      feature_dict[condition] = {'mito':mito_list,'cona':cona_list,'dapi':dapi_list,'wga':wga_list,'phalloidin':phalloidin_list,'sum':features}
      print(f'Number of features per compartment, {condition}: ','\n',
            'Mito features: ',len(mito_list),'\n',
            'ConA features: ',len(cona_list),'\n',
            'DAPI features: ',len(dapi_list),'\n',
            'WGA features: ',len(wga_list),'\n',
            'Phalloidin features: ',len(phalloidin_list))

Number of features per compartment, DMEM:  
 Mito features:  376 
 ConA features:  223 
 DAPI features:  379 
 WGA features:  287 
 Phalloidin features:  220
Number of features per compartment, HPLM:  
 Mito features:  399 
 ConA features:  250 
 DAPI features:  370 
 WGA features:  337 
 Phalloidin features:  261


In [None]:
# Calculate false discovery rate (FDR)
def FDR_5(control_sig_feature):
    total = len(control_sig_feature)
    fdr_5 = int(total/20)
    FDR_dict = {}
    for channel in ['Mito','ConA','WGA','DAPI','Phalloidin','Sum']:
        count = 0
        c = 200
        feat_5 = 0
        while feat_5 <= fdr_5:
            feat_5 = len(control_sig_feature.query('@channel > @c'))
            count = c + 1
            c-=1
        FDR_dict[channel] = count   
    return FDR_dict['Mito'], FDR_dict['ConA'], FDR_dict['WGA'], FDR_dict['DAPI'], FDR_dict['Phalloidin'], FDR_dict['Sum']

In [13]:
# Quantification of number of significant features per compartment at a certain p-value
for (df_p_values, condition) in [(df_p_values_DMEM, 'DMEM'), (df_p_values_HPLM, 'HPLM')]:
    p_value = 0.001
    gene_list = list(df_p_values.index)

    df_sig_feature = pd.DataFrame(index=gene_list)
    for gene in gene_list:
        for channel in feature_dict[condition].keys():
            count = 0
            for feat in feature_dict[condition][channel]:
                if df_p_values.loc[gene,feat]<p_value:
                    count += 1
            df_sig_feature.loc[gene,channel] = count
        
    df_sig_feature = df_sig_feature.drop(index = 'sig_gene_count')
    # Perturbations divided into 2 groups of controls with 0 TPM and others
    df_sig_feature_zero_tpm = df_sig_feature.loc[zero_tpm_list]
    df_sig_feature_expressed = df_sig_feature.loc[expressed_gene_list]

    # Compute and print the threshold for FDR calculations at 5% from the 0 TPM gene perturbations
    mito_5, cona_5, wga_5, dapi_5, phal_5,sum_5 = FDR_5(df_sig_feature_zero_tpm)
    print('Mito genes', mito_5, '\n', 
        'ConA genes', cona_5, '\n', 
        'WGA genes', wga_5, '\n', 
        'DAPI genes', dapi_5, '\n', 
        'Phalloidin genes', phal_5, '\n',
        'Whole Profile genes', sum_5)
    # Save the number of significant features per channel for whole cell hits and compartment hits
    if condition == 'DMEM':
        whole_cell_hits_DMEM = df_sig_feature_expressed.query('Sum > @sum_5')
        comp_spec_hits_DMEM = df_sig_feature_expressed.query('Sum <= @sum_5').query('Phalloidin > @phal_5 | DAPI > @dapi_5 | WGA > @wga_5 | ConA > @cona_5 | Mito > @mito_5')
        whole_cell_hits_DMEM.to_csv(os.path.join(output_folder,f'HeLa_{condition}_plate_level_median_per_feat_sig_genes_5_FDR_whole_cell_hits.csv'),index=True)
        comp_spec_hits_DMEM.to_csv(os.path.join(output_folder,f'HeLa_{condition}_plate_level_median_per_feat_sig_genes_5_FDR_compartment_specific_hits.csv'),index=True)
    if condition == 'HPLM':
        whole_cell_hits_HPLM = df_sig_feature_expressed.query('Sum > @sum_5')
        comp_spec_hits_HPLM = df_sig_feature_expressed.query('Sum <= @sum_5').query('Phalloidin > @phal_5 | DAPI > @dapi_5 | WGA > @wga_5 | ConA > @cona_5 | Mito > @mito_5')
        whole_cell_hits_HPLM.to_csv(os.path.join(output_folder,f'HeLa_{condition}_plate_level_median_per_feat_sig_genes_5_FDR_whole_cell_hits.csv'),index=True)
        comp_spec_hits_HPLM.to_csv(os.path.join(output_folder,f'HeLa_{condition}_plate_level_median_per_feat_sig_genes_5_FDR_compartment_specific_hits.csv'),index=True)


KeyError: "['CD1D1', 'CD1D2', 'CD200R2', 'CD8B1', 'CLEC2I', 'H13', 'H2-AA', 'H2-AB1', 'H2-D1', 'H2-DMA', 'H2-K1', 'H2-M2', 'H2-M3', 'H2-Q4', 'H2-Q6', 'H2-Q7', 'H2-Q9', 'H2-T23', 'ICOSL', 'IL11RA1', 'IL6RA', 'LY6A', 'LY6C2', 'SIGLECH', 'SKINT4', 'STX4A', 'TRF', 'negCtrl'] not in index"

In [None]:
# Plot hits summary for Fig 3A
DMEM_whole_cell = len(df_sig_feature_expressed.query('Sum > @sum_5'))
DMEM_comp_spec = len(df_sig_feature_expressed.query('Sum <= @sum_5').query('Phalloidin > @phal_5 | DAPI > @dapi_5 | WGA > @wga_5 | ConA > @cona_5 | Mito > @mito_5'))

mpl.rc('axes', linewidth=0.7)
mpl.rc('xtick', labelsize=14)
mpl.rc('xtick', labelsize=14)

fig, ax = plt.subplots( figsize=(2,4))
colors = sns.color_palette('GnBu')[:8]
hits_plot = pd.DataFrame({'Compartment-specific': [DMEM_comp_spec],
                         'Whole cell': [DMEM_whole_cell] },
                        index=['HeLa DMEM'])

hits_plot.plot(kind='bar', 
               stacked=True,
               color=[colors[1], colors[4]],
               legend = 'reverse',
               rot = 0,
               fontsize= 14,
               width = 0.5,
               ax=ax)

ax.set_title('Genes with significant signal (5% FDR)',size=14)
ax.set_ylabel('Number of genes',size=14)

ax.bar_label(ax.containers[0],label_type='center',size=13)
ax.bar_label(ax.containers[1],label_type='center',size=13)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend(loc="upper left", bbox_to_anchor=(0.68,1),frameon=False, ncol=1)
fig.savefig(os.path.join(output_folder,'figure_panels','Fig3A_HeLa_DMEM_whole_cell_compartment_specific_hits.png',dpi = 300,bbox_inches='tight'))

In [None]:
# Plot hits summary for Fig 3B
def absolute_value2(val):
    a  = data[np.abs(data - val/100.*sum(data)).argmin() ]
    return a

labels = ['Mito','ER','Golgi/\nMembrane','DNA','Actin']
data = [len(df_sig_feature_expressed.query('Mito > @mito_5 & Sum <= @sum_5')),
        len(df_sig_feature_expressed.query('ConA > @cona_5 & Sum <= @sum_5')),
        len(df_sig_feature_expressed.query('WGA > @wga_5 & Sum <= @sum_5')),
        len(df_sig_feature_expressed.query('DAPI > @dapi_5 & Sum <= @sum_5')),
        len(df_sig_feature_expressed.query('Phalloidin > @phal_5 & Sum <= @sum_5'))]

colors = sns.color_palette('Greens')[:5]
               
fig, ax = plt.subplots(figsize=(7,4))

sns.set_theme(style='white',palette='RdBu')
plt.pie(data,
        labels=labels,
        autopct=absolute_value2,
        colors=colors,
        textprops={'fontsize': 13},
        explode=(0.0, 0.0, 0.0, 0.0, 0.01))
ax.set_title('Number of Hit Genes Called by Compartment (5% FDR)',size=14)
fig.savefig(os.path.join(output_folder,'figure_panels','Fig3B_HeLa_DMEM_compartment_specific_hits_distribution.png',dpi = 300,bbox_inches='tight'))

In [None]:
# Plot hits summary for Fig 3A
DMEM_whole_cell = len(df_sig_feature_expressed.query('Sum > @sum_5'))
DMEM_comp_spec = len(df_sig_feature_expressed.query('Sum <= @sum_5').query('Phalloidin > @phal_5 | DAPI > @dapi_5 | WGA > @wga_5 | ConA > @cona_5 | Mito > @mito_5'))


mpl.rc('axes', linewidth=0.7)
mpl.rc('xtick', labelsize=14)
mpl.rc('xtick', labelsize=14)

fig, ax = plt.subplots( figsize=(3,4))
colors = sns.color_palette('GnBu')[:8]
hits_plot = pd.DataFrame({'Compartment-specific': [DMEM_comp_spec, 1236], # Load the number of hits from the Fig3_A_B_HeLa_hplm file
                         'Whole cell': [DMEM_whole_cell, 3465] },
                        index=['HeLa DMEM','HeLa HPLM'])

hits_plot.plot(kind='bar', 
               stacked=True,
               color=[colors[1], colors[4]],#['tab:blue', 'tab:red']
               legend = 'reverse',
               rot = 0,
               fontsize= 14,
               width = 0.5,
               ax=ax)

ax.set_title('Genes with significant signal (5% FDR)',size=14) #Genes with significant signal above noise
ax.set_ylabel('Number of genes',size=14)

ax.bar_label(ax.containers[0],label_type='center',size=13)
ax.bar_label(ax.containers[1],label_type='center',size=13)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend(loc="upper left", bbox_to_anchor=(0.83,1),frameon=False, ncol=1)
fig.savefig(os.path.join(output_folder,'figure_panels','Fig3A_HeLa_DMEM_HPLM_whole_cell_compartment_specific_hits.png',dpi = 300,bbox_inches='tight'))

# Figure 3C

In [None]:
# FIGURE OUT WHAT DOES/DOESN'T NEED TO BE RELOADED HERE
# Import the Hela HPLM gene level profiles & hits from the hit calling process then subset the gene level profiles for all hits 
df = pd.read_csv('../20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS_gene_aggregated.csv.gz').set_index('Metadata_Foci_Barcode_MatchedTo_GeneCode')
comp_spef_hits = pd.read_csv('../Fig3_A_B/cp257_hplm_plate_level_median_per_feat_sig_genes_5_fdr_compartment_specific_hits.csv').rename(columns={'Unnamed: 0':'Gene'}).set_index('Gene')
whole_cell_hits = pd.read_csv('../Fig3_A_B/cp257_hplm_plate_level_median_per_feat_sig_genes_5_fdr_whole_cell_hits.csv').rename(columns={'Unnamed: 0':'Gene'}).set_index('Gene')
hit_list = list(comp_spef_hits.index) + list(whole_cell_hits.index)
df_hits = df.loc[hit_list]

In [None]:
# Calculate correlation between all gene pairs and create a dictionory of gene pairs/correlation
for (df_hits, condition) in [(df_hits_DMEM, 'DMEM'), (df_hits_HPLM, 'HPLM')]:
    df_hits_corr = df_hits.T.corr()
    genes = list(df_hits_corr.index)
    corr_dic = {}
    for gene1 in genes:
        for gene2 in genes:
            if len(frozenset([gene1,gene2])) == 1 :
                continue
            else:    
                corr_dic[frozenset([gene1,gene2])]=(df_hits_corr.loc[gene1,gene2])

    # Create a list of protein clusters with all complexes that had at least 66% of genes represented within the Hela DMEM WGS hits
    cluster_count = 0
    hit_cluster_list_list = []
    hit_set = set()
    for i in range(len(ppi_data_h)):
        cluster = ppi_data_h.iloc[i]['subunits(Gene name)'].split(';')
        count = 0
        hit_cluster_list = []
        for g in cluster:
            if g in genes:
                count += 1
                hit_set.add(g)
                hit_cluster_list.append(g)
        if (count/len(cluster)) >= 0.66:
            cluster_count += 1
        if hit_cluster_list and (count/len(cluster)) >= 0.66:
            hit_cluster_list_list.append(hit_cluster_list)
    print(len(hit_set),cluster_count,len(hit_cluster_list_list))

    # Assign correlations to hit gene pairs
    hit_pair_set = set()
    for l in hit_cluster_list_list:
        for c in list(permutations(l,2)):
            hit_pair_set.add(frozenset(c))

    hit_corr_dic = {}
    for s in hit_pair_set:
        hit_corr_dic[s] = corr_dic[s]

    print(' Number of hit pairs',len(hit_pair_set),'\n',
        'Number of hit pairs with correlation',len(hit_corr_dic))
    
    # Plot CORUM gene pair correlation distribution for Fig 3C
    sns.set_theme(style="white",rc = {'axes.linewidth': 0.7,'xtick.labelsize':17,'ytick.labelsize':17})
    fig, ax = plt.subplots(figsize=(7,4))
    all_corr = list(corr_dic.values())
    cluster_corr = list(hit_corr_dic.values())

    sns.histplot(all_corr ,color='b' ,stat= 'percent' ,bins= 100, kde=True, line_kws={'lw':1.4} , alpha=0.5,ax = ax);
    sns.histplot(cluster_corr ,color='r' ,stat= 'percent' ,bins= 100, kde=True, line_kws={'lw':1.4} ,alpha=0.5,ax = ax);

    ax.set_ylabel('Gene pair frequency',size=17)
    ax.set_xlabel('Correlation of profiles',size=17)
    font = font_manager.FontProperties(size=13)
    ax.legend(['Hit gene pairs','Hit gene pairs in CORUM complex'],loc='upper left',bbox_to_anchor=(0.0,1.15),frameon=False,prop=font )
    sns.despine(top = True)

    fig.savefig(f'Fig3_C_Hela_{condition}_CORUM_gene_pair_correlation_distribution.png', dpi=300,facecolor='w', edgecolor='w', bbox_inches='tight')
    plt.show()

# Figure 3D

In [None]:
for (df_hits, condition) in [(df_hits_DMEM, 'DMEM'), (df_hits_HPLM, 'HPLM')]:
    # Subset the STRING predicted protein links to Hela DMEM WGS hits
    ppi_data_name_hits = STRING_data[STRING_data['protein1'].isin(genes) & STRING_data['protein2'].isin(genes)]
    
    # Catalogue all correlations
    corr_dic = {}
    for gene1 in genes:
        for gene2 in genes:
            if len(frozenset([gene1,gene2])) == 1 :
                continue
            else:    
                corr_dic[frozenset([gene1,gene2])]=(df_hits_corr.loc[gene1,gene2])
    print('Overall number of paired correlation ' , len(corr_dic))

    # Determine min, max, intervals for profile correlations
    corr_min = min(corr_dic.values())
    corr_max = max(corr_dic.values())   
    interval = (corr_max-corr_min)/6

    # Bin the correlations into 6 categories 
    results = ppi_data_name_hits.copy(deep=True).reset_index(drop=True)
    results["correlation"] = ''
    results["correlation_bin"] = ''

    bins = [-0.96,-0.64,-0.32,0,0.32,0.64,0.96]
    for i in range(len(results)):
        gene1 = results.iloc[i]['protein1']
        gene2 = results.iloc[i]['protein2']
        corr_value = corr_dic[frozenset([gene1,gene2])]
        results.at[i , 'correlation'] = corr_value
        if corr_value > bins[5]:
            results.at[i , 'correlation_bin'] = f'{bins[5]} to {bins[6]}'
        elif corr_value > bins[4] and corr_value <= bins[5]:
            results.at[i , 'correlation_bin'] = f'{bins[4]} to {bins[5]}'
        elif corr_value > bins[3] and corr_value <= bins[4]:
            results.at[i , 'correlation_bin'] = f'{bins[3]} to {bins[4]}'
        elif corr_value > bins[2] and corr_value <= bins[3]:
            results.at[i , 'correlation_bin'] = f'{bins[2]} to {bins[3]}'
        elif corr_value > bins[1] and corr_value <= bins[2]:
            results.at[i , 'correlation_bin'] = f'{bins[1]} to {bins[2]}'
        elif corr_value <= bins[1]:
            results.at[i , 'correlation_bin'] = f'{bins[0]} to {bins[1]}'
    
    # Plot STRING score X gene pair correlation for Fig 3D
    sns.set_theme(style="white",rc = {'axes.linewidth': 0.7})

    fig, ax = plt.subplots(figsize=(7,4))
    order = [f'{bins[0]} to {bins[1]}',f'{bins[1]} to {bins[2]}',f'{bins[2]} to {bins[3]}',f'{bins[3]} to {bins[4]}',f'{bins[4]} to {bins[5]}',f'{bins[5]} to {bins[6]}']

    flierprops = dict(markerfacecolor='white', markersize=0.1, linestyle='none')
    sns.boxenplot(data=results,
                x = 'correlation_bin',
                y = 'combined_score', 
                order=order, 
                width = 0.6,
                palette = 'RdBu_r',
                ax=ax)

    ax.set_xlabel('Correlation of profiles',size=16)
    ax.set_ylabel('STRING score',size=16)
    ax.tick_params(axis='both', which='major', labelsize=14)

    labels = [item for item in ax.get_yticks()]
    ax.set_yticklabels([str(round(float(label))) for label in labels], size=16)
    ax.set_xticklabels(order, size=16,rotation=30)

    sns.despine(top = True,left=False,bottom=False)

    fig.savefig('Fig3_D_Hela_DMEM_STRING_gene_pair_correlation_score.png', 
                dpi=300,
                facecolor='w', 
                edgecolor='w', 
                bbox_inches='tight')
    plt.show()

# Figure 3E

In [None]:
for (df_hits, condition) in [(df_hits_DMEM, 'DMEM'), (df_hits_HPLM, 'HPLM')]:
    # Perform principal componenet analysis and select components representing 70% of variation in data & print the variation of components selected by PCA
    pca = PCA()
    pca.fit(df_hits)
    x = list(pca.explained_variance_ratio_)
    y = [sum(x[:i+1]) for i in range(len(x))]
    y[74]

    # Perform principal componenet analysis and select components representing 70% of variation in data
    pca = PCA(n_components=75)
    df_hits_pca = pd.DataFrame(pca.fit_transform(df_hits),index=df_hits.index)

    # Perform UMAP dimensionality reduction to project data into a 2 dimentional plane 
    clusterable_embedding = umap.UMAP(
        n_neighbors=4,
        min_dist=0.04,
        n_components=2,
        random_state=43,
        metric = 'cosine',
    ).fit_transform(df_hits_pca)


    # THERES MORE. 