In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from scipy.spatial import distance
import pathlib
import os
import json
from sklearn.decomposition import PCA

# Seed NumPy's global random number generator so that any NumPy-based
# randomness used later in the notebook is repeatable across runs.
np.random.seed(7)

In [2]:
# Set the output folder and its "figure_panels" subfolder.
# os.makedirs creates any missing parent directories, so a single call on the
# nested path covers both folders; exist_ok=True makes it a no-op when they
# are already present, so no separate existence check is needed.
output_folder = "outputs"
os.makedirs(os.path.join(output_folder, "figure_panels"), exist_ok=True)

In [3]:
# --- Input locations (relative to this notebook) ---
profiles_csv = "../3_HeLa_WGS_Summary/outputs/HeLa_DMEM_hit_list_PCA.csv"
gene_sets_json = "../common_files/annotated_gene_sets.json"
comp_spec_hits_csv = "../3_HeLa_WGS_Summary/outputs/HeLa_DMEM_plate_level_median_per_feat_sig_genes_5_fdr_compartment_specific_hits.csv"
whole_cell_hits_csv = "../3_HeLa_WGS_Summary/outputs/HeLa_DMEM_plate_level_median_per_feat_sig_genes_5_fdr_whole_cell_hits.csv"

# HeLa DMEM gene-level profiles
df_hits_pca_DMEM = pd.read_csv(profiles_csv)

# Annotated gene sets; later cells look entries up by gene-set name
with open(gene_sets_json) as f:
    gene_set_dict = json.load(f)

# Hit tables produced by the hit-calling step, indexed by their 'Gene' column
comp_spec_hits = pd.read_csv(comp_spec_hits_csv)
comp_spec_hits = comp_spec_hits.set_index("Gene")
whole_cell_hits = pd.read_csv(whole_cell_hits_csv)
whole_cell_hits = whole_cell_hits.set_index("Gene")

# Gene names of the whole-cell hits, and of all hits
# (compartment-specific first, then whole-cell)
whole_cell_hit_list = [*whole_cell_hits.index]
hit_list = [*comp_spec_hits.index, *whole_cell_hits.index]

Unnamed: 0_level_0,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,...,Nuclei_Texture_SumVariance_DAPI_Painting_10_02_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_01_256
Metadata_Foci_Barcode_MatchedTo_GeneCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A4GALT,-0.003687,-0.001517,-0.024211,-0.002230,-0.000985,0.048217,-0.247624,-0.385445,-0.181542,-0.310485,...,-0.205460,-0.124003,-0.061160,-0.064534,-0.158805,-0.175165,-0.100943,-0.116820,-0.127088,-0.208700
AAAS,-0.002542,-0.160110,0.006678,0.023752,0.001771,-0.348870,-0.051906,-0.200550,0.296850,0.118100,...,-0.338825,-0.461380,-0.037178,-0.003460,-0.064844,-0.215570,-0.125000,0.213670,0.063166,0.079643
AADAT,0.010149,-0.050984,-0.000033,-0.011308,0.000147,-0.390320,0.192267,0.127596,0.360045,-0.037015,...,0.035560,0.096781,0.002351,0.014290,-0.047730,-0.055333,-0.027467,-0.030975,-0.054734,-0.066406
AAGAB,0.001373,-0.073191,0.005038,-0.008009,-0.000422,-0.110842,0.408275,0.010436,0.014049,-0.013480,...,-0.364305,-0.374600,-0.045733,-0.048909,-0.093886,-0.073781,-0.127453,-0.269980,0.009450,-0.024553
AARSD1,0.011580,-0.099327,-0.017418,0.022201,0.004731,-0.048940,-0.060862,-0.170685,-0.057970,-0.051002,...,-0.251240,-0.230710,-0.123475,-0.036832,-0.078641,-0.171565,-0.146525,-0.317805,-0.210240,-0.273045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZPR1,0.000413,-0.015735,-0.015433,0.001812,-0.000215,-0.551765,-0.001942,0.403800,0.569805,-0.217560,...,0.242875,0.536345,0.335195,0.045955,-0.013645,0.059679,-0.001273,0.092637,-0.032070,-0.005467
ZSWIM8,0.008489,0.287793,-0.001380,0.012269,0.002484,-0.070610,-0.190660,-0.032445,-0.026930,-0.222235,...,-0.109120,0.011316,0.010509,0.018169,-0.121815,-0.111510,-0.119964,-0.045433,0.318355,-0.044189
ZYX,-0.001728,0.127841,-0.003141,-0.000141,0.000653,-0.057800,-0.026700,0.050321,-0.057195,-0.178050,...,0.126379,-0.020406,-0.070986,-0.017315,-0.124905,-0.129980,-0.075756,0.019228,-0.261445,0.002777
ZZEF1,-0.002265,-0.025710,-0.010416,-0.026102,0.000046,-0.036503,0.055400,0.115180,-0.083702,-0.045825,...,-0.068458,-0.202371,-0.158160,-0.057480,-0.109966,-0.057180,-0.089847,-0.457715,-0.409825,-0.516860


In [6]:
# Function to perform hierarchical clustering based on profile correlation and draw the heatmap
def clustered_heat_map(data, file_name, font_scale=0.8, linewidth=1):
    """Cluster the columns of ``data`` by correlation, then save and show the heatmap.

    The pairwise correlation between the columns of ``data`` (``data.corr()``)
    is drawn with ``sns.clustermap`` using Ward linkage. The colour bar and
    both dendrograms are hidden, so only the reordered heatmap is visible.
    The figure is saved inside the notebook-level ``output_folder`` and then
    displayed with ``plt.show()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the items to be correlated and clustered.
    file_name : str or os.PathLike
        Name of the image file, joined onto ``output_folder``.
    font_scale : float, default 0.8
        Forwarded to ``sns.set``. Note that this changes seaborn's global
        theme, so it also affects plots drawn afterwards.
    linewidth : float, default 1
        Width of the white lines separating the heatmap cells.

    Returns
    -------
    None
    """
    corr = data.corr()
    sns.set(font_scale=font_scale)

    # ``square=True`` is intentionally not passed: clustermap's multi-axes
    # layout cannot honour it, and recent seaborn versions ignore it with a
    # warning. The cell line width goes through heatmap's documented
    # ``linewidths`` parameter.
    grid = sns.clustermap(
        corr,
        vmin=-1, vmax=1, center=0,
        method='ward',
        figsize=(30, 30),
        cmap=sns.diverging_palette(250, 10, s=100, l=35, n=100),
        linewidths=linewidth,
        linecolor='white',
        dendrogram_ratio=0.1,
        cbar_kws={"shrink": .05, "aspect": 40},
    )

    # Hide everything except the heatmap itself.
    grid.cax.set_visible(False)
    grid.ax_row_dendrogram.set_visible(False)
    grid.ax_col_dendrogram.set_visible(False)

    # Save through the grid so the file always contains this figure, rather
    # than whatever pyplot currently considers the active figure.
    output_path = os.path.join(output_folder, file_name)
    grid.savefig(output_path, bbox_inches='tight', dpi=600, facecolor='w', edgecolor='w')
    plt.show()

In [7]:
# Choose the annotated gene set and look up its member genes
gene_set = 'PI3KAkt_Signaling'
gene_list = gene_set_dict[gene_set]

# Keep only the profile rows whose gene belongs to the set, then index by gene
gene_col = 'Metadata_Foci_Barcode_MatchedTo_GeneCode'
in_gene_set = df_hits_pca_DMEM[gene_col].isin(gene_list)
genes_df = df_hits_pca_DMEM.loc[in_gene_set]
genes_profiles = genes_df.set_index(gene_col)

# Transpose so each gene is a column, then cluster and draw the heatmap
clustered_heat_map(
    genes_profiles.T,
    f'HeLa_DMEM_median_profile_heatmap_clustering_gene_hits_{gene_set}_plate_level.png',
    font_scale=1.6,
    linewidth=0.8,
)



71

In [9]:
# Choose the annotated gene set and look up its member genes
gene_set = 'Ribosome'
gene_list = gene_set_dict[gene_set]

# Keep only the profile rows whose gene belongs to the set, then index by gene
gene_col = 'Metadata_Foci_Barcode_MatchedTo_GeneCode'
in_gene_set = df_hits_pca_DMEM[gene_col].isin(gene_list)
genes_df = df_hits_pca_DMEM.loc[in_gene_set]
genes_profiles = genes_df.set_index(gene_col)

# Transpose so each gene is a column, then cluster and draw the heatmap
clustered_heat_map(
    genes_profiles.T,
    f'HeLa_DMEM_median_profile_heatmap_clustering_gene_hits_{gene_set}_plate_level.png',
    font_scale=2.8,
    linewidth=0.6,
)



146