In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pycytominer import normalize,feature_select,aggregate,consensus
from pycytominer.cyto_utils import output,infer_cp_features
from pycytominer.cyto_utils.util import (
    get_pairwise_correlation,
    check_correlation_method,
    infer_cp_features,
)

In [None]:
# Set data input folder
# Before running notebook, download files described in README.md to this folder
input_folder = "inputs_data"

# Set output folder
# exist_ok=True already makes this a no-op when the folder exists,
# so no separate os.path.exists() check is needed
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)


In [None]:
# Read the full barcode table for the experiment
barcode_table = pd.read_csv('../common_files/Barcodes.csv')

# Keep only guides from dialout 1 or 3; guides with other dialouts
# were not used in this experiment
guide_df = barcode_table[barcode_table['dialout'].isin([1, 3])]

# sgRNA sequences used below to filter profile rows
guide_list = guide_df['sgRNA'].tolist()

# Plate level aggregation

## HeLa

In [None]:
# Aggregation may be too computationally intensive for a standard personal computer
# Skip aggregation by downloading aggregated files following instructions in README.md

# Plates belonging to each HeLa media arm
DMEM_plates = ['CP257A','CP257B','CP257D','CP257F','CP257H']
HPLM_plates = ['CP257J','CP257K','CP257L','CP257N']


def _load_guide_filtered_plates(plate_ids):
    """Read each plate's normalized profile file from input_folder and keep
    only rows whose matched barcode is in guide_list (removes ghost guides).

    Returns a list with one filtered DataFrame per plate, in input order.
    """
    filtered_frames = []
    for plate_id in plate_ids:
        plate_file = f'20210422_6W_CP257_guide_normalized_ALLBATCHES___{plate_id}___ALLWELLS.csv.gz'
        plate_df = pd.read_csv(os.path.join(input_folder, plate_file))
        is_used_guide = plate_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
        filtered_frames.append(plate_df[is_used_guide])
    return filtered_frames


# Load and merge the DMEM arm, then the HPLM arm
DMEM_list = _load_guide_filtered_plates(DMEM_plates)
HeLa_DMEM_df = pd.concat(DMEM_list)

HPLM_list = _load_guide_filtered_plates(HPLM_plates)
HeLa_HPLM_df = pd.concat(HPLM_list)

HeLa_DMEM_df.head()

In [None]:
# Perform feature selection on merged profiles
# Both media arms use exactly the same feature-selection settings
hela_feature_select_kwargs = dict(
    features='infer',
    samples='all',
    operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist'],
    na_cutoff=0,
    corr_threshold=0.9,
)

HeLa_DMEM_feature_selected_df = feature_select(
    profiles=HeLa_DMEM_df,
    **hela_feature_select_kwargs,
)

# index=False: the row index left over from concatenating plates carries no
# information, and the other exports in this notebook omit it as well
HeLa_DMEM_feature_selected_df.to_csv(
    os.path.join(output_folder,'20210422_6W_CP257_guide_normalized_feature_select_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz'),
    index=False,
)

HeLa_HPLM_feature_selected_df = feature_select(
    profiles=HeLa_HPLM_df,
    **hela_feature_select_kwargs,
)

HeLa_HPLM_feature_selected_df.to_csv(
    os.path.join(output_folder,'20210422_6W_CP257_guide_normalized_feature_select_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz'),
    index=False,
)

In [None]:
# Perform median aggregation on profiles
# Rows are grouped by gene code and guide barcode before taking the median
hela_guide_strata = ['Metadata_Foci_Barcode_MatchedTo_GeneCode' ,'Metadata_Foci_Barcode_MatchedTo_Barcode']

HeLa_DMEM_feature_selected_median_df = aggregate(
    population_df=HeLa_DMEM_feature_selected_df,
    strata=hela_guide_strata,
    features='infer',
    operation='median',
)

# index=False belongs to to_csv(); passing it to os.path.join() raises a TypeError
HeLa_DMEM_feature_selected_median_df.to_csv(
    os.path.join(output_folder,'20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz'),
    index=False,
)

HeLa_HPLM_feature_selected_median_df = aggregate(
    population_df=HeLa_HPLM_feature_selected_df,
    strata=hela_guide_strata,
    features='infer',
    operation='median',
)

HeLa_HPLM_feature_selected_median_df.to_csv(
    os.path.join(output_folder,'20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz'),
    index=False,
)

## A549

In [None]:
# Load normalized plate_level profiles, remove ghost guides, and merge
# NOTE(review): 'CP186' is the only plate ID without a letter suffix -- confirm
# it matches the downloaded file name (it may be meant to be 'CP186A').
plates = ['CP186','CP186B','CP186C','CP186D','CP186E','CP186F','CP186G','CP186H','CP186N']

A549_list = []
for plate in plates:
    # Use the A549 experiment prefix, matching the A549 outputs written by this
    # notebook; the HeLa '20210422_6W_CP257' prefix was used here before, which
    # does not correspond to the CP186 plates.
    # NOTE(review): confirm the exact file names against README.md.
    filename = f'20200805_A549_WG_Screen_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    plate_df = pd.read_csv(os.path.join(input_folder, filename))
    # Keep only rows whose matched barcode is one of the guides used in the experiment
    plate_df = plate_df[plate_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]
    A549_list.append(plate_df)

A549_df = pd.concat(A549_list)
A549_df.head()

In [None]:
# Perform feature selection on merged profiles
A549_feature_selected_df = feature_select(
    profiles=A549_df,
    features='infer',
    samples='all',
    operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist'],
    na_cutoff=0,
    corr_threshold=0.9,
)

# index=False: the row index left over from concatenating plates carries no
# information, and the other exports in this notebook omit it as well
A549_feature_selected_df.to_csv(
    os.path.join(output_folder,'20200805_A549_WG_Screen_guide_normalized_feature_select_merged_ALLBATCHES___CP186___ALLWELLS.csv.gz'),
    index=False,
)

In [None]:
# Perform median aggregation on profiles
# Rows are grouped by gene code and guide barcode before taking the median
A549_feature_selected_median_df = aggregate(
    population_df=A549_feature_selected_df,
    strata=['Metadata_Foci_Barcode_MatchedTo_GeneCode' ,'Metadata_Foci_Barcode_MatchedTo_Barcode'],
    features='infer',
    operation='median',
)

# index=False belongs to to_csv(); passing it to os.path.join() raises a TypeError
A549_feature_selected_median_df.to_csv(
    os.path.join(output_folder,'20200805_A549_WG_Screen_guide_normalized_feature_select_median_merged_ALLBATCHES___CP186___ALLWELLS.csv.gz'),
    index=False,
)

# Gene level aggregation

In [None]:

# Guide-level profile files that will be aggregated to gene level
# NOTE(review): the A549 entry reads "..._merged_median_..." while the A549
# median file written earlier in this notebook is named "..._median_merged_...";
# confirm which name the file in the input folder actually has.
file_list = [
    "20200805_A549_WG_Screen_guide_normalized_feature_select_merged_median_ALLBATCHES___CP186___ALLWELLS.csv.gz",
    "20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS.csv.gz",
    "20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS.csv.gz",
]

# Gene level aggregation settings: group by gene code and take the median
aggregate_columns = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
aggregate_features = 'infer'
aggregate_operation = 'median'

In [None]:
# Aggregate every listed guide-level file to gene level and save the result
for profile_file in file_list:
    print(f"Now loading {profile_file}")
    guide_profiles = pd.read_csv(os.path.join(input_folder, profile_file))
    # Restrict to rows whose matched barcode is one of the guides in guide_list
    is_used_guide = guide_profiles["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    df = guide_profiles[is_used_guide]

    print("Now aggregating.")
    gene_df = aggregate(
        population_df=df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    print("Now saving aggregated file.")
    # Split once at the first dot so the "_gene_aggregated" tag goes before
    # the full extension (e.g. "csv.gz")
    name_parts = profile_file.split('.', 1)
    agg_file_name = f"{name_parts[0]}_gene_aggregated.{name_parts[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

In [None]:
# NOTE(review): `dmem_median_gene_df` is not defined in any visible cell of this
# notebook, so this cell raises NameError on a fresh kernel -- confirm where the
# frame should be loaded from and add that step before this cell.

# Index the gene-level table by gene code and drop rows containing missing values
dmem_median_gene_index_df = (
    dmem_median_gene_df
    .set_index('Metadata_Foci_Barcode_MatchedTo_GeneCode')
    .dropna()
)
dmem_median_gene_index_df

In [None]:
def heat_map(data,title,file_name):
    """Draw the pairwise column-correlation matrix of `data` as a heatmap and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are correlated with each other via ``data.corr()``.
    title : str
        Title placed above the heatmap.
    file_name : str or path-like
        Destination handed to ``Figure.savefig`` (saved at 300 dpi).
    """
    corr = data.corr()

    # Apply the seaborn theme, then draw on a dedicated figure of the intended
    # size. Clearing the current figure and changing the rc figsize afterwards
    # would reuse (and wipe) whatever figure happened to be current, and the rc
    # change does not resize a figure that already exists.
    sns.set(rc={'figure.figsize': (30, 30)})
    fig, ax = plt.subplots(figsize=(30, 30))

    sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(10, 220, n=200),
        linewidths=0.2,
        linecolor='white',
        square=True,
        cbar_kws={"shrink": .76},
        ax=ax,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=90,
        horizontalalignment='right',
    )
    ax.set_title(title, fontsize=30)

    fig.tight_layout()

    output_file = pathlib.Path(file_name)
    fig.savefig(output_file, dpi=300)

def clustered_heat_map(data,title,file_name):
    """Draw a hierarchically clustered heatmap of the pairwise column correlations
    of `data` and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are correlated with each other via ``data.corr()``.
    title : str
        Figure-level title.
    file_name : str or path-like
        Destination handed to the cluster grid's ``savefig`` (saved at 300 dpi).
    """
    corr = data.corr()
    sns.set(font_scale=0.8)

    # `square=True` is not passed: clustermap ignores it and only emits a warning.
    grid = sns.clustermap(
        corr,
        vmin=-1, vmax=1, center=0,
        method='ward',
        figsize=(30, 30),
        cmap=sns.diverging_palette(10, 220, n=200),
        linewidths=0.2,
        linecolor='white',
        dendrogram_ratio=0.1,
        cbar_kws={"shrink": .6},
    )
    grid.fig.suptitle(title, fontsize=30)

    output_file = pathlib.Path(file_name)
    grid.savefig(output_file, dpi=300)


In [None]:
# Transpose once so each gene code becomes a column; both helpers correlate columns
dmem_gene_columns_df = dmem_median_gene_index_df.T

clustered_heat_map(
    dmem_gene_columns_df,
    'CP257 DMEM_arm Median Aggregated Clustered Profile Heatmap - gene level hits(From well_level)',
    'cp257_dmem_median_profile_heatmap_clustering_gene_hits_well_level.png',
)
heat_map(
    dmem_gene_columns_df,
    'CP257 DMEM_arm Median Aggregated Profile Heatmap - gene level hits(From well_level)',
    'cp257_dmem_median_profile_heatmap_gene_hits_well_level',
)