In [1]:
### Calculation of gene level profiles from the A549 guide level profiles for the PERISCOPE manuscript. ###
### Script by Meraj Ramezani(mramezan@broadinstitute.org) ###
# Import relevant libraries
import pandas as pd
import numpy as np
from pycytominer import normalize, feature_select ,aggregate,consensus
from pycytominer.cyto_utils import output,infer_cp_features
from pycytominer.cyto_utils.util import (
    get_pairwise_correlation,
    check_correlation_method,
    infer_cp_features,
)

In [2]:
# Read the guide-level, normalized and feature-selected median profiles into a DataFrame.
# Replace the "(input the file address)" placeholder with the folder that holds the file.
# NOTE(review): the earlier comment here mentioned the "cp257 dmem arm", but the file
# name below refers to the A549 screen / CP186 -- confirm which dataset is intended.
filename = '(input the file address)/20200805_A549_WG_Screen_guide_normalized_feature_select_merged_median_ALLBATCHES___CP186___ALLWELLS.csv.gz'
a549_median_df = pd.read_csv(filepath_or_buffer=filename)
# Bare expression so the notebook renders the loaded table.
a549_median_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,...,Nuclei_Texture_SumVariance_ConA_5_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_03_256
0,A1BG,CAAGAGAAAGACCACGAGCA,-0.085682,-0.007318,0.048388,0.005510,-0.029950,0.452470,0.128870,-0.251250,...,0.26915,-0.510610,-0.813530,-0.037740,-0.090246,-0.078726,-0.084498,-0.013743,0.06056,-0.093787
1,A1BG,CATCTTCTTTCACCTGAACG,0.009774,-0.003006,0.049408,0.006071,0.059289,-0.732130,-0.477380,-0.192550,...,-0.36389,-0.159780,-0.334250,-0.028623,-0.190650,-0.066117,-0.126050,-0.298190,-0.26774,-0.203280
2,A1BG,CTCCGGGGAGAACTCCGGCG,-0.058185,0.016190,-0.128340,-0.010445,-0.198250,0.118750,0.325120,-0.185930,...,-0.55675,0.168100,-0.094460,-0.088934,-0.256140,-0.121030,-0.277940,-0.344150,-0.29566,-0.349900
3,A1BG,TGGAAGTCCACTCCACTCAG,-0.097838,-0.008437,0.116660,0.004938,0.229590,0.605420,0.599460,-0.172990,...,-0.46291,0.406220,0.103390,-0.082164,-0.243800,-0.209550,-0.144330,-0.347550,-0.34362,-0.309940
4,A1CF,AGTTATGTTAGGTATACCCG,-0.057318,-0.029085,-0.034795,-0.026508,0.422990,0.374530,0.081628,0.045344,...,-0.28838,-0.023266,0.435720,-0.002229,0.076596,0.139300,0.128120,-0.069559,-0.23150,-0.108940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80855,nontargeting,TTTATGCATTTAATACGCCG,0.015472,-0.008321,-0.013841,-0.000692,0.004790,-0.049257,0.126070,-0.260380,...,-0.36470,-0.153800,-0.143790,-0.074116,-0.214040,-0.230090,-0.164120,-0.281430,-0.27678,-0.241060
80856,nontargeting,TTTCTAGTTACTACTGGACG,0.003309,-0.000789,-0.006572,-0.001090,-0.008685,-0.015765,0.135290,-0.221200,...,-0.40125,-0.074483,-0.066195,-0.055062,-0.203310,-0.222460,-0.201870,-0.243910,-0.24811,-0.195830
80857,nontargeting,TTTGGCAGTACCTTTTATTA,0.008112,-0.001499,-0.002163,0.000775,0.120530,0.019983,0.098362,-0.156850,...,-0.37106,-0.139890,-0.156200,-0.078060,-0.213080,-0.223870,-0.187550,-0.268350,-0.25324,-0.215150
80858,nontargeting,TTTTACCTTGTTCACATGGA,-0.004330,-0.004018,0.004450,-0.001802,0.175830,-0.049480,0.092460,-0.161920,...,-0.34700,-0.159690,-0.176560,-0.065137,-0.185410,-0.184750,-0.142810,-0.236550,-0.24324,-0.192800


In [3]:
# Filter out irrelevant guides: keep only the profiles whose barcode appears in the
# guide list with a `dialout` value of 1 or 3.
# NOTE(review): dialout codes 1 and 3 presumably mark the guides used in the A549
# screen -- confirm against the guide-list documentation.
filename = '(input the file address)/wgs_guide_list.csv'
guide_df = pd.read_csv(filename)

# Same selection as the query string 'dialout == 1 | dialout ==3', written as a mask.
dialout_mask = (guide_df['dialout'] == 1) | (guide_df['dialout'] == 3)
a549_guide_df = guide_df[dialout_mask]
a549_guide_list = a549_guide_df['sgRNA'].tolist()

# Keep only the rows whose matched barcode is one of the selected sgRNA sequences.
is_a549_guide = a549_median_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(a549_guide_list)
a549_median_df = a549_median_df[is_a549_guide]
a549_median_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,...,Nuclei_Texture_SumVariance_ConA_5_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_03_256
0,A1BG,CAAGAGAAAGACCACGAGCA,-0.085682,-0.007318,0.048388,0.005510,-0.029950,0.452470,0.128870,-0.251250,...,0.26915,-0.510610,-0.813530,-0.037740,-0.090246,-0.078726,-0.084498,-0.013743,0.06056,-0.093787
1,A1BG,CATCTTCTTTCACCTGAACG,0.009774,-0.003006,0.049408,0.006071,0.059289,-0.732130,-0.477380,-0.192550,...,-0.36389,-0.159780,-0.334250,-0.028623,-0.190650,-0.066117,-0.126050,-0.298190,-0.26774,-0.203280
2,A1BG,CTCCGGGGAGAACTCCGGCG,-0.058185,0.016190,-0.128340,-0.010445,-0.198250,0.118750,0.325120,-0.185930,...,-0.55675,0.168100,-0.094460,-0.088934,-0.256140,-0.121030,-0.277940,-0.344150,-0.29566,-0.349900
3,A1BG,TGGAAGTCCACTCCACTCAG,-0.097838,-0.008437,0.116660,0.004938,0.229590,0.605420,0.599460,-0.172990,...,-0.46291,0.406220,0.103390,-0.082164,-0.243800,-0.209550,-0.144330,-0.347550,-0.34362,-0.309940
4,A1CF,AGTTATGTTAGGTATACCCG,-0.057318,-0.029085,-0.034795,-0.026508,0.422990,0.374530,0.081628,0.045344,...,-0.28838,-0.023266,0.435720,-0.002229,0.076596,0.139300,0.128120,-0.069559,-0.23150,-0.108940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80855,nontargeting,TTTATGCATTTAATACGCCG,0.015472,-0.008321,-0.013841,-0.000692,0.004790,-0.049257,0.126070,-0.260380,...,-0.36470,-0.153800,-0.143790,-0.074116,-0.214040,-0.230090,-0.164120,-0.281430,-0.27678,-0.241060
80856,nontargeting,TTTCTAGTTACTACTGGACG,0.003309,-0.000789,-0.006572,-0.001090,-0.008685,-0.015765,0.135290,-0.221200,...,-0.40125,-0.074483,-0.066195,-0.055062,-0.203310,-0.222460,-0.201870,-0.243910,-0.24811,-0.195830
80857,nontargeting,TTTGGCAGTACCTTTTATTA,0.008112,-0.001499,-0.002163,0.000775,0.120530,0.019983,0.098362,-0.156850,...,-0.37106,-0.139890,-0.156200,-0.078060,-0.213080,-0.223870,-0.187550,-0.268350,-0.25324,-0.215150
80858,nontargeting,TTTTACCTTGTTCACATGGA,-0.004330,-0.004018,0.004450,-0.001802,0.175830,-0.049480,0.092460,-0.161920,...,-0.34700,-0.159690,-0.176560,-0.065137,-0.185410,-0.184750,-0.142810,-0.236550,-0.24324,-0.192800


In [5]:
# Perform profile aggregation: collapse the guide-level rows into one row per gene
# by taking the median of each feature within a gene.
aggregate_columns = ["Metadata_Foci_Barcode_MatchedTo_GeneCode"]  # grouping key
aggregate_features = "infer"  # feature columns are chosen by pycytominer, not listed here
aggregate_operation = "median"

a549_median_gene_df = aggregate(
    a549_median_df,
    strata=aggregate_columns,
    features=aggregate_features,
    operation=aggregate_operation,
)
# Bare expression so the notebook renders the gene-level table.
a549_median_gene_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_HuMoment_1,Cells_AreaShape_HuMoment_6,...,Nuclei_Texture_SumVariance_ConA_5_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_01_256,Nuclei_Texture_SumVariance_DAPI_Painting_10_03_256,Nuclei_Texture_SumVariance_Mito_5_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_01_256,Nuclei_Texture_SumVariance_Phalloidin_10_03_256,Nuclei_Texture_SumVariance_Phalloidin_5_01_256,Nuclei_Texture_SumVariance_WGA_10_01_256,Nuclei_Texture_SumVariance_WGA_10_03_256,Nuclei_Texture_SumVariance_WGA_5_03_256
0,A1BG,-0.071933,-0.005162,0.048898,0.005224,0.014670,0.285610,0.226995,-0.189240,0.002904,...,-0.413400,0.004160,-0.214355,-0.059952,-0.217225,-0.099878,-0.135190,-0.321170,-0.281700,-0.256610
1,A1CF,-0.043870,-0.020819,0.031277,0.001221,0.394330,-0.010311,0.050353,0.019761,0.002908,...,-0.293670,-0.263560,-0.124159,0.006351,0.054605,-0.002325,0.103262,-0.095674,-0.222795,-0.102812
2,A2M,0.045538,-0.010160,-0.046016,0.003136,0.121663,0.224305,0.210295,-0.126460,0.002642,...,-0.114512,-0.187022,0.100243,-0.028514,-0.164265,-0.185640,-0.127717,-0.107746,-0.097464,-0.063409
3,A2ML1,0.027912,0.004839,0.041878,0.007637,0.082211,-0.102908,0.009968,-0.245457,0.002648,...,-0.167673,-0.138828,-0.089541,-0.058652,-0.130200,-0.210895,-0.139018,-0.090193,-0.044246,-0.130268
4,A3GALT2,-0.017885,0.002400,-0.028306,0.006864,0.032896,-0.187055,0.083699,-0.111916,0.002894,...,-0.362785,-0.297570,-0.261780,-0.089968,-0.178270,-0.150190,-0.136820,-0.066775,-0.074249,-0.196910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20388,ZYG11B,-0.061426,0.015783,-0.018821,0.013978,0.034580,-0.015917,0.103634,-0.221900,0.002901,...,-0.182105,-0.229673,-0.169857,-0.039879,-0.186320,-0.246110,-0.206195,-0.182590,-0.185433,-0.182660
20389,ZYX,-0.065264,0.003621,-0.017143,-0.011676,0.186640,-0.060132,0.410635,-0.103690,0.002750,...,-0.226165,-0.342180,-0.411980,-0.059490,-0.160440,-0.242200,-0.119252,-0.052917,-0.173740,-0.186615
20390,ZZEF1,0.184946,-0.015807,-0.018600,0.016341,0.211842,-0.093404,0.034704,-0.060925,0.002810,...,-0.385525,-0.159358,-0.159678,-0.071837,-0.060733,-0.139756,-0.071566,-0.196455,-0.261540,-0.177825
20391,ZZZ3,0.040050,-0.025408,0.000594,0.041732,-0.040540,0.067500,0.241397,-0.271200,0.002850,...,-0.309378,-0.096347,-0.040463,-0.056001,-0.026861,-0.109388,0.007185,-0.106997,-0.110468,-0.025339


In [6]:
# Save the gene-level profiles to the current working directory without the row index.
# With pandas' default compression='infer', the ".gz" suffix selects gzip output.
output_filename = '20200805_A549_WG_Screen_guide_normalized_feature_select_merged_median_ALLBATCHES___CP186___ALLWELLS_gene_aggregated.csv.gz'
a549_median_gene_df.to_csv(output_filename, index=False)