In [1]:
!python3 --version

Python 3.8.10


In [2]:
import os
import itertools

import pandas
import numpy
import pycytominer

from utils import calculate_percent_replicating_MOA

In [3]:
def do_feature_select(plate_df):
    """Run pycytominer feature selection on one plate's profile table.

    Parameters
    ----------
    plate_df : pandas.DataFrame
        Profile table for a single plate.

    Returns
    -------
    Whatever ``pycytominer.feature_select`` returns for ``plate_df`` when
    given the feature columns reported by
    ``pycytominer.cyto_utils.infer_cp_features`` and the four operations
    listed below.
    """
    # Operations are handed to pycytominer in exactly this order.
    selection_steps = [
        'variance_threshold',
        'correlation_threshold',
        'drop_na_columns',
        'blocklist',
    ]
    cp_features = pycytominer.cyto_utils.infer_cp_features(plate_df)

    return pycytominer.feature_select(
        profiles=plate_df,
        features=cp_features,
        operation=selection_steps,
    )

In [4]:
def _dropout_combinations(category_list):
    """List every combination of categories to drop, smallest combinations first.

    Sizes run from 0 to ``len(category_list) - 1``, so the combination that
    would drop every category is never produced.
    """
    combos = []
    for size in range(len(category_list)):
        # dict.fromkeys removes repeated combinations (only possible when
        # category_list itself contains repeats) while keeping a stable order,
        # so the processing order does not depend on set/hash ordering.
        combos.extend(dict.fromkeys(itertools.combinations(category_list, size)))
    return combos


def _columns_for_dropout(all_columns, dropout, subsample=None):
    """Choose which columns to load for one dropout combination.

    With ``subsample``, only columns whose name contains 'Metadata' or every
    subsample item are kept. Afterwards any column whose name contains one of
    the ``dropout`` items is removed (plain substring matching).
    """
    col_list = list(all_columns)
    if subsample:
        for each_item in subsample:
            metadata_list = [x for x in col_list if 'Metadata' in x]
            # Exclude Metadata columns from the matched part so a metadata
            # column that happens to contain the item is not listed twice.
            matched = [x for x in col_list
                       if each_item in x and 'Metadata' not in x]
            col_list = metadata_list + matched
    for each_item in dropout:
        col_list = [x for x in col_list if each_item not in x]
    return col_list


def all_combo_dropouts_in_a_category(category_list, outfile, subsample=None,
                                     profile_dir='../profiles-pilots/profiles/Stain5_CondC_Standard'):
    """Score every dropout combination of ``category_list`` and checkpoint to CSV.

    For each combination of categories (from dropping nothing up to dropping
    all but one), columns whose names contain a dropped category are excluded,
    plates BR00120270 through BR00120277 are loaded, passed through
    ``do_feature_select`` and scored with ``calculate_percent_replicating_MOA``
    (imported from ``utils``; its semantics are not visible here).

    One row per combination is written to ``outfile`` after every combination,
    so an interrupted run resumes where it stopped: combinations whose string
    form already appears in the file's 'Dropout' column are skipped.

    Parameters
    ----------
    category_list : list of str
        Substrings identifying the column groups that can be dropped.
    outfile : str
        CSV checkpoint path; read if it exists, (re)written after each
        combination. Its parent directory is created when missing.
    subsample : list of str, optional
        When given, restrict the columns to Metadata columns plus columns
        containing every listed substring before applying the dropout.
    profile_dir : str, optional
        Directory holding one sub-directory per plate. Defaults to the
        location the notebook has always used.

    Returns
    -------
    None. Prints ``describe()`` of the 'Mean' and 'Median' columns.
    """
    to_try = _dropout_combinations(category_list)

    if os.path.exists(outfile):
        dropout_df = pandas.read_csv(outfile)
        tried = list(dropout_df['Dropout'])
        print(f"Already try {tried},skipping")
        already_done = set(tried)
        # The CSV stores each tuple as its str() form, so compare on that.
        to_try = [x for x in to_try if str(x) not in already_done]
    else:
        dropout_df = pandas.DataFrame()

    if to_try:
        # Fail early (before any expensive work) if the checkpoint directory
        # cannot be created.
        out_dir = os.path.dirname(outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # Only the column names of the reference plate are needed, and they do
        # not change between combinations: read just the header, once.
        master_path = os.path.join(
            profile_dir, 'BR00120274', 'BR00120274_normalized_negcon.csv.gz')
        master_columns = pandas.read_csv(master_path, nrows=0).columns

    for eachdropout in to_try:
        col_list = _columns_for_dropout(master_columns, eachdropout, subsample)

        plate_scores = {}
        plate_feature_counts = {}
        for eachplate in range(70, 78):
            plate_name = f'BR001202{eachplate}'
            normalized = pandas.read_csv(
                os.path.join(profile_dir, plate_name,
                             f'{plate_name}_normalized_negcon.csv.gz'),
                usecols=col_list)
            feature_select = do_feature_select(normalized)
            plate_feature_counts[f'{eachplate}_featnum'] = len(feature_select.columns)
            plate_scores[plate_name] = calculate_percent_replicating_MOA(
                '', '', data_df=feature_select)

        # Summary statistics are taken over the per-plate scores only.
        results_array = numpy.array(list(plate_scores.values()))
        dropout_dict = {**plate_scores, **plate_feature_counts}
        dropout_dict['Mean'] = results_array.mean()
        dropout_dict['Median'] = numpy.median(results_array)
        dropout_dict['Standard deviation'] = numpy.std(results_array)
        dropout_dict['Dropout'] = eachdropout
        dropout_dict['Remaining'] = [x for x in category_list if x not in eachdropout]
        dropout_dict['n_columns'] = len(col_list)

        # DataFrame.append was removed in pandas 2.0; wrap the record in a
        # one-row frame (list-of-dicts keeps tuple/list values as single cells).
        new_row = pandas.DataFrame([dropout_dict])
        if dropout_df.empty:
            dropout_df = new_row
        else:
            dropout_df = pandas.concat([dropout_df, new_row], ignore_index=True)
        print(eachdropout, dropout_dict['Mean'])
        # Checkpoint after every combination so a crash loses at most one.
        dropout_df.to_csv(outfile, index=False)

    if not len(dropout_df.columns):
        # Nothing was loaded or computed (e.g. empty category_list and no
        # checkpoint), so there is nothing to summarise.
        return
    print(dropout_df['Mean'].describe())
    print(dropout_df['Median'].describe())

In [5]:
# Sweep dropout combinations over the three compartment name prefixes;
# results are checkpointed to (and resumed from) the CSV below.
all_combo_dropouts_in_a_category(
    category_list=['Nuclei','Cells','Cytoplasm'],
    outfile='checkpoint_csvs/compartment_dropout_combinations.csv',
)

Already try ['()', "('Cytoplasm',)", "('Cells',)", "('Nuclei',)", "('Nuclei', 'Cells')", "('Cells', 'Cytoplasm')", "('Nuclei', 'Cytoplasm')"],skipping
count    7.000000
mean     0.525397
std      0.022305
min      0.495833
25%      0.506250
50%      0.536111
75%      0.540972
max      0.551389
Name: Mean, dtype: float64
count    7.000000
mean     0.530159
std      0.019207
min      0.500000
25%      0.516667
50%      0.538889
75%      0.544444
max      0.550000
Name: Median, dtype: float64


In [6]:
# Sweep dropout combinations over the eight channel names;
# results are checkpointed to (and resumed from) the CSV below.
all_combo_dropouts_in_a_category(
    category_list=['DNA','RNA','ER','Mito','AGP','Brightfield','BFLow','BFHigh'],
    outfile='checkpoint_csvs/channel_dropout_combinations.csv',
)

Already try ['()', "('Mito',)", "('RNA',)", "('Brightfield',)", "('DNA',)", "('BFLow',)", "('BFHigh',)", "('ER',)", "('AGP',)", "('DNA', 'ER')", "('DNA', 'RNA')", "('RNA', 'ER')", "('ER', 'Mito')", "('ER', 'BFLow')", "('ER', 'Brightfield')", "('ER', 'BFHigh')", "('BFLow', 'BFHigh')", "('Brightfield', 'BFHigh')", "('Brightfield', 'BFLow')", "('DNA', 'Mito')", "('DNA', 'BFLow')", "('DNA', 'Brightfield')", "('DNA', 'BFHigh')", "('Mito', 'Brightfield')", "('Mito', 'BFLow')", "('Mito', 'BFHigh')", "('RNA', 'Brightfield')", "('AGP', 'BFLow')", "('AGP', 'BFHigh')", "('ER', 'AGP')", "('RNA', 'BFLow')", "('RNA', 'AGP')", "('RNA', 'Mito')", "('RNA', 'BFHigh')", "('AGP', 'Brightfield')", "('Mito', 'AGP')", "('DNA', 'AGP')", "('RNA', 'Mito', 'AGP')", "('RNA', 'AGP', 'BFLow')", "('Mito', 'Brightfield', 'BFLow')", "('DNA', 'ER', 'BFLow')", "('DNA', 'Mito', 'Brightfield')", "('DNA', 'ER', 'AGP')", "('ER', 'AGP', 'BFHigh')", "('Mito', 'AGP', 'Brightfield')", "('RNA', 'Mito', 'BFHigh')", "('DNA', 'ER',

In [7]:
# Sweep dropout combinations over the seven feature-group names;
# results are checkpointed to (and resumed from) the CSV below.
all_combo_dropouts_in_a_category(
    category_list=['AreaShape','Correlation','Granularity','Intensity','Neighbors','RadialDistribution','Texture'],
    outfile='checkpoint_csvs/feature_dropout_combinations.csv',
)

Already try ['()', "('Granularity',)", "('Neighbors',)", "('Texture',)", "('RadialDistribution',)", "('AreaShape',)", "('Intensity',)", "('Correlation',)", "('Intensity', 'Neighbors')", "('AreaShape', 'RadialDistribution')", "('Intensity', 'Texture')", "('AreaShape', 'Intensity')", "('Correlation', 'Granularity')", "('AreaShape', 'Neighbors')", "('Correlation', 'RadialDistribution')", "('AreaShape', 'Texture')", "('Correlation', 'Intensity')", "('Neighbors', 'RadialDistribution')", "('Correlation', 'Neighbors')", "('Correlation', 'Texture')", "('Neighbors', 'Texture')", "('Granularity', 'RadialDistribution')", "('AreaShape', 'Correlation')", "('Granularity', 'Intensity')", "('Granularity', 'Neighbors')", "('Granularity', 'Texture')", "('Intensity', 'RadialDistribution')", "('RadialDistribution', 'Texture')", "('AreaShape', 'Granularity')", "('Granularity', 'Neighbors', 'Texture')", "('AreaShape', 'Granularity', 'Intensity')", "('AreaShape', 'Granularity', 'Neighbors')", "('Intensity', 

In [None]:
# Repeat the feature-group dropout sweep once per channel, restricting the
# columns to that channel via `subsample`; each channel gets its own checkpoint CSV.
for eachchannel in ('DNA','RNA','ER','Mito','AGP','Brightfield','BFLow','BFHigh'):
    print(eachchannel)
    all_combo_dropouts_in_a_category(
        category_list=['AreaShape','Correlation','Granularity','Intensity','Neighbors','RadialDistribution','Texture'],
        outfile=f'checkpoint_csvs/{eachchannel}_only_feature_dropout_combinations.csv',
        subsample=[eachchannel],
    )
    

DNA
() 0.5208333333333333
('Neighbors',) 0.5166666666666666
('Intensity',) 0.5180555555555555
