In [1]:
import pandas
import pycytominer

In [2]:
# Read the three input tables in a fixed order: the profile table first, then
# the image table, then the MoA table. The last line shows the shape of the
# profile table as the cell output.
aggregated_features, image_metadata, moa_metadata = (
    pandas.read_csv(csv_name)
    for csv_name in (
        'ljosa_jbiomolscreen_2013_per_well_mean.csv',
        'BBBC021_v1_image.csv',
        'BBBC021_v1_moa.csv',
    )
)
aggregated_features.shape

(632, 518)

In [3]:
# The image_metadata has 4 sites per well, but we only want one here, so keep
# only the rows whose actin file name contains the '_s1_' tag.
# na=False treats a missing file name as "no match" instead of leaving NaN in
# the mask (boolean indexing rejects a mask that contains NaN); regex=False
# matches '_s1_' as a plain substring rather than as a regular expression.
stripped_image_metadata = image_metadata[
    image_metadata['Image_FileName_Actin'].str.contains('_s1_', na=False, regex=False)
]

In [4]:
# Attach the image metadata to each profile row, matching on plate and well.
# how='left' keeps every profile row, even one without a matching metadata row.
annotated_features = aggregated_features.merge(
    stripped_image_metadata,
    left_on=['Image_Metadata_Plate', 'Image_Metadata_Well'],
    right_on=['Image_Metadata_Plate_DAPI', 'Image_Metadata_Well_DAPI'],
    how='left',
)
# A left merge adds rows whenever a plate/well key occurs more than once on the
# right-hand side; fail loudly here instead of silently duplicating profiles.
assert len(annotated_features) == len(aggregated_features), (
    'merge changed the number of rows: image metadata has repeated plate/well keys'
)
annotated_features.shape

(632, 531)

In [5]:
# Give every MoA column that does not already mention 'Metadata' a
# 'Metadata_' prefix, then apply that mapping to the MoA table.
moa_columns = dict(
    (name, 'Metadata_' + name)
    for name in moa_metadata.columns
    if 'Metadata' not in name
)
moa_metadata = moa_metadata.rename(columns=moa_columns)

# Inner join on compound and concentration: only rows present in both tables
# are kept.
moa_annotated_features = pandas.merge(
    annotated_features,
    moa_metadata,
    how='inner',
    left_on=['Image_Metadata_Compound', 'Image_Metadata_Concentration'],
    right_on=['Metadata_compound', 'Metadata_concentration'],
)
moa_annotated_features.shape

(632, 534)

In [6]:
# Split the column names into those mentioning 'Metadata' and the rest, then
# put the metadata columns first (relative order inside each group is kept).
metadata_cols = list(filter(lambda name: 'Metadata' in name, moa_annotated_features.columns))
non_metadata_cols = list(filter(lambda name: 'Metadata' not in name, moa_annotated_features.columns))
reordered_annotated_features = moa_annotated_features.loc[:, metadata_cols + non_metadata_cols]

# Drop whatever precedes the first 'Metadata_' in a column name, so that e.g.
# 'Image_Metadata_Plate' becomes 'Metadata_Plate'.
namefixed_annotated_columns = {
    name: 'Metadata_' + name.split('Metadata_', 1)[1]
    for name in reordered_annotated_features.columns
    if 'Metadata_' in name
}
namefixed_annotated_features = reordered_annotated_features.rename(
    columns=namefixed_annotated_columns
)

# Columns removed by hand: the duplicated merge keys plus the image
# bookkeeping columns (table/image numbers, file names and paths, replicate).
columns_to_manually_remove = [
    'Metadata_Plate_DAPI',
    'Metadata_Well_DAPI',
    'Metadata_compound',
    'Metadata_concentration',
    'TableNumber',
    'ImageNumber',
    'Image_FileName_DAPI',
    'Image_PathName_DAPI',
    'Image_FileName_Tubulin',
    'Image_PathName_Tubulin',
    'Image_FileName_Actin',
    'Image_PathName_Actin',
    'Replicate',
]
cleaned_features = namefixed_annotated_features.drop(columns_to_manually_remove, axis='columns')
cleaned_features.shape

(632, 521)

In [7]:
# Save the annotated table twice: as a CSV without the index column, and as a
# GCT file through pycytominer.
cleaned_features.to_csv(
    path_or_buf='BBBC021_annotated.csv',
    index=False,
)

pycytominer.cyto_utils.write_gct(
    profiles=cleaned_features,
    output_file='BBBC021_annotated.gct',
)

In [8]:
# Normalize the annotated profiles with pycytominer's 'mad_robustize' method;
# every other option is left at the library default.
normalized = pycytominer.normalize(
    profiles=cleaned_features,
    method='mad_robustize',
)

# Save the normalized table as a CSV (no index column) and as a GCT file.
normalized.to_csv(
    path_or_buf='BBBC021_normalized_mad_robustize.csv',
    index=False,
)

pycytominer.cyto_utils.write_gct(
    profiles=normalized,
    output_file='BBBC021_normalized_mad_robustize.gct',
)

In [9]:
# Run pycytominer feature selection on the normalized profiles, requesting the
# four operations below in this order.
feature_selected = pycytominer.feature_select(
    profiles=normalized,
    operation=[
        'variance_threshold',
        'correlation_threshold',
        'drop_na_columns',
        'blocklist',
    ],
)

# Save the feature-selected table as a CSV (no index column) and as a GCT file.
feature_selected.to_csv(
    path_or_buf='BBBC021_feature_selected_mad_robustize.csv',
    index=False,
)

pycytominer.cyto_utils.write_gct(
    profiles=feature_selected,
    output_file='BBBC021_feature_selected_mad_robustize.gct',
)