In [1]:
import glob
import pandas as pd
from preprocess import preprocess_profiles

### Preprocessing parameters

In [2]:
# Settings for the preprocessing stage.
# NOTE(review): none of these three names is referenced again in the visible
# cells of this notebook -- confirm how (or whether) preprocess_profiles uses them.
MAD_EPSILON = 0.0
INC_IMAGE_FEATURES = True
FEAT_SELECT_OPS = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
]

### List plate maps with overlapping ORFs

See https://github.com/jump-cellpainting/jump-cellpainting/issues/78#issuecomment-805942281

In [3]:
# Plate map names (matched against Metadata_Plate_Map_Name further down),
# one per line; splitting on whitespace yields the list in the order written.
JCP_OVERLAP_PLATES = """
    OKA05.06.07.08.A
    OAB84.85.86.87.A
    OAA97.98.99.XX.A
    OAB25.26.27.28.A
    OAB41.OAC17.OAB78.79.A
    OAA49.59.79.80.A
    OAA58.60.61.62.A
    OAA85.86.87.88.A
    OAB13.14.15.16.A
    OAB33.34.35.36.A
    OAB37.38.39.40.A
""".split()

### Read platemaps

The platemaps must first be pulled from https://github.com/jump-cellpainting/jump-orf-data/tree/master/metadata/platemaps (the next cell expects that repository to be checked out at `../../jump-orf-data`).

In [4]:
# Collect every platemap CSV found two directory levels below platemaps/.
# NOTE(review): glob.glob returns paths in arbitrary, OS-dependent order, and the
# later "first N plates per plate map" selection depends on row order. Consider
# sorting the paths if the selection must be reproducible across machines.
platemap_files = glob.glob("../../jump-orf-data/metadata/platemaps/*/*.csv")
assert platemap_files, "no platemap CSVs found under ../../jump-orf-data/metadata/platemaps"

# Stack all CSVs into one frame with a fresh 0..n-1 index.
platemaps = pd.concat((pd.read_csv(f) for f in platemap_files), ignore_index=True)

# Use `not`, not `~`: bitwise inversion of a plain Python bool gives -1 / -2,
# both truthy, so `assert ~flag` can never fail for a built-in bool.
assert not platemaps.duplicated().any(), "duplicated rows in platemaps"

# Columns are renamed by position to the Metadata_* names used downstream.
platemaps.columns = ["Metadata_Plate", "Metadata_Plate_Map_Name"]
platemaps

Unnamed: 0,Metadata_Plate,Metadata_Plate_Map_Name
0,BR00126544,control
1,BR00121426,JUMP-Target-2_compound_platemap
2,BR00126542,OAB37.38.39.40.A
3,BR00126541,OAB37.38.39.40.A
4,BR00126540,OAB37.38.39.40.A
...,...,...
272,BR00123627,OAA41.42.43.OAB45.A
273,BR00123628,OAA41.42.43.OAB45.A
274,BR00123629,OAA41.42.43.OAB45.A
275,BR00123621,OAA41.42.43.OAB45.A


Show the distribution of the number of replicate plates per plate map.

In [5]:
# Count rows per plate map, then tally how many plate maps share each count.
plates_per_map = platemaps.groupby("Metadata_Plate_Map_Name").count()
plates_per_map.value_counts()

Metadata_Plate
5                 41
6                  2
10                 2
4                  1
14                 1
22                 1
dtype: int64

### Select first 5 plates for each overlapping platemap

In [6]:
# Number of plates kept per plate map (the old comment said 2; the code takes 5).
N_PLATES_PER_MAP = 5

# Keep only rows whose plate map is one of the JCP overlap plate maps.
is_overlap = platemaps["Metadata_Plate_Map_Name"].isin(JCP_OVERLAP_PLATES)
overlap_plates = platemaps[is_overlap]

# Take the first N_PLATES_PER_MAP rows of each plate map, in the row order of
# `platemaps`; plate maps with fewer rows keep all of them.
overlap_plates = overlap_plates.groupby("Metadata_Plate_Map_Name").head(N_PLATES_PER_MAP)

overlap_plates

Unnamed: 0,Metadata_Plate,Metadata_Plate_Map_Name
2,BR00126542,OAB37.38.39.40.A
3,BR00126541,OAB37.38.39.40.A
4,BR00126540,OAB37.38.39.40.A
5,BR00126539,OAB37.38.39.40.A
6,BR00126538,OAB37.38.39.40.A
41,BR00125164,OAA58.60.61.62.A
42,BR00125163,OAA58.60.61.62.A
43,BR00125162,OAA58.60.61.62.A
44,BR00124785,OAA58.60.61.62.A
45,BR00124784,OAA58.60.61.62.A


### Read profiles

In [7]:
# Load the profiles from the parquet file and preview the first five rows.
ann_df = pd.read_parquet(path="output/raw_filtered_profiles.parquet")
ann_df.head(n=5)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,82.535004,82.875999,76.996002,77.473999,76.582001,77.233002,78.186996,80.055,77.632004,79.955002
1,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,72.304001,70.678001,67.073997,67.804001,67.042,67.779999,68.393997,70.334999,68.253998,70.283997
2,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,91.155998,89.515999,84.224998,84.994003,84.225998,85.214996,85.594002,88.263,85.778,88.174004
3,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,71.458,70.707001,66.263,67.061996,66.330002,67.092003,67.583,69.57,67.693001,69.511002
4,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,62.820999,61.692001,58.573002,58.917,58.855999,59.345001,59.471001,61.16,59.66,61.311001


### Keep only selected plates

In [8]:
# Inner join on the plate barcode: only profiles from the selected plates
# survive, and the columns of overlap_plates are attached to each row.
ann_df = pd.merge(ann_df, overlap_plates, how="inner", on="Metadata_Plate")
print(f"{ann_df.shape=}")

# Non-null Metadata_Well entries per plate, as a quick sanity check.
wells_per_plate = ann_df.groupby("Metadata_Plate")["Metadata_Well"].count()
wells_per_plate

ann_df.shape=(19144, 4778)


Metadata_Plate
BR00117037    353
BR00117038    353
BR00117039    353
BR00117040    353
BR00117041    353
BR00121552    268
BR00121553    268
BR00121554    268
BR00121555    268
BR00121556    268
BR00123613    358
BR00123614    358
BR00123616    358
BR00123617    358
BR00123618    358
BR00123785    355
BR00123786    355
BR00123787    355
BR00123790    355
BR00123791    355
BR00123945    354
BR00123947    354
BR00123948    354
BR00123949    354
BR00123950    354
BR00123951    354
BR00124766    354
BR00124767    354
BR00124768    354
BR00124769    354
BR00124784    354
BR00124785    354
BR00124787    360
BR00124788    360
BR00125162    354
BR00125163    354
BR00125164    354
BR00125619    359
BR00125620    360
BR00125621    360
BR00126056    357
BR00126057    357
BR00126058    357
BR00126059    357
BR00126060    357
BR00126395    359
BR00126396    359
BR00126397    359
BR00126398    359
BR00126399    359
BR00126538    357
BR00126539    357
BR00126540    357
BR00126541    357
BR00126542   

### Save subset profiles

In [9]:
# Write the plate-filtered profiles to disk, leaving the index out of the file.
ann_df.to_parquet(
    path="output/subset_profiles.parquet",
    index=False,
)

### Preprocess subset features

In [10]:
# Run the project's preprocessing on the subset; the call is the same as
# preprocess_profiles(ann_df), written in pipe form.
# NOTE(review): MAD_EPSILON, INC_IMAGE_FEATURES and FEAT_SELECT_OPS are defined
# earlier in this notebook but are not passed here -- confirm that
# preprocess_profiles applies the intended settings.
ann_df = ann_df.pipe(preprocess_profiles)
ann_df.shape

(19143, 584)

### Save processed subset profiles

In [11]:
# Write the preprocessed subset to disk, leaving the index out of the file.
ann_df.to_parquet(
    path="output/subset_processed_profiles.parquet",
    index=False,
)