# Load profiles and save them as a parquet file

## Import libraries

In [12]:
from pathlib import Path

import pandas as pd
from load_profiles import load_profiles

## Load metadata

In [13]:
# Read the four gzipped metadata tables in one pass; the names on the left are
# bound in the same order as the paths listed below.
plates, wells, compound, orf = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../datasets/metadata/plate.csv.gz",
        "../../datasets/metadata/well.csv.gz",
        "../../datasets/metadata/compound.csv.gz",
        "../../datasets/metadata/orf.csv.gz",
    )
)


## Load raw profiles

Setting `columns = None` below will load all of the features.

<div class="alert alert-warning">
WARNING: Files are located in S3. The commented-out `columns` example below loads only two features per plate; loading many features and/or many plates can take several minutes.
</div>

In [14]:
# Which profiles to load: a single source and batch of the dataset.
profile_selection = {
    "dataset": "cpg0016-jump-fixed",
    "source": "source_4",
    "batch": "2021_06_14_Batch6",
}
# Optional keyword arguments, currently disabled (kept for reference):
#   plate="BR00124787"
#   columns=["Metadata_Source", "Metadata_Plate", "Metadata_Well", "Cells_AreaShape_Eccentricity", "Nuclei_AreaShape_Area"]
#   output="input/raw_profiles_batch6.parquet"
profiles = load_profiles(**profile_selection)

2023-04-05 11:33:41 INFO     Loading profiles from cellpainting-gallery/cpg0016-jump-fixed/source_4/workspace/profiles/2021_06_14_Batch6
2023-04-05 11:34:01 INFO     Found 17 files
2023-04-05 11:34:01 INFO     Load profiles...


### Remove extra schema columns

See https://github.com/broadinstitute/position-effect-correction/pull/4#discussion_r1158626931

In [24]:
# Drop the extra schema columns (see the linked PR discussion above).
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
schema_columns = ["dataset", "source", "workspace", "profiles", "batch", "plate"]
profiles = profiles.drop(columns=schema_columns, errors="ignore")
profiles.columns


Index(['Metadata_Source', 'Metadata_Plate', 'Metadata_Well',
       'Cells_AreaShape_Area', 'Cells_AreaShape_BoundingBoxArea',
       'Cells_AreaShape_BoundingBoxMaximum_X',
       'Cells_AreaShape_BoundingBoxMaximum_Y',
       'Cells_AreaShape_BoundingBoxMinimum_X',
       'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_Center_X',
       ...
       'Nuclei_Texture_Variance_RNA_10_02_256',
       'Nuclei_Texture_Variance_RNA_10_03_256',
       'Nuclei_Texture_Variance_RNA_3_00_256',
       'Nuclei_Texture_Variance_RNA_3_01_256',
       'Nuclei_Texture_Variance_RNA_3_02_256',
       'Nuclei_Texture_Variance_RNA_3_03_256',
       'Nuclei_Texture_Variance_RNA_5_00_256',
       'Nuclei_Texture_Variance_RNA_5_01_256',
       'Nuclei_Texture_Variance_RNA_5_02_256',
       'Nuclei_Texture_Variance_RNA_5_03_256'],
      dtype='object', length=4765)

### Join features with metadata


In [26]:
# Step 1: attach well positions to each ORF record via the shared JCP identifier.
metadata = pd.merge(orf, wells, on="Metadata_JCP2022")

# Step 2: attach the profile features to every annotated well.
well_keys = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
ann_dframe = pd.merge(metadata, profiles, on=well_keys)
print(f"{ann_dframe.shape=}")

# Number of (non-null) wells that survived the joins, per plate.
ann_dframe.groupby("Metadata_Plate")["Metadata_Well"].count()


ann_dframe.shape=(5500, 4777)


Metadata_Plate
BR00123945    364
BR00124761    368
BR00124766    364
BR00124767    364
BR00124768    364
BR00124769    364
BR00124770    368
BR00124771    368
BR00124772    368
BR00124773    368
BR00124775    368
BR00124776    368
BR00124777    368
BR00124778    368
BR00124779    368
Name: Metadata_Well, dtype: int64

Print a sample of rows from `ann_dframe` (only `Metadata_` columns)

In [27]:
# Show five random rows, restricted to the `Metadata_` columns.
# A fixed `random_state` makes the displayed sample identical on every run.
ann_dframe.filter(regex="^Metadata_").sample(5, random_state=42)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well
2324,JCP2022_908893,ccsbBroad304_09532,ORF005554.1_TRC304.1,pLX_304,NM_152393.4,KLHL40,131377,9606,kelch like family member 40,99.6,1863.0,trt,source_4,BR00124777,B17
364,JCP2022_901313,ccsbBroad304_01402,ORF013020.1_TRC304.1,pLX_304,NM_000326.5,RLBP1,6017,9606,retinaldehyde binding protein 1,100.0,951.0,trt,source_4,BR00124779,P14
5134,JCP2022_915128,ccsbBroad304_99985,ORFC00001.1_TRC304.1,pLX_304,BFP.1,BFP,BFP,CONTROL,Hahn Lab BFP,99.5,717.0,negcon,source_4,BR00124767,A01
4947,JCP2022_914710,ccsbBroad304_15727,ORF005400.1_TRC304.1,pLX_304,NM_001191006.3,SRSF10,10772,9606,serine and arginine rich splicing factor 10,100.0,519.0,trt,source_4,BR00124773,O02
542,JCP2022_901919,ccsbBroad304_02056,ORF016768.1_TRC304.1,pLX_304,NM_004198.3,CHRNA6,8973,9606,cholinergic receptor nicotinic alpha 6 subunit,100.0,1482.0,trt,source_4,BR00124775,G19


## Save profiles

In [28]:
# Write the annotated profiles without the positional index.
# The target directory is created first so the cell also works on a fresh
# checkout where `output/` does not exist yet.
output_path = Path("output/metadata_profiles_batch6.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
ann_dframe.to_parquet(output_path, index=False)

## Subsample for testing

In [18]:
# Names of ten plate maps. Presumably these are `Metadata_Plate_Map_Name` values
# chosen for the subsample ("OAB84.85.86.87.A" is compared against that column in
# a later cell) — TODO confirm. The list is not referenced by any other visible cell.
OVERLAP_PLATES = [
    "OAB84.85.86.87.A",
    "OAA97.98.99.XX.A",
    "OAB25.26.27.28.A",
    "OAB41.OAC17.OAB78.79.A",
    "OAA49.59.79.80.A",
    "OAA58.60.61.62.A",
    "OAA85.86.87.88.A",
    "OAB13.14.15.16.A",
    "OAB33.34.35.36.A",
    "OAB37.38.39.40.A",
]

In [24]:
# Read the raw profiles table from the local parquet file and display it.
raw_profiles_path = "input/raw_profiles.parquet"
profiles = pd.read_parquet(raw_profiles_path)
profiles

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate
0,source_4,BR00117035,A01,5834.000000,12214.000000,569.679993,552.330017,460.510010,444.850006,515.229980,...,34.516998,35.355000,34.396000,35.499001,2021_04_26_Batch1,BR00117035,,,,
1,source_4,BR00117035,A02,5463.799805,11298.000000,573.630005,521.140015,469.579987,416.619995,521.219971,...,41.456001,42.639999,41.368000,42.675999,2021_04_26_Batch1,BR00117035,,,,
2,source_4,BR00117035,A03,5416.000000,10838.000000,602.969971,526.530029,501.630005,423.839996,551.570007,...,43.127998,44.056000,43.116001,44.318001,2021_04_26_Batch1,BR00117035,,,,
3,source_4,BR00117035,A04,5949.799805,12099.000000,606.369995,542.229980,498.839996,433.579987,552.299988,...,48.040001,49.353001,47.851002,49.601002,2021_04_26_Batch1,BR00117035,,,,
4,source_4,BR00117035,A05,5820.700195,11727.000000,574.479980,538.989990,469.359985,430.269989,521.330017,...,37.455002,38.362000,37.421001,38.554001,2021_04_26_Batch1,BR00117035,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106347,source_4,BR00127149,P20,3994.500000,7444.100098,585.150024,593.719971,499.470001,508.179993,541.820007,...,48.408001,49.757999,48.396000,49.648998,2021_08_30_Batch13,BR00127149,,,,
106348,source_4,BR00127149,P21,3768.500000,7743.100098,574.130005,595.590027,484.209991,510.839996,528.650024,...,43.914001,44.806000,43.602001,45.058998,2021_08_30_Batch13,BR00127149,,,,
106349,source_4,BR00127149,P22,3907.800049,7337.000000,558.799988,587.570007,474.859985,502.049988,516.130005,...,46.410999,47.698002,46.439999,47.862000,2021_08_30_Batch13,BR00127149,,,,
106350,source_4,BR00127149,P23,3965.000000,7350.500000,591.299988,591.190002,506.019989,506.510010,548.289978,...,32.987000,33.810001,33.090000,33.995998,2021_08_30_Batch13,BR00127149,,,,


In [20]:
profiles = pd.read_parquet("input/raw_profiles.parquet")

# Inner-join the ORF/well metadata with the profiles on the well identifier
# (source, plate, well), then report the size of the annotated frame.
join_keys = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
ann_dframe = pd.merge(metadata, profiles, on=join_keys)
print(f"{ann_dframe.shape=}")
ann_dframe

ann_dframe.shape=(86699, 4783)


Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,78.186996,80.055000,77.632004,79.955002,2021_06_21_Batch7,BR00125170,,,,
1,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,68.393997,70.334999,68.253998,70.283997,2021_06_21_Batch7,BR00125171,,,,
2,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,85.594002,88.263000,85.778000,88.174004,2021_06_21_Batch7,BR00125172,,,,
3,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,67.583000,69.570000,67.693001,69.511002,2021_06_21_Batch7,BR00125173,,,,
4,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,59.471001,61.160000,59.660000,61.311001,2021_06_21_Batch7,BR00125174,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86694,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,48.321999,49.332001,48.254002,49.356998,2021_08_23_Batch12,BR00126718,,,,
86695,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,62.254002,63.467999,62.254002,63.646000,2021_08_23_Batch12,BR00126718,,,,
86696,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,54.183998,55.118999,54.115002,55.305000,2021_08_23_Batch12,BR00126718,,,,
86697,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,49.897999,51.120998,49.785999,50.945999,2021_08_23_Batch12,BR00126718,,,,


In [23]:
# Display the column labels of `profiles` as it is currently bound.
profiles.columns

Index(['Metadata_Source', 'Metadata_Plate', 'Metadata_Well',
       'Cells_AreaShape_Area', 'Cells_AreaShape_BoundingBoxArea',
       'Cells_AreaShape_BoundingBoxMaximum_X',
       'Cells_AreaShape_BoundingBoxMaximum_Y',
       'Cells_AreaShape_BoundingBoxMinimum_X',
       'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_Center_X',
       ...
       'Nuclei_Texture_Variance_RNA_5_00_256',
       'Nuclei_Texture_Variance_RNA_5_01_256',
       'Nuclei_Texture_Variance_RNA_5_02_256',
       'Nuclei_Texture_Variance_RNA_5_03_256', 'dataset', 'source',
       'workspace', 'profiles', 'batch', 'plate'],
      dtype='object', length=4771)

In [9]:
import glob

# pull from https://github.com/jump-cellpainting/jump-orf-data/tree/master/metadata/platemaps
# sorted() makes the concatenation order independent of the filesystem listing order.
platemap_files = sorted(glob.glob("../../jump-orf-data/metadata/platemaps/*/*.csv"))
if not platemap_files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        "No platemap CSV files found under ../../jump-orf-data/metadata/platemaps/"
    )
platemaps = pd.concat((pd.read_csv(f) for f in platemap_files), ignore_index=True)
# `not` rather than `~`: on a plain Python bool `~True` is -2 (truthy), which
# would make the assertion impossible to fail.
assert not platemaps.duplicated().any(), "duplicated rows found in platemap files"

# Rename the two columns so they can be joined on `Metadata_Plate`.
platemaps.columns = ["Metadata_Plate", "Metadata_Plate_Map_Name"]
platemaps


Unnamed: 0,Metadata_Plate,Metadata_Plate_Map_Name
0,BR00126544,control
1,BR00121426,JUMP-Target-2_compound_platemap
2,BR00126542,OAB37.38.39.40.A
3,BR00126541,OAB37.38.39.40.A
4,BR00126540,OAB37.38.39.40.A
...,...,...
272,BR00123627,OAA41.42.43.OAB45.A
273,BR00123628,OAA41.42.43.OAB45.A
274,BR00123629,OAA41.42.43.OAB45.A
275,BR00123621,OAA41.42.43.OAB45.A


In [10]:
# Add the plate-map name of every plate to the annotated profiles.
# Any `Metadata_Plate_Map_Name` column left by a previous run of this cell is
# dropped first; otherwise re-running would merge again and create suffixed
# `_x` / `_y` duplicates of the column.
ann_dframe = ann_dframe.drop(
    columns="Metadata_Plate_Map_Name", errors="ignore"
).merge(platemaps, on="Metadata_Plate")
ann_dframe

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate,Metadata_Plate_Map_Name
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,80.055000,77.632004,79.955002,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
1,JCP2022_900011,ccsbBroad304_00013,ORF009063.1_TRC304.1,pLX_304,NM_001612.6,ACRV1,56,9606,acrosomal vesicle protein 1,100.0,...,92.719002,89.843002,92.597000,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
2,JCP2022_900033,ccsbBroad304_00037,ORF015627.1_TRC304.1,pLX_304,NM_001136.5,AGER,177,9606,advanced glycosylation end-product specific re...,100.0,...,131.880005,127.940002,131.960007,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
3,JCP2022_900063,ccsbBroad304_00069,ORF005433.1_TRC304.1,pLX_304,NM_001153.5,ANXA4,307,9606,annexin A4,100.0,...,85.179001,82.646004,85.292999,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
4,JCP2022_900084,ccsbBroad304_00091,ORF014376.1_TRC304.1,pLX_304,NM_001651.4,AQP5,362,9606,aquaporin 5,100.0,...,90.223000,87.663002,90.227997,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86694,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,44.397999,43.558998,44.814999,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86695,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,46.152000,45.151001,46.422001,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86696,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,33.742001,32.956001,33.784000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86697,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,40.847000,40.073002,40.930000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A


In [19]:
# check JCP overlap between plates, https://github.com/jump-cellpainting/jump-cellpainting/issues/78#issuecomment-805942281

plate_map_names = ann_dframe["Metadata_Plate_Map_Name"]
# JCP identifiers present on each of the two plate maps (missing ids are ignored).
jcp_on_oka = set(
    ann_dframe.loc[plate_map_names == "OKA05.06.07.08.A", "Metadata_JCP2022"].dropna()
)
jcp_on_oab = set(
    ann_dframe.loc[plate_map_names == "OAB84.85.86.87.A", "Metadata_JCP2022"].dropna()
)
# sorted() gives a reproducible order; iterating a set of strings can differ
# from one interpreter run to the next.
overlap = sorted(jcp_on_oka & jcp_on_oab)
print(f"{overlap=}")

# All rows (from any plate) whose perturbation is shared by the two plate maps.
ann_dframe.loc[ann_dframe["Metadata_JCP2022"].isin(overlap)]

overlap=['JCP2022_915131', 'JCP2022_915128', 'JCP2022_901127', 'JCP2022_908011', 'JCP2022_900475', 'JCP2022_900001', 'JCP2022_910867', 'JCP2022_912974', 'JCP2022_906380', 'JCP2022_915130', 'JCP2022_915129', 'JCP2022_912981', 'JCP2022_915132']


Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate,Metadata_Plate_Map_Name
340,JCP2022_900001,BAD CONSTRUCT,BAD CONSTRUCT,pLX_304,,,,,,,...,78.267998,75.882004,78.258003,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
341,JCP2022_900001,BAD CONSTRUCT,BAD CONSTRUCT,pLX_304,,,,,,,...,86.281998,83.939003,85.920998,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
342,JCP2022_900001,BAD CONSTRUCT,BAD CONSTRUCT,pLX_304,,,,,,,...,104.739998,102.239998,104.930000,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
343,JCP2022_900001,BAD CONSTRUCT,BAD CONSTRUCT,pLX_304,,,,,,,...,145.500000,140.309998,144.050003,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
344,JCP2022_915128,ccsbBroad304_99985,ORFC00001.1_TRC304.1,pLX_304,BFP.1,BFP,BFP,CONTROL,Hahn Lab BFP,99.5,...,63.132999,61.470001,63.311001,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86694,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,44.397999,43.558998,44.814999,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86695,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,46.152000,45.151001,46.422001,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86696,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,33.742001,32.956001,33.784000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86697,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,40.847000,40.073002,40.930000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A


In [12]:
# Anti join on `Metadata_JCP2022`: keep the rows of `wells` whose identifier
# does not occur in `orf`.
has_orf_record = wells["Metadata_JCP2022"].isin(orf["Metadata_JCP2022"])
wells[~has_orf_record]



Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_10,Dest210531-152149,A01,JCP2022_085227
1,source_10,Dest210531-152149,A02,JCP2022_033924
2,source_10,Dest210531-152149,A03,JCP2022_056163
3,source_10,Dest210531-152149,A04,JCP2022_054175
4,source_10,Dest210531-152149,A05,JCP2022_999999
...,...,...,...,...
586613,source_8,A1170544,P20,JCP2022_041390
586614,source_8,A1170544,P21,JCP2022_999999
586615,source_8,A1170544,P22,JCP2022_024363
586616,source_8,A1170544,P23,JCP2022_033924
