# Load profiles and save them as a parquet file

## Import libraries

In [1]:
import pandas as pd
from load_profiles import load_profiles

## Load metadata

In [2]:
# Read the four gzipped metadata tables in one pass; the order of the paths
# matches the order of the names on the left-hand side.
plates, wells, compound, orf = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../datasets/metadata/plate.csv.gz",
        "../../datasets/metadata/well.csv.gz",
        "../../datasets/metadata/compound.csv.gz",
        "../../datasets/metadata/orf.csv.gz",
    )
)


## Download raw profiles

Setting `columns = None` below will load all of the features.

<div class="alert alert-warning">
WARNING: Files are located in S3. The call below loads only two features (plus the three `Metadata_` key columns) for each plate in the selected batch; loading many features and/or many plates can take several minutes.
</div>

In [3]:
# The three `Metadata_` keys are the columns the later merges join on; the two
# remaining entries are the example features kept to limit the download.
profile_columns = [
    "Metadata_Source",
    "Metadata_Plate",
    "Metadata_Well",
    "Cells_AreaShape_Eccentricity",
    "Nuclei_AreaShape_Area",
]

# No `plate` argument is passed (e.g. plate="BR00124787"), so the request is
# not narrowed to a single plate -- presumably every plate of the batch is
# loaded; confirm against `load_profiles`.
load_profiles(
    dataset="cpg0016-jump",
    source="source_4",
    batch="2021_06_14_Batch6",
    columns=profile_columns,
    output="input/raw_profiles_batch6.parquet",
)

2023-04-04 17:34:06 INFO     Loading columns: ['Metadata_Source', 'Metadata_Plate', 'Metadata_Well', 'Cells_AreaShape_Eccentricity', 'Nuclei_AreaShape_Area']
2023-04-04 17:34:06 INFO     Loading profiles from cellpainting-gallery/cpg0016-jump/source_4/workspace/profiles/2021_06_14_Batch6


## Loading profiles


In [None]:
# Read back the parquet file written by the `load_profiles` call and show
# how many well rows each plate contributes.
profiles = pd.read_parquet("input/raw_profiles_batch6.parquet")
profiles["Metadata_Well"].groupby(profiles["Metadata_Plate"]).count()


Metadata_Plate
BR00121429    384
BR00123945    384
BR00124761    384
BR00124766    384
BR00124767    384
BR00124768    384
BR00124769    384
BR00124770    384
BR00124771    384
BR00124772    384
BR00124773    384
BR00124774    384
BR00124775    384
BR00124776    384
BR00124777    384
BR00124778    384
BR00124779    384
Name: Metadata_Well, dtype: int64

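Join features with metadata. Both merges below are inner joins, so rows without a match on both sides (for example wells that carry no ORF annotation) are dropped.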


In [None]:
# Step 1: give every ORF perturbation its well positions.
metadata = pd.merge(orf, wells, on="Metadata_JCP2022")

# Step 2: attach the measured features, matching on the well coordinates.
# Both merges rely on pandas' default `how="inner"`.
well_keys = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
ann_dframe = pd.merge(metadata, profiles, on=well_keys)

print(f"{ann_dframe.shape=}")
ann_dframe["Metadata_Well"].groupby(ann_dframe["Metadata_Plate"]).count()


Metadata_Plate
BR00123945    364
BR00124761    368
BR00124766    364
BR00124767    364
BR00124768    364
BR00124769    364
BR00124770    368
BR00124771    368
BR00124772    368
BR00124773    368
BR00124775    368
BR00124776    368
BR00124777    368
BR00124778    368
BR00124779    368
Name: Metadata_Well, dtype: int64

Print sample of rows from `ann_dframe` (only `Metadata_` columns)

In [None]:
# Show five random rows, restricted to the `Metadata_` columns.
# `random_state` pins the draw so that Restart & Run All displays the same
# rows every time instead of a fresh random sample.
ann_dframe.filter(regex="^Metadata_").sample(5, random_state=0)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well
4034,JCP2022_913764,ccsbBroad304_14738,ORF012652.2_TRC304.1,pLX_304,NM_002611.5,PDK2,5164,9606,pyruvate dehydrogenase kinase 2,100.0,1221.0,trt,source_4,BR00124769,A13
3479,JCP2022_913190,ccsbBroad304_14135,ORF001119.1_TRC304.1,pLX_304,NM_015960.3,CUTC,51076,9606,cutC copper transporter,59.3,484.0,trt,source_4,BR00124779,O08
3858,JCP2022_913687,ccsbBroad304_14658,ORF007297.2_TRC304.1,pLX_304,NM_002082.3,GRK6,2870,9606,G protein-coupled receptor kinase 6,99.8,1767.0,trt,source_4,BR00124768,F15
2883,JCP2022_910912,ccsbBroad304_11704,ORF003091.1_TRC304.1,pLX_304,XM_011525638.3,ANKRD12,23253,9606,ankyrin repeat domain 12,16.4,990.0,trt,source_4,BR00124778,D01
1519,JCP2022_906081,ccsbBroad304_06499,ORF006387.1_TRC304.1,pLX_304,NM_001320611.1,KPNA2,3838,9606,karyopherin subunit alpha 2,99.6,1587.0,trt,source_4,BR00124777,I22


## Save profiles

In [None]:
# Persist the annotated batch-6 profiles; `index=False` keeps the DataFrame
# index out of the parquet file.
annotated_path = "output/metadata_profiles_batch6.parquet"
ann_dframe.to_parquet(annotated_path, index=False)

## Subsample for testing

In [None]:
# Replace `profiles` with the contents of "input/raw_profiles.parquet" and
# rebuild `ann_dframe` from the `metadata` table created earlier.
# NOTE(review): no cell visible in this notebook writes this file -- confirm
# where it is produced.
profiles = pd.read_parquet("input/raw_profiles.parquet")
ann_dframe = pd.merge(
    metadata,
    profiles,
    on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"],
)
print(f"{ann_dframe.shape=}")

ann_dframe.shape=(86699, 4783)


In [None]:
import glob

# Collect every platemap CSV; sorting makes the row order of the concatenated
# table independent of the order in which the filesystem lists the files.
platemap_files = sorted(glob.glob("../../jump-orf-data/metadata/platemaps/*/*.csv"))
assert platemap_files, "no platemap CSV files matched the glob pattern"

platemaps = pd.concat((pd.read_csv(f) for f in platemap_files), ignore_index=True)

# Use `not`, never `~`, on the scalar result: on a plain Python bool `~True`
# is -2 and `~False` is -1, both truthy, so the original check could pass
# even when duplicated rows were present.
assert not platemaps.duplicated().any(), "duplicated rows found in platemaps"

# Positional rename: assumes each CSV holds exactly two columns, in the order
# (plate barcode, plate map name) -- TODO confirm against the platemap files.
platemaps.columns = ["Metadata_Plate", "Metadata_Plate_Map_Name"]
platemaps


Unnamed: 0,Metadata_Plate,Metadata_Plate_Map_Name
0,BR00126544,control
1,BR00121426,JUMP-Target-2_compound_platemap
2,BR00126542,OAB37.38.39.40.A
3,BR00126541,OAB37.38.39.40.A
4,BR00126540,OAB37.38.39.40.A
...,...,...
272,BR00123627,OAA41.42.43.OAB45.A
273,BR00123628,OAA41.42.43.OAB45.A
274,BR00123629,OAA41.42.43.OAB45.A
275,BR00123621,OAA41.42.43.OAB45.A


In [None]:
# Attach the plate-map name to every profile row (inner join on the plate).
# NOTE(review): the result overwrites `ann_dframe`, so running this cell a
# second time merges again and yields suffixed duplicate name columns.
ann_dframe = pd.merge(ann_dframe, platemaps, on="Metadata_Plate")
ann_dframe

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,...,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate,Metadata_Plate_Map_Name
0,JCP2022_900002,ccsbBroad304_00001,ORF008415.1_TRC304.1,pLX_304,NM_001160173.3,NAT1,9,9606,N-acetyltransferase 1,100.0,...,80.055000,77.632004,79.955002,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
1,JCP2022_900011,ccsbBroad304_00013,ORF009063.1_TRC304.1,pLX_304,NM_001612.6,ACRV1,56,9606,acrosomal vesicle protein 1,100.0,...,92.719002,89.843002,92.597000,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
2,JCP2022_900033,ccsbBroad304_00037,ORF015627.1_TRC304.1,pLX_304,NM_001136.5,AGER,177,9606,advanced glycosylation end-product specific re...,100.0,...,131.880005,127.940002,131.960007,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
3,JCP2022_900063,ccsbBroad304_00069,ORF005433.1_TRC304.1,pLX_304,NM_001153.5,ANXA4,307,9606,annexin A4,100.0,...,85.179001,82.646004,85.292999,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
4,JCP2022_900084,ccsbBroad304_00091,ORF014376.1_TRC304.1,pLX_304,NM_001651.4,AQP5,362,9606,aquaporin 5,100.0,...,90.223000,87.663002,90.227997,2021_06_21_Batch7,BR00125170,,,,,OAB09.10.11.12.A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86694,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,44.397999,43.558998,44.814999,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86695,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,46.152000,45.151001,46.422001,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86696,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,33.742001,32.956001,33.784000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A
86697,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,...,40.847000,40.073002,40.930000,2021_08_23_Batch12,BR00126714,,,,,OKA01.02.03.04.A


In [None]:
# check JCP overlap between plates, https://github.com/jump-cellpainting/jump-cellpainting/issues/78#issuecomment-805942281

# Distinct perturbation IDs present on each of the two plate maps.
oka_mask = ann_dframe["Metadata_Plate_Map_Name"] == 'OKA05.06.07.08.A'
oab_mask = ann_dframe["Metadata_Plate_Map_Name"] == 'OAB84.85.86.87.A'
oka_jcps = set(ann_dframe.loc[oka_mask, "Metadata_JCP2022"].unique())
oab_jcps = set(ann_dframe.loc[oab_mask, "Metadata_JCP2022"].unique())

# IDs that occur on both plate maps.
oka_jcps & oab_jcps

{'JCP2022_900001',
 'JCP2022_900475',
 'JCP2022_901127',
 'JCP2022_906380',
 'JCP2022_908011',
 'JCP2022_910867',
 'JCP2022_912974',
 'JCP2022_912981',
 'JCP2022_915128',
 'JCP2022_915129',
 'JCP2022_915130',
 'JCP2022_915131',
 'JCP2022_915132'}

In [None]:
# Anti join on `Metadata_JCP2022`: keep the rows of `wells` whose ID never
# appears in `orf`.
in_orf = wells["Metadata_JCP2022"].isin(orf["Metadata_JCP2022"])
wells.loc[~in_orf]



Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_10,Dest210531-152149,A01,JCP2022_085227
1,source_10,Dest210531-152149,A02,JCP2022_033924
2,source_10,Dest210531-152149,A03,JCP2022_056163
3,source_10,Dest210531-152149,A04,JCP2022_054175
4,source_10,Dest210531-152149,A05,JCP2022_999999
...,...,...,...,...
586613,source_8,A1170544,P20,JCP2022_041390
586614,source_8,A1170544,P21,JCP2022_999999
586615,source_8,A1170544,P22,JCP2022_024363
586616,source_8,A1170544,P23,JCP2022_033924
