# Load profiles and save them as a parquet file

## Import libraries

In [1]:
from pathlib import Path

import pandas as pd

## Helper functions

In [2]:
# Template for the S3 location of one plate's profile parquet file.
# Fill it with `str.format` using the `Metadata_Source`, `Metadata_Batch`
# and `Metadata_Plate` fields of a plate record.
profile_formatter = "".join(
    [
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/profiles/",
        "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet",
    ]
)

## Load metadata

In [3]:
# Read the four gzipped metadata tables (plate, well, compound, ORF) from the
# local datasets checkout; the files are read in the order they are listed.
plates, wells, compound, orf = map(
    pd.read_csv,
    (
        "../../datasets/metadata/plate.csv.gz",
        "../../datasets/metadata/well.csv.gz",
        "../../datasets/metadata/compound.csv.gz",
        "../../datasets/metadata/orf.csv.gz",
    ),
)


## Load plates

In [4]:
# Keep only the ORF plates from source_4 that belong to a single batch.
from_source_4 = plates["Metadata_Source"] == "source_4"
is_orf_plate = plates["Metadata_PlateType"] == "ORF"
# TODO: drop the single-batch restriction later
in_batch_8 = plates["Metadata_Batch"] == "2021_07_12_Batch8"
sample = plates[from_source_4 & is_orf_plate & in_batch_8]

# count the number of plates per batch
sample.groupby("Metadata_Batch")["Metadata_Plate"].count()


Metadata_Batch
2021_07_12_Batch8    20
Name: Metadata_Plate, dtype: int64

## Loading profiles

Now let's load the profiles from these plates.

Setting `columns = None` below will load all of the features.

<div class="alert alert-warning">
WARNING: Files are located in S3. This loop loads only two features for each sampled plate; loading many features and/or many plates can take several minutes.
</div>

In [5]:
# Columns to read from each plate's parquet file: the three well-identifying
# metadata columns plus two example features. Set `columns = None` to read
# every column.
columns = [
    "Metadata_Source",
    "Metadata_Plate",
    "Metadata_Well",
    "Cells_AreaShape_Eccentricity",
    "Nuclei_AreaShape_Area",
]

# Build one S3 path per sampled plate by filling the path template with that
# plate's metadata fields.
plate_paths = [
    profile_formatter.format(**plate_row.to_dict())
    for _, plate_row in sample.iterrows()
]

# Read every plate with anonymous S3 access and stack the results into a
# single DataFrame.
dframes = pd.concat(
    [
        pd.read_parquet(plate_path, storage_options={"anon": True}, columns=columns)
        for plate_path in plate_paths
    ]
)

# number of wells loaded per plate
dframes.groupby("Metadata_Plate")["Metadata_Well"].count()


## Join features with metadata


In [None]:
# Attach well positions to each ORF perturbation. This is an inner join on the
# JCP2022 identifier, so wells whose identifier is not in `orf` are dropped.
metadata = pd.merge(orf, wells, how="inner", on="Metadata_JCP2022")

# Annotate the loaded profiles with that metadata. This is an inner join on
# the (source, plate, well) key, so only wells present on both sides are kept.
ann_dframe = pd.merge(
    metadata,
    dframes,
    how="inner",
    on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"],
)

# number of annotated wells per plate
ann_dframe.groupby("Metadata_Plate")["Metadata_Well"].count()


Metadata_Plate
BR00124787    368
BR00124788    368
BR00125619    367
BR00125620    368
BR00125621    368
BR00125622    368
BR00125623    368
BR00125624    368
BR00125625    368
BR00125626    368
BR00125627    368
BR00125628    368
BR00125629    368
BR00125630    368
BR00125631    368
BR00125633    368
BR00125634    368
BR00125635    368
BR00125636    368
Name: Metadata_Well, dtype: int64

Print a sample of rows from `ann_dframe` (only `Metadata_` columns)

In [None]:
# Show five rows, restricted to the `Metadata_` columns. The fixed
# `random_state` makes the displayed rows identical on every
# Restart & Run All instead of changing with each execution.
ann_dframe.filter(regex="^Metadata_").sample(5, random_state=0)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well
4467,JCP2022_909883,ccsbBroad304_10603,ORF010829.1_TRC304.1,pLX_304,XLOC_l2_006955_3,XLOC_l2_006955,XLOC_l2_006955,9606,XLOC_l2_006955,,348.0,trt,source_4,BR00125620,E15
6889,JCP2022_915132,ccsbBroad304_99997,ORFC00005.1_TRC304.1,pLX_304,eGFP.1,eGFP,eGFP,CONTROL,Hahn Lab eGFP,100.0,717.0,poscon,source_4,BR00125623,O19
422,JCP2022_900688,ccsbBroad304_00735,ORF004121.1_TRC304.1,pLX_304,NM_005666.4,CFHR2,3080,9606,complement factor H related 2,100.0,810.0,trt,source_4,BR00125633,B11
6145,JCP2022_914595,ccsbBroad304_15601,ORF000771.1_TRC304.1,pLX_304,NM_001369516.1,STAT3,6774,9606,signal transducer and activator of transcripti...,100.0,2307.0,trt,source_4,BR00125626,M11
6658,JCP2022_915129,ccsbBroad304_99988,ORFC00002.1_TRC304.1,pLX_304,HcRed.1,HcRed,HcRed,CONTROL,,,824.0,negcon,source_4,BR00125628,K11


## Save profiles

In [None]:
# Write the annotated profiles to disk without the DataFrame index.
# The parent directory is created first so the write also works on a fresh
# checkout where `output/` does not exist yet.
output_path = Path("output/raw_profiles.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
ann_dframe.to_parquet(output_path, index=False)

In [None]:
# Anti-join on `Metadata_JCP2022`: keep the rows of `wells` whose identifier
# does not appear in `orf`.
wells.loc[~wells["Metadata_JCP2022"].isin(orf["Metadata_JCP2022"])]



Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_10,Dest210531-152149,A01,JCP2022_085227
1,source_10,Dest210531-152149,A02,JCP2022_033924
2,source_10,Dest210531-152149,A03,JCP2022_056163
3,source_10,Dest210531-152149,A04,JCP2022_054175
4,source_10,Dest210531-152149,A05,JCP2022_999999
...,...,...,...,...
586613,source_8,A1170544,P20,JCP2022_041390
586614,source_8,A1170544,P21,JCP2022_999999
586615,source_8,A1170544,P22,JCP2022_024363
586616,source_8,A1170544,P23,JCP2022_033924
