# Load profiles and save them as a parquet file

## Import libraries

In [25]:
import io
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.io as pio

## Helper functions

In [26]:
# Template for the S3 location of one plate's profile file. The placeholders
# are filled with `str.format` from a plate's Metadata_Source, Metadata_Batch
# and Metadata_Plate values.
profile_formatter = "".join(
    (
        "s3://cellpainting-gallery/cpg0016-jump/",
        "{Metadata_Source}/workspace/profiles/",
        "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet",
    )
)

## Load metadata

In [27]:
# Read the four metadata tables (plate, well, compound, ORF) from the local
# gzip-compressed CSV files, in that order.
plates, wells, compound, orf = map(
    pd.read_csv,
    [
        "../../datasets/metadata/plate.csv.gz",
        "../../datasets/metadata/well.csv.gz",
        "../../datasets/metadata/compound.csv.gz",
        "../../datasets/metadata/orf.csv.gz",
    ],
)


## Load plates

In [28]:
# Pick two ORF plates from source_4, restricted to two batches.
# `random_state` is fixed so that Restart & Run All selects the same plates
# every time; without it the sample (and every output below) changes per run.
sample = (
    plates.query('Metadata_Source=="source_4"')
    .query('Metadata_PlateType=="ORF"')
    .query('Metadata_Batch=="2021_07_12_Batch8" | Metadata_Batch=="2021_07_26_Batch9"') # drop this later
    .sample(2, random_state=42)
)

# count the number of plates per batch
sample.groupby("Metadata_Batch")["Metadata_Plate"].count()


Metadata_Batch
2021_07_12_Batch8    1
2021_07_26_Batch9    1
Name: Metadata_Plate, dtype: int64

## Loading profiles

Now let's load the profiles from these plates.

Setting `columns = None` below will load all of the features.

<div class="alert alert-warning">
WARNING: Files are located in S3. This loop loads only two features for each sampled plate; loading many features and/or many plates can take several minutes.
</div>

In [29]:
# Columns to read from each profile file: the three keys used later to join
# with the metadata, plus two features.
columns = [
    "Metadata_Source",
    "Metadata_Plate",
    "Metadata_Well",
    "Cells_AreaShape_Eccentricity",
    "Nuclei_AreaShape_Area",
]

# For every sampled plate, fill the S3 path template from the plate's row,
# read the selected columns (passing the `anon` storage option), and stack
# the per-plate frames into a single DataFrame.
dframes = pd.concat(
    [
        pd.read_parquet(
            profile_formatter.format(**plate_row.to_dict()),
            columns=columns,
            storage_options={"anon": True},
        )
        for _, plate_row in sample.iterrows()
    ]
)

# Number of wells loaded per plate
dframes.groupby("Metadata_Plate")["Metadata_Well"].count()


Metadata_Plate
BR00125619    383
BR00126046    384
Name: Metadata_Well, dtype: int64

## Join features with metadata


In [30]:
# Link each ORF perturbation to the wells it occupies, then attach the loaded
# features by (source, plate, well). Both merges use pandas' default join
# type, so rows without a match on either side are dropped.
well_keys = ["Metadata_Source", "Metadata_Plate", "Metadata_Well"]
metadata = pd.merge(orf, wells, on="Metadata_JCP2022")
ann_dframe = pd.merge(metadata, dframes, on=well_keys)

# Number of annotated wells per plate
ann_dframe.groupby("Metadata_Plate")["Metadata_Well"].count()


Metadata_Plate
BR00125619    367
BR00126046    368
Name: Metadata_Well, dtype: int64

Print sample of rows from `ann_dframe` (only `Metadata_` columns)

In [31]:
# Keep only the columns whose name starts with "Metadata_" and show five
# randomly chosen rows.
metadata_only = ann_dframe.filter(regex="^Metadata_")
metadata_only.sample(5)

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_Source,Metadata_Plate,Metadata_Well
542,JCP2022_913301,ccsbBroad304_14253,ORF018390.1_TRC304.1,pLX_304,NM_015458.4,MTMR9,66036,9606,myotubularin related protein 9,,1646.0,trt,source_4,BR00126046,F10
642,JCP2022_914431,ccsbBroad304_15431,ORF018511.1_TRC304.1,pLX_304,NM_000516.6,GNAS,2778,9606,GNAS complex locus,100.0,1182.0,trt,source_4,BR00125619,I03
135,JCP2022_904530,ccsbBroad304_04838,ORF002582.1_TRC304.1,pLX_304,NM_181456.2,MRPL55,128308,9606,mitochondrial ribosomal protein L55,100.0,384.0,trt,source_4,BR00125619,M10
121,JCP2022_904018,ccsbBroad304_04288,ORF013024.1_TRC304.1,pLX_304,XM_011537211.3,RPS6KL1,83694,9606,ribosomal protein S6 kinase like 1,92.5,1623.0,trt,source_4,BR00125619,F07
494,JCP2022_913027,ccsbBroad304_13967,ORF014742.1_TRC304.1,pLX_304,NM_001128148.3,TFRC,7037,9606,transferrin receptor,0.7,2279.0,trt,source_4,BR00126046,C15


## Save profiles

In [32]:
# Make sure the output directory exists before writing: on a fresh checkout
# `output/` may be missing and the write would fail.
output_path = Path("output/profiles.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
ann_dframe.to_parquet(output_path)