## Load profiles and save them as a Parquet file

In [1]:
from pathlib import Path

import pandas as pd
from load import load

In [2]:
# Keyword arguments identifying which profiles to load; unpacked into `load(...)`.
ORF_DATA_CONFIG = dict(
    dataset="cpg0016-jump-fixed",
    source="source_4",
    component="profiles",
)

### Load raw profiles

Setting `columns = None` below will load all of the features.

<div class="alert alert-warning">
WARNING: Files are located in S3. The commented-out example below loads only two features for a single plate; loading many features and/or many plates can take several minutes.
</div>

In [3]:
# Load the profiles selected by ORF_DATA_CONFIG (dataset / source / component).
# No batch, plate or column filter is passed here. In the recorded run, the
# step before "Found 277 files" was logged took about four and a half minutes.
profiles = load(**ORF_DATA_CONFIG)

# Example of loading an individual batch / plate with a subset of columns.
# Uncomment to use; `output` presumably also writes the result to that path
# (TODO confirm against `load`).
# profiles = load(
#     dataset="cpg0016-jump-fixed",
#     source="source_4",
#     component="profiles",
#     batch="2021_06_14_Batch6",
#     plate="BR00124787",
#     columns=["Metadata_Source", "Metadata_Plate", "Metadata_Well", "Cells_AreaShape_Eccentricity", "Nuclei_AreaShape_Area"],
#     output="input/raw_profiles_batch6.parquet",
# )

2023-05-04 11:23:10 INFO     Loading profiles from cellpainting-gallery/cpg0016-jump-fixed/source_4/workspace/profiles
2023-05-04 11:27:38 INFO     Found 277 files
2023-05-04 11:27:38 INFO     Load profiles...


In [4]:
# Show the loaded frame; for a frame this large pandas elides the middle rows and columns.
profiles

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256,dataset,source,workspace,profiles,batch,plate
0,source_4,BR00117035,A01,5834.000000,12214.000000,569.679993,552.330017,460.510010,444.850006,515.229980,...,34.516998,35.355000,34.396000,35.499001,2021_04_26_Batch1,BR00117035,,,,
1,source_4,BR00117035,A02,5463.799805,11298.000000,573.630005,521.140015,469.579987,416.619995,521.219971,...,41.456001,42.639999,41.368000,42.675999,2021_04_26_Batch1,BR00117035,,,,
2,source_4,BR00117035,A03,5416.000000,10838.000000,602.969971,526.530029,501.630005,423.839996,551.570007,...,43.127998,44.056000,43.116001,44.318001,2021_04_26_Batch1,BR00117035,,,,
3,source_4,BR00117035,A04,5949.799805,12099.000000,606.369995,542.229980,498.839996,433.579987,552.299988,...,48.040001,49.353001,47.851002,49.601002,2021_04_26_Batch1,BR00117035,,,,
4,source_4,BR00117035,A05,5820.700195,11727.000000,574.479980,538.989990,469.359985,430.269989,521.330017,...,37.455002,38.362000,37.421001,38.554001,2021_04_26_Batch1,BR00117035,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106347,source_4,BR00127149,P20,3994.500000,7444.100098,585.150024,593.719971,499.470001,508.179993,541.820007,...,48.408001,49.757999,48.396000,49.648998,2021_08_30_Batch13,BR00127149,,,,
106348,source_4,BR00127149,P21,3768.500000,7743.100098,574.130005,595.590027,484.209991,510.839996,528.650024,...,43.914001,44.806000,43.602001,45.058998,2021_08_30_Batch13,BR00127149,,,,
106349,source_4,BR00127149,P22,3907.800049,7337.000000,558.799988,587.570007,474.859985,502.049988,516.130005,...,46.410999,47.698002,46.439999,47.862000,2021_08_30_Batch13,BR00127149,,,,
106350,source_4,BR00127149,P23,3965.000000,7350.500000,591.299988,591.190002,506.019989,506.510010,548.289978,...,32.987000,33.810001,33.090000,33.995998,2021_08_30_Batch13,BR00127149,,,,


### Save raw profiles

In [5]:
# Persist the raw profiles as Parquet, dropping the pandas index.
# `to_parquet` does not create missing parent directories, so make sure
# `input/` exists first; otherwise this cell fails on a fresh checkout.
raw_profiles_path = Path("input/raw_profiles.parquet")
raw_profiles_path.parent.mkdir(parents=True, exist_ok=True)
profiles.to_parquet(raw_profiles_path, index=False)