# Programmatically generating metadata

This notebook generates the metadata related to the images. It is based on the following [notebook](https://github.com/theislab/jump-cpg0016-segmentation/blob/main/notebooks/generate_example_config.ipynb) from the Carpenter-Singh lab at the Broad Institute.

In [70]:
import json
import pandas as pd
import logging
import pathlib as pl
import requests
import os
import string
import re

## Set up template files

In [2]:
# Remote locations of the JUMP metadata tables (gzip-compressed CSV).
plate_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/plate.csv.gz"
well_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/well.csv.gz"
compound_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/compound.csv.gz"
orf_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/orf.csv.gz"

# Local cache directory, created on first run.
meta_dir = pl.Path("../metadata/")
meta_dir.mkdir(parents=True, exist_ok=True)

plate_path = meta_dir.joinpath("plate.csv.gz")
well_path = meta_dir.joinpath("well.csv.gz")
compound_path = meta_dir.joinpath("compound.csv.gz")
orf_path = meta_dir.joinpath("orf.csv.gz")

# table name -> (local cache path, remote URL)
meta_links = {
    "plate": (plate_path, plate_link),
    "well": (well_path, well_link),
    "compound": (compound_path, compound_link),
    "orf": (orf_path, orf_link),
}

# Download each table only when it is not cached yet.
for (path, link) in meta_links.values():
    if not path.is_file():
        file = requests.get(link, timeout=60)
        # Fail loudly on an HTTP error: otherwise the error page would be
        # written to disk and reused forever as if it were the table.
        file.raise_for_status()
        with path.open("wb") as f:
            f.write(file.content)

plate = pd.read_csv(meta_links["plate"][0])
well = pd.read_csv(meta_links["well"][0])
compound = pd.read_csv(meta_links["compound"][0])
orf = pd.read_csv(meta_links["orf"][0])

# S3 URI templates, filled with the Metadata_* fields of a plate record.
profile_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/profiles/"
    "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet"
)

loaddata_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/load_data_csv/"
    "{Metadata_Batch}/{Metadata_Plate}/load_data_with_illum.parquet"
)

# With no explicit key, merge joins on the columns both tables share.
df = plate.merge(right=well)

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
0,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A02,JCP2022_033924
1,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A03,JCP2022_085227
2,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A04,JCP2022_033924


In [5]:
# Show the first three rows of the merged plate/well table.
df.head(3)

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
0,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A02,JCP2022_033924
1,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A03,JCP2022_085227
2,source_1,Batch1_20221004,UL000109,COMPOUND_EMPTY,A04,JCP2022_033924


In [6]:
# Show the first three rows of the well table.
well.head(3)

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022
0,source_1,UL000081,A02,JCP2022_033924
1,source_1,UL000081,A03,JCP2022_085227
2,source_1,UL000081,A04,JCP2022_033924


In [7]:
# Show the first three rows of the compound table.
compound.head(3)

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_SMILES
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,CCc1nccn1-c1cccc(C2CCCN2C(=O)c2ccc(OCCN(C)C)cc...
1,JCP2022_000002,AAAJHRMBUHXWLD-UHFFFAOYSA-N,InChI=1S/C11H13ClN2O/c12-10-4-2-9(3-5-10)8-14-...,O=C1NCCCN1Cc1ccc(Cl)cc1
2,JCP2022_000004,AAANUZMCJQUYNX-UHFFFAOYSA-N,InChI=1S/C13H22N4O2S/c1-2-7-16-13(5-6-15-16)20...,CCCn1nccc1S(=O)(=O)N1CC2CCC1CNC2


### Subsetting the metadata

We restrict the metadata to the plates we actually downloaded.

In [51]:
# Plate identifiers (values of Metadata_Plate) to keep in the subset.
SEL_PLATES = ["EC000005"]

In [52]:
# Keep only the wells of the selected plates; copy so later column
# assignments do not touch the full table.
df_sub = df.loc[df["Metadata_Plate"].isin(SEL_PLATES)].copy()

We create the identifier prefix of the image file names from the well position

In [53]:
import string


def generate_image_name(pos):
    """Turn a well label such as "A01" into the "rXXcYY" file-name prefix.

    The first character is the row letter and the rest is the column
    number. Labels whose remainder is longer than two characters
    (for example "AD39") are not handled and yield None.
    """
    row_letter, col_digits = pos[0], pos[1:]
    if len(col_digits) > 2:
        return None
    col_number = int(col_digits)
    # Rows are numbered from 1: A -> 1, B -> 2, ...
    row_number = 1 + string.ascii_uppercase.index(row_letter)
    return "r{:02}c{:02}".format(row_number, col_number)
    

# Compute the prefix from the subset itself. The previous version applied the
# function to every row of the full table and relied on index alignment to
# discard all but the selected plate's rows.
df_sub["Image_Prefix"] = df_sub["Metadata_Well"].apply(generate_image_name)

We can now map it to the image name that we have downloaded

In [160]:
FOLDER_IMAGE = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/EC000005/images/Images"
# One row per TIFF file found in the image folder.
pimages = [fname for fname in os.listdir(FOLDER_IMAGE) if fname.endswith("tiff")]
df_images = pd.DataFrame({"Filename": pimages})
# The first six characters of the file name are the "rXXcYY" well prefix.
df_images["Image_Prefix"] = df_images["Filename"].str.slice(0, 6)
# Attach the well metadata, then drop the join key and the perturbation id.
df_images = (
    df_images
    .merge(df_sub, on="Image_Prefix", how="left")
    .drop(columns=["Image_Prefix", "Metadata_JCP2022"])
)
df_images["Image_PathName"] = df_images["Filename"].map(
    lambda fname: "file:{}/{}".format(FOLDER_IMAGE, fname)
)
# Move the freshly added path column to the front.
colnames = df_images.columns.tolist()
colnames = colnames[-1:] + colnames[:-1]
df_images = df_images[colnames]
# Fixed-position fields of the file name: channel code and well+field+plane id.
df_images["Metadata_Channel"] = df_images["Filename"].str.slice(13, 16)
df_images["Metadata_WellFrame"] = df_images["Filename"].str.slice(0, 12)


In [161]:
# Display the per-image table (one row per TIFF file).
df_images

Unnamed: 0,Image_PathName,Filename,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_Channel,Metadata_WellFrame
0,file:/mnt/c/Users/alexi/Documents/data/images/...,r01c01f01p01-ch1sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,A01,ch1,r01c01f01p01
1,file:/mnt/c/Users/alexi/Documents/data/images/...,r01c01f01p01-ch2sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,A01,ch2,r01c01f01p01
2,file:/mnt/c/Users/alexi/Documents/data/images/...,r01c01f01p01-ch3sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,A01,ch3,r01c01f01p01
3,file:/mnt/c/Users/alexi/Documents/data/images/...,r01c01f01p01-ch4sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,A01,ch4,r01c01f01p01
4,file:/mnt/c/Users/alexi/Documents/data/images/...,r01c01f01p01-ch5sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,A01,ch5,r01c01f01p01
...,...,...,...,...,...,...,...,...,...
20731,file:/mnt/c/Users/alexi/Documents/data/images/...,r16c24f09p01-ch2sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,P24,ch2,r16c24f09p01
20732,file:/mnt/c/Users/alexi/Documents/data/images/...,r16c24f09p01-ch3sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,P24,ch3,r16c24f09p01
20733,file:/mnt/c/Users/alexi/Documents/data/images/...,r16c24f09p01-ch4sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,P24,ch4,r16c24f09p01
20734,file:/mnt/c/Users/alexi/Documents/data/images/...,r16c24f09p01-ch5sk1fk1fl1.tiff,source_11,Batch1,EC000005,COMPOUND,P24,ch5,r16c24f09p01


Using more interpretable channel labels

In [45]:
# Channel code found in the file names -> human-readable stain label.
CHANNEL_LABELING = {
    "ch1": "DNA",
    "ch2": "ER",
    "ch3": "nucl_cytoRNA",
    "ch4": "actin_golgi_wga",
    "ch5": "mito",
    "ch6": "unknown",
}

In [165]:
# Translate the channel code into its label; codes missing from
# CHANNEL_LABELING become NaN.
df_images["Metadata_ChannelBio"] = df_images["Metadata_Channel"].map(CHANNEL_LABELING)

In [166]:
# Columns that vary per channel; every other column identifies one
# well/field and is used as the pivot index.
REMOVED_COL = ["Metadata_ChannelBio","Metadata_Channel","Filename","Image_PathName"]
acols = list(df_images.columns)
index_col = [col for col in acols if col not in REMOVED_COL]

In [167]:
# Prefix each label so it can serve as an "Image_FileName_<label>" column name.
# NOTE: re-running this cell prepends the prefix a second time.
df_images["Metadata_ChannelBio"] = df_images["Metadata_ChannelBio"].map(
    lambda label: "Image_FileName_" + label
)

We convert the paths to Windows format

In [168]:
import pathlib
# Drop the 12 leading characters ("file:/mnt/c/") and rebuild the remainder
# as a backslash-separated path on the C: drive.
# NOTE: re-running this cell strips another 12 characters.
df_images["Image_PathName"] = (
    df_images["Image_PathName"]
    .str.slice(12)
    .map(lambda rel_path: "C:\\" + str(pathlib.PureWindowsPath(rel_path)))
)


In [169]:
# Long -> wide: one row per well/field, one path column per channel label.
df_images_wide = df_images.pivot(
    index=index_col,
    columns=["Metadata_ChannelBio"],
    values="Image_PathName",
).reset_index()

Subsampling images for testing purposes

In [173]:
# Fixed seed so the same 500 rows are drawn on every run.
df_images_w_subsample = df_images_wide.sample(n=500, random_state=512)

In [174]:
# Write the subsampled wide table to the EC000005 plate folder.
# PATH_META is re-bound by several later cells.
PATH_META = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/EC000005/metadata_wide.csv"
df_images_w_subsample.to_csv(PATH_META,index=False)

We can now save the metadata table

In [145]:
# Write the long-format table (one row per image file).
PATH_META = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/EC000005/metadata_subset.csv"
df_images.to_csv(PATH_META,index=False)

In [155]:
# Write the full wide-format table (one path column per channel).
PATH_META = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/EC000005/metadata_subset_wide.csv"
df_images_wide.to_csv(PATH_META,index=False)

Adapting the table for

### Target crispr ORF datasets
Specifically, we focus on getting the metadata and the samples from https://github.com/jump-cellpainting/JUMP-Target.

In [6]:
# This cell repeats the metadata setup from the top of the notebook.
# Remote locations of the JUMP metadata tables (gzip-compressed CSV).
plate_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/plate.csv.gz"
well_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/well.csv.gz"
compound_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/compound.csv.gz"
orf_link = "https://github.com/jump-cellpainting/datasets/raw/main/metadata/orf.csv.gz"

# Local cache directory, created on first run.
meta_dir = pl.Path("../metadata/")
meta_dir.mkdir(parents=True, exist_ok=True)

plate_path = meta_dir.joinpath("plate.csv.gz")
well_path = meta_dir.joinpath("well.csv.gz")
compound_path = meta_dir.joinpath("compound.csv.gz")
orf_path = meta_dir.joinpath("orf.csv.gz")

# table name -> (local cache path, remote URL)
meta_links = {
    "plate": (plate_path, plate_link),
    "well": (well_path, well_link),
    "compound": (compound_path, compound_link),
    "orf": (orf_path, orf_link),
}

# Download each table only when it is not cached yet.
for (path, link) in meta_links.values():
    if not path.is_file():
        file = requests.get(link, timeout=60)
        # Fail loudly on an HTTP error: otherwise the error page would be
        # written to disk and reused forever as if it were the table.
        file.raise_for_status()
        with path.open("wb") as f:
            f.write(file.content)

plate = pd.read_csv(meta_links["plate"][0])
well = pd.read_csv(meta_links["well"][0])
compound = pd.read_csv(meta_links["compound"][0])
orf = pd.read_csv(meta_links["orf"][0])

# S3 URI templates, filled with the Metadata_* fields of a plate record.
profile_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/profiles/"
    "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet"
)

loaddata_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/load_data_csv/"
    "{Metadata_Batch}/{Metadata_Plate}/load_data_with_illum.parquet"
)

# With no explicit key, merge joins on the columns both tables share.
df = plate.merge(right=well)

AttributeError: 'Series' object has no attribute 'values_count'

In [20]:
# Keep the wells whose plate type contains "TARGET2".
df_targets = df.loc[df["Metadata_PlateType"].str.contains("TARGET2")]

In [47]:
def selecting_jcp_perturbations(df,nperturb=40,max_plates=5,random_state=None):
    """Pick a few plates that cover a random sample of perturbations.

    Parameters
    ----------
    df : DataFrame with "Metadata_JCP2022" and "Metadata_Plate" columns.
    nperturb : number of perturbation ids to draw at random.
    max_plates : maximum number of plates to keep.
    random_state : optional seed forwarded to ``Series.sample`` so the
        selection can be reproduced; None keeps the unseeded behaviour.

    Returns
    -------
    The rows of ``df`` that belong to the retained plates.

    Raises
    ------
    ValueError (from pandas) when ``nperturb`` exceeds the number of
    candidate perturbations.
    """
    labels = df.Metadata_JCP2022.value_counts()
    # The most frequent id is excluded from the draw; the original code
    # named it DMSO, i.e. it is presumably the solvent control.
    labels = labels.iloc[1:]
    sel_labels = labels.sample(nperturb,random_state=random_state).index.tolist()
    # Rank plates by how many wells carry one of the drawn perturbations.
    sel_plates = df.Metadata_Plate[df.Metadata_JCP2022.isin(sel_labels)].value_counts()
    sel_plates = sel_plates.iloc[:max_plates].index.tolist()
    return df[df.Metadata_Plate.isin(sel_plates)]
    

In [101]:
# Draw 200 perturbations and keep at most 4 plates. The draw is random, so
# re-running this cell can select different plates.
selected_targets = selecting_jcp_perturbations(df_targets,nperturb=200,max_plates=4)

In [102]:
# Display the wells of the retained plates.
selected_targets

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
1028928,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693
1028929,source_9,20210901_Run8,GR00003340,TARGET2,A02,JCP2022_033924
1028930,source_9,20210901_Run8,GR00003340,TARGET2,A03,JCP2022_109350
1028931,source_9,20210901_Run8,GR00003340,TARGET2,A04,JCP2022_067426
1028932,source_9,20210901_Run8,GR00003340,TARGET2,A05,JCP2022_050797
...,...,...,...,...,...,...
1070395,source_9,20210918-Run11,GR00004371,TARGET2,Z44,JCP2022_060040
1070396,source_9,20210918-Run11,GR00004371,TARGET2,Z45,JCP2022_019314
1070397,source_9,20210918-Run11,GR00004371,TARGET2,Z46,JCP2022_018899
1070398,source_9,20210918-Run11,GR00004371,TARGET2,Z47,JCP2022_033924


# We extract the path of the selected plates

In [103]:


def s3_cli_str(record):
    """Build the `aws s3 cp` command that downloads the images of one plate.

    `record` is a row providing Metadata_Source, Metadata_Batch and
    Metadata_Plate; the plate is copied into a local folder named after it.
    """
    template = (
        'aws s3 cp --recursive '
        '"s3://cellpainting-gallery/cpg0016-jump/{Metadata_Source}/images/{Metadata_Batch}/images/{Metadata_Plate}/" '
        '"./{Metadata_Plate}"'
    )
    fields = record.to_dict()
    return template.format(**fields)

In [104]:
# One command per well, de-duplicated to one command per plate.
selected_targets.apply(s3_cli_str,axis=1).unique()

array(['aws s3 cp --recursive "s3://cellpainting-gallery/cpg0016-jump/source_9/images/20210901_Run8/images/GR00003340/" "./GR00003340"',
       'aws s3 cp --recursive "s3://cellpainting-gallery/cpg0016-jump/source_9/images/20210914-Run9/images/GR00003300/" "./GR00003300"',
       'aws s3 cp --recursive "s3://cellpainting-gallery/cpg0016-jump/source_9/images/20210915-Run10/images/GR00003310/" "./GR00003310"',
       'aws s3 cp --recursive "s3://cellpainting-gallery/cpg0016-jump/source_9/images/20210918-Run11/images/GR00004371/" "./GR00004371"'],
      dtype=object)

In [8]:
# Persist the random selection: reuse the saved table when it exists,
# otherwise save the selection that was just drawn.
PATH_META = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/metadata_targets.csv"
if os.path.isfile(PATH_META):
    selected_targets = pd.read_csv(PATH_META)
else:
    selected_targets.to_csv(PATH_META, index=False)

We extract all the compounds

In [19]:
# Perturbation ids known to the compound table vs. ids present in the selection.
all_comp_id = set(compound["Metadata_JCP2022"].unique())
sel_comp_id = set(selected_targets["Metadata_JCP2022"].unique())

Checking that all selected perturbations are in the compound dataset

In [28]:
# Selected ids without a compound entry; an empty set means full coverage.
sel_comp_id - all_comp_id

set()

In [36]:
# Annotate every selected well with the structure of its compound.
compound_cols = ["Metadata_JCP2022","Metadata_InChIKey","Metadata_SMILES"]
selected_targets_comp = selected_targets.merge(
    compound[compound_cols],
    on="Metadata_JCP2022",
    how="left",
)

In [38]:
# Write the compound-annotated selection to disk.
PATH_META_COMP = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/metadata_targets_with_comp.csv"
selected_targets_comp.to_csv(PATH_META_COMP,index=False)

Wide form metadata generation

In [108]:
# Number of frames (fields of view) generated per well; set again in the
# path-template cell below.
NFRAME = 4

In [40]:
# Display the selection used for the wide-form table.
selected_targets

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693
1,source_9,20210901_Run8,GR00003340,TARGET2,A02,JCP2022_033924
2,source_9,20210901_Run8,GR00003340,TARGET2,A03,JCP2022_109350
3,source_9,20210901_Run8,GR00003340,TARGET2,A04,JCP2022_067426
4,source_9,20210901_Run8,GR00003340,TARGET2,A05,JCP2022_050797
...,...,...,...,...,...,...
6139,source_9,20210918-Run11,GR00004371,TARGET2,Z44,JCP2022_060040
6140,source_9,20210918-Run11,GR00004371,TARGET2,Z45,JCP2022_019314
6141,source_9,20210918-Run11,GR00004371,TARGET2,Z46,JCP2022_018899
6142,source_9,20210918-Run11,GR00004371,TARGET2,Z47,JCP2022_033924


Formatting the well 

In [49]:
# Same mapping as CHANNEL_LABELING without the "ch6" entry.
CHANNEL_LABELING_SUB = dict(CHANNEL_LABELING)
CHANNEL_LABELING_SUB.pop("ch6")
CHANNEL_LABELING_SUB

{'ch1': 'DNA',
 'ch2': 'ER',
 'ch3': 'nucl_cytoRNA',
 'ch4': 'actin_golgi_wga',
 'ch5': 'mito'}

In [350]:
# ADAPT THESE TEMPLATES TO YOUR OWN FILE LOCATIONS.
# The <Plate>, <WellFmt>, <Frame> and <Channel> placeholders are substituted
# by build_path_image; the first template is a Windows path, the second the
# path under the /mnt/input mount used by the Docker command.
STR_FILE_LOCATION = "C:\\Users\\alexi\\Documents\\data\\images\\cellpainting\\cpg0016-jump\\data\\<Plate>\\Images\\<WellFmt>f0<Frame>p01-<Channel>sk1fk1fl1.tiff"
STR_FILE_LOCATION_DOCKER = "/mnt/input/<Plate>/Images/<WellFmt>f0<Frame>p01-<Channel>sk1fk1fl1.tiff"
# Uppercase letter -> 1-based alphabet position (A=1 ... Z=26).
LETTER_TO_POS = {letter:(il+1) for il,letter in enumerate(string.ascii_uppercase)}
# Number of frames generated per well by generate_file_path.
NFRAME = 4

def letters_to_number(row_letters):
    """Convert row letters to a 1-based row number (A=1, Z=26, AA=27, AD=30).

    Raises KeyError for any character that is not an uppercase ASCII letter.
    """
    cc = 0
    # Letters are digits of a base-26 number with values 1..26.
    for l in row_letters:
        cc = cc*26 + LETTER_TO_POS[l]
    return cc

def well_formatting(well):
    """Convert for example C2 to r03c02 (and AD39 to r30c39).

    Raises ValueError when `well` does not start with uppercase letters
    followed by digits.
    """
    vre = "([A-Z]+)([0-9]+)"
    vmatch = re.match(vre,well)
    if vmatch is None:
        raise ValueError("Unrecognised well label: {!r}".format(well))
    row_idx = letters_to_number(vmatch[1])
    # Cast to int before zero-padding: "{:02}" applied to the string "2"
    # yields "20" (or raises on Python < 3.10) instead of "02".
    col = int(vmatch[2])
    pos_string = "r{:02}c{:02}".format(row_idx,col)
    return pos_string

def fmt_row(x,channel,frame):
    """Collect the placeholder values of one image path.

    `x` supplies Metadata_Plate and Metadata_Well; `channel` and `frame`
    are stringified. Keys match the <...> placeholders of the templates.
    """
    return dict(
        Plate=x["Metadata_Plate"],
        WellFmt=well_formatting(x["Metadata_Well"]),
        Channel=str(channel),
        Frame=str(frame),
    )
def build_path_image(dic_fmt,str_location):
    """Fill a path template: every "<key>" is replaced by dic_fmt[key]."""
    resolved = str_location
    for field, value in dic_fmt.items():
        resolved = resolved.replace("".join(("<", field, ">")), value)
    return resolved

def generate_file_path(row,base_string):
    """Expand one well record into one row per frame with per-channel paths.

    For each frame 1..NFRAME a copy of `row` receives an
    "Image_FileName_<label>" entry for every channel of
    CHANNEL_LABELING_SUB, built from the `base_string` template.
    Returns a DataFrame with NFRAME rows.
    """
    frame_records = []
    for frame_no in range(1, NFRAME + 1):
        record = row.copy()
        for channel_code, channel_name in CHANNEL_LABELING_SUB.items():
            # Placeholders are always derived from the untouched input row.
            fields = fmt_row(row, channel=channel_code, frame=frame_no)
            record["Image_FileName_" + channel_name] = build_path_image(fields, base_string)
        frame_records.append(record.T)
    # Records become columns, then are transposed back into rows.
    return pd.concat(frame_records, axis=1).T
        
            

In [223]:
# Look at one well whose row label has two letters.
selected_targets.iloc[230]

Metadata_Source             source_9
Metadata_Batch         20210901_Run8
Metadata_Plate            GR00003340
Metadata_PlateType           TARGET2
Metadata_Well                   AD39
Metadata_JCP2022      JCP2022_033400
Name: 230, dtype: object

In [224]:
# generate_file_path requires the path template; without it the call raises
# TypeError. The Windows template is used here.
generate_file_path(selected_targets_comp.iloc[230,:],STR_FILE_LOCATION).Image_FileName_DNA.iloc[2]

'C:\\Users\\alexi\\Documents\\data\\images\\cellpainting\\cpg0016-jump\\data\\GR00003340\\Images\\r30c39f03p01-ch1sk1fk1fl1.tiff'

In [307]:
# Batch size used for the Metadata_ProcessingBatch label; the cell that
# writes the Docker script later sets BATCH_SIZE to a different value.
BATCH_SIZE = 500

In [333]:
import functools
# Bind the Windows template, expand every well into its per-frame rows
# (one small DataFrame per well) and stack them into a single table.
pgen = functools.partial(generate_file_path,base_string=STR_FILE_LOCATION)
selected_meta_full = pd.concat(selected_targets_comp.apply(pgen,axis=1).tolist())

Same thing for the Docker paths

In [351]:
# Same expansion with the Docker template (paths under /mnt/input).
pgen_docker = functools.partial(generate_file_path,base_string=STR_FILE_LOCATION_DOCKER)
selected_meta_full_docker = pd.concat(selected_targets_comp.apply(pgen_docker,axis=1).tolist())

We verify that all the files exist to prevent crashes

In [335]:
def convert_path_to_wsl(x):
    """Rewrite a Windows path on C: as the matching /mnt/c path."""
    forward_slashed = x.replace("\\","/")
    return forward_slashed.replace("C:","/mnt/c")

def check_images_path(x):
    """Return True when every path in `x` exists once converted to WSL form."""
    wsl_paths = [convert_path_to_wsl(y) for y in x]
    return all([os.path.exists(candidate) for candidate in wsl_paths])

In [336]:
# Row-wise flag: True when all channel images of that row are on disk.
all_file_exist = selected_meta_full[
    [col for col in selected_meta_full.columns if col.startswith("Image_FileName")]
].apply(check_images_path, axis=1)

In [337]:
# Keep the rows whose images all exist.
f_selected_meta_full = selected_meta_full.loc[all_file_exist].copy()
# Frame tag ("f01", "f02", ...) read back from the DNA image path.
f_selected_meta_full["Metadata_Window"] = (
    f_selected_meta_full["Image_FileName_DNA"]
    .str.extract("r[0-9]{1,2}c[0-9]{1,2}(f[0-9]{2})")
)
# Row key of the form "<plate>_<well>_<frame>".
f_selected_meta_full["Metadata_WellPlateFrame"] = (
    f_selected_meta_full[["Metadata_Plate","Metadata_Well","Metadata_Window"]]
    .apply(lambda parts: "_".join(parts), axis=1)
)

In [352]:
# Apply the same existence filter to the Docker-path table.
f_selected_meta_docker = selected_meta_full_docker.loc[all_file_exist].copy()
# Frame tag ("f01", "f02", ...) read back from the DNA image path.
f_selected_meta_docker["Metadata_Window"] = (
    f_selected_meta_docker["Image_FileName_DNA"]
    .str.extract("r[0-9]{1,2}c[0-9]{1,2}(f[0-9]{2})")
)
# Row key of the form "<plate>_<well>_<frame>".
f_selected_meta_docker["Metadata_WellPlateFrame"] = (
    f_selected_meta_docker[["Metadata_Plate","Metadata_Well","Metadata_Window"]]
    .apply(lambda parts: "_".join(parts), axis=1)
)

In [353]:
# Display the filtered Docker-path table.
f_selected_meta_docker

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022,Metadata_InChIKey,Metadata_SMILES,Image_FileName_DNA,Image_FileName_ER,Image_FileName_nucl_cytoRNA,Image_FileName_actin_golgi_wga,Image_FileName_mito,Metadata_Window,Metadata_WellPlateFrame
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/input/GR00003340/Images/r01c01f01p01-ch1s...,/mnt/input/GR00003340/Images/r01c01f01p01-ch2s...,/mnt/input/GR00003340/Images/r01c01f01p01-ch3s...,/mnt/input/GR00003340/Images/r01c01f01p01-ch4s...,/mnt/input/GR00003340/Images/r01c01f01p01-ch5s...,f01,GR00003340_A01_f01
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/input/GR00003340/Images/r01c01f02p01-ch1s...,/mnt/input/GR00003340/Images/r01c01f02p01-ch2s...,/mnt/input/GR00003340/Images/r01c01f02p01-ch3s...,/mnt/input/GR00003340/Images/r01c01f02p01-ch4s...,/mnt/input/GR00003340/Images/r01c01f02p01-ch5s...,f02,GR00003340_A01_f02
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/input/GR00003340/Images/r01c01f03p01-ch1s...,/mnt/input/GR00003340/Images/r01c01f03p01-ch2s...,/mnt/input/GR00003340/Images/r01c01f03p01-ch3s...,/mnt/input/GR00003340/Images/r01c01f03p01-ch4s...,/mnt/input/GR00003340/Images/r01c01f03p01-ch5s...,f03,GR00003340_A01_f03
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/input/GR00003340/Images/r01c01f04p01-ch1s...,/mnt/input/GR00003340/Images/r01c01f04p01-ch2s...,/mnt/input/GR00003340/Images/r01c01f04p01-ch3s...,/mnt/input/GR00003340/Images/r01c01f04p01-ch4s...,/mnt/input/GR00003340/Images/r01c01f04p01-ch5s...,f04,GR00003340_A01_f04
1,source_9,20210901_Run8,GR00003340,TARGET2,A02,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,C[S+](C)[O-],/mnt/input/GR00003340/Images/r01c02f01p01-ch1s...,/mnt/input/GR00003340/Images/r01c02f01p01-ch2s...,/mnt/input/GR00003340/Images/r01c02f01p01-ch3s...,/mnt/input/GR00003340/Images/r01c02f01p01-ch4s...,/mnt/input/GR00003340/Images/r01c02f01p01-ch5s...,f01,GR00003340_A02_f01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6142,source_9,20210918-Run11,GR00004371,TARGET2,Z47,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,C[S+](C)[O-],/mnt/input/GR00004371/Images/r26c47f04p01-ch1s...,/mnt/input/GR00004371/Images/r26c47f04p01-ch2s...,/mnt/input/GR00004371/Images/r26c47f04p01-ch3s...,/mnt/input/GR00004371/Images/r26c47f04p01-ch4s...,/mnt/input/GR00004371/Images/r26c47f04p01-ch5s...,f04,GR00004371_Z47_f04
6143,source_9,20210918-Run11,GR00004371,TARGET2,Z48,JCP2022_113600,ZIUDADZJCKGWKR-UHFFFAOYSA-N,Cc1cc(-c2ccc3c(c2)CCC3N2CC3(CCN(C(=O)Cc4cn5cc(...,/mnt/input/GR00004371/Images/r26c48f01p01-ch1s...,/mnt/input/GR00004371/Images/r26c48f01p01-ch2s...,/mnt/input/GR00004371/Images/r26c48f01p01-ch3s...,/mnt/input/GR00004371/Images/r26c48f01p01-ch4s...,/mnt/input/GR00004371/Images/r26c48f01p01-ch5s...,f01,GR00004371_Z48_f01
6143,source_9,20210918-Run11,GR00004371,TARGET2,Z48,JCP2022_113600,ZIUDADZJCKGWKR-UHFFFAOYSA-N,Cc1cc(-c2ccc3c(c2)CCC3N2CC3(CCN(C(=O)Cc4cn5cc(...,/mnt/input/GR00004371/Images/r26c48f02p01-ch1s...,/mnt/input/GR00004371/Images/r26c48f02p01-ch2s...,/mnt/input/GR00004371/Images/r26c48f02p01-ch3s...,/mnt/input/GR00004371/Images/r26c48f02p01-ch4s...,/mnt/input/GR00004371/Images/r26c48f02p01-ch5s...,f02,GR00004371_Z48_f02
6143,source_9,20210918-Run11,GR00004371,TARGET2,Z48,JCP2022_113600,ZIUDADZJCKGWKR-UHFFFAOYSA-N,Cc1cc(-c2ccc3c(c2)CCC3N2CC3(CCN(C(=O)Cc4cn5cc(...,/mnt/input/GR00004371/Images/r26c48f03p01-ch1s...,/mnt/input/GR00004371/Images/r26c48f03p01-ch2s...,/mnt/input/GR00004371/Images/r26c48f03p01-ch3s...,/mnt/input/GR00004371/Images/r26c48f03p01-ch4s...,/mnt/input/GR00004371/Images/r26c48f03p01-ch5s...,f03,GR00004371_Z48_f03


We add a batch label for processing in parallel

In [338]:
# Label rows by position, BATCH_SIZE rows per batch, numbered from 1.
# A plain list is assigned positionally. The previous pd.Series was aligned
# on the table's index, which repeats each label once per frame, so batches
# followed the original row label instead of the row position.
f_selected_meta_full["Metadata_ProcessingBatch"] = [
    position//BATCH_SIZE+1 for position in range(f_selected_meta_full.shape[0])
]

We create the same table for WSL

In [342]:
# Independent copy, so path edits on one table do not affect the other.
f_selected_meta_full_wsl = f_selected_meta_full.copy()

We change all the paths to WSL

In [344]:
# Convert the image paths of the WSL copy only, so that f_selected_meta_full
# keeps its Windows paths. The previous version rewrote
# f_selected_meta_full and left the WSL copy untouched.
for ncol in f_selected_meta_full_wsl.columns:
    if ncol.startswith("Image_FileName"):
        # regex=False: "\\" is a literal backslash, not a regular expression.
        f_selected_meta_full_wsl[ncol] = (
            f_selected_meta_full_wsl[ncol]
            .str.replace("\\","/",regex=False)
            .str.replace("C:","/mnt/c",regex=False)
        )

In [345]:
# Display the first five rows of the WSL table.
f_selected_meta_full_wsl.iloc[:5]

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022,Metadata_InChIKey,Metadata_SMILES,Image_FileName_DNA,Image_FileName_ER,Image_FileName_nucl_cytoRNA,Image_FileName_actin_golgi_wga,Image_FileName_mito,Metadata_Window,Metadata_WellPlateFrame,Metadata_ProcessingBatch
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,f01,GR00003340_A01_f01,1
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,f02,GR00003340_A01_f02,1
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,f03,GR00003340_A01_f03,1
0,source_9,20210901_Run8,GR00003340,TARGET2,A01,JCP2022_087693,TZDUHAJSIBHXDL-UHFFFAOYSA-N,CC(OC(=O)NCC1(CC(=O)O)CCCCC1)OC(=O)C(C)C,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,f04,GR00003340_A01_f04,1
1,source_9,20210901_Run8,GR00003340,TARGET2,A02,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,C[S+](C)[O-],/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,/mnt/c/Users/alexi/Documents/data/images/cellp...,f01,GR00003340_A02_f01,1


In [309]:
# Table shapes after and before the file-existence filter.
f_selected_meta_full.shape,selected_meta_full.shape

((19810, 16), (24576, 13))

In [310]:
# Write the filtered wide table; build_cli_batch2 passes this file to
# CellProfiler via --data-file.
PATH_META_COMP_FULL_FILTERED = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/metadata_targets_wide_filtered.csv"
f_selected_meta_full.to_csv(PATH_META_COMP_FULL_FILTERED,index=False)

In [None]:
# Write the WSL table to its own file. The previous version wrote
# f_selected_meta_full to PATH_META_COMP_FULL_FILTERED again, so the WSL
# file was never produced.
PATH_META_COMP_FULL_FILTERED_WSL = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/metadata_targets_wide_filtered_wsl.csv"
f_selected_meta_full_wsl.to_csv(PATH_META_COMP_FULL_FILTERED_WSL,index=False)

In [354]:
# Write the Docker-path table; build_cli_docker refers to this file as
# /mnt/input/metadata_targets_docker.csv inside the container.
PATH_META_COMP_FILTERED_DOCKER = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/metadata_targets_docker.csv"
f_selected_meta_docker.to_csv(PATH_META_COMP_FILTERED_DOCKER,index=False)

Adding frame

In [297]:
# List the columns of the filtered table.
f_selected_meta_full.columns

Index(['Metadata_Source', 'Metadata_Batch', 'Metadata_Plate',
       'Metadata_PlateType', 'Metadata_Well', 'Metadata_JCP2022',
       'Metadata_InChIKey', 'Metadata_SMILES', 'Image_FileName_DNA',
       'Image_FileName_ER', 'Image_FileName_nucl_cytoRNA',
       'Image_FileName_actin_golgi_wga', 'Image_FileName_mito',
       'Metadata_Window', 'Metadata_WellPlateFrame'],
      dtype='object')

### Splitting by batches

In [355]:
# Root folder of the per-batch results and location of the launcher script
# (PATH_CLI is set again by the cell that writes the script).
PATH_RESULTS = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target"
PATH_CLI = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target/run_docker.sh"
# exist_ok=True replaces the separate existence check and makes the cell
# safely re-runnable.
os.makedirs(PATH_RESULTS, exist_ok=True)

In [356]:
import numpy as np

In [369]:
def meta_path(batch):
    """Path of the metadata CSV of `batch` under results_target."""
    batch_id = str(batch)
    return "".join([
        "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target/batch",
        batch_id,
        "/metadata_batch_",
        batch_id,
        ".csv",
    ])

def meta_path_docker(batch):
    """Path of the metadata CSV of `batch` under results_target_docker."""
    batch_id = str(batch)
    batch_dir = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target_docker/batch" + batch_id
    return batch_dir + "/metadata_batch_" + batch_id + ".csv"
    
def result_path(batch):
    """Results folder of `batch` under results_target."""
    return "".join((
        "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target/batch",
        str(batch),
        "/results",
    ))

def result_path_docker(batch):
    """Results folder of `batch` under results_target_docker."""
    batch_dir = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/results_target_docker/batch" + str(batch)
    return batch_dir + "/results"

def build_cli_batch(batch,psep="\\",start="C:"):
    """Build the Windows CellProfiler command that processes one batch.

    The command reads the batch's own metadata CSV and writes to the batch's
    results folder. `start` is the drive prefix of the pipeline file only;
    `psep` is accepted but not used.
    """
    batch_id = str(batch)
    pipeline_part = "cellprofiler.exe -c -r -p \""+start+"\\Users\\alexi\\Documents\\dev\\portfolio\\imaging\\segmentation\\cellprofiler\\segmentation_pipeline_v2.cppipe\" "
    data_file_part = (
        "--data-file \"C:\\Users\\alexi\\Documents\\data\\images\\cellpainting\\cpg0016-jump\\data\\"
        + "results_target\\batch" + batch_id + "\\metadata_batch_" + batch_id
        + ".csv\" -o  \"C:\\Users\\alexi\\Documents\\data\\"
    )
    output_part = "images\\cellpainting\\cpg0016-jump\\data\\results_target\\batch" + batch_id + "\\results\""
    return pipeline_part + data_file_part + output_part

def build_cli_batch2(batch,idx1,idx2):
    """Build the Windows CellProfiler command for one slice of the shared CSV.

    All batches read metadata_targets_wide_filtered.csv; `idx1` and `idx2`
    are passed as -f and -l, and the output goes to the batch's results folder.
    """
    pipeline_part = "cellprofiler.exe -c -r -p \"C:\\Users\\alexi\\Documents\\dev\\portfolio\\imaging\\segmentation\\cellprofiler\\segmentation_pipeline_v2.cppipe\" "
    data_file_part = "--data-file \"C:\\Users\\alexi\\Documents\\data\\images\\cellpainting\\cpg0016-jump\\data\\metadata_targets_wide_filtered.csv\" "
    range_part = "-f " + str(idx1) + " -l " + str(idx2) + " -o  \"C:\\Users\\alexi\\Documents\\data\\"
    output_part = "images\\cellpainting\\cpg0016-jump\\data\\results_target\\batch" + str(batch) + "\\results\""
    return pipeline_part + data_file_part + range_part + output_part

# Name of the Docker image that provides CellProfiler.
DOCKERNAME = "cellpro"
def build_cli_docker(batch,idx1,idx2):
    """Build the `docker run` command for one slice of the Docker CSV.

    The data folder is mounted at /mnt/input; `idx1` and `idx2` are passed
    as -f and -l, and results go to results_target_docker/batch<batch>.
    """
    docker_part = "docker run --rm -v /mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data:/mnt/input " + DOCKERNAME
    pipeline_part = " cellprofiler -c -r -p \"/mnt/input/segmentation_pipeline_v2.cppipe\" "
    data_file_part = "--data-file \"/mnt/input/metadata_targets_docker.csv\" "
    range_part = "-f " + str(idx1) + " -l " + str(idx2) + " -o \""
    output_part = "/mnt/input/results_target_docker/batch" + str(batch) + "\""
    return docker_part + pipeline_part + data_file_part + range_part + output_part

In [364]:
# Example: command for batch 5 with -f 1 and -l 100.
build_cli_docker(5,idx1=1,idx2=100)

'docker run -it --rm -v /mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data:/mnt/input cellpro cellprofiler -c -r -p "/mnt/input/segmentation_pipeline_v2.cppipe" --data-file "/mnt/input/metadata_targets_docker.csv" -f 1 -l 100 -o "/mnt/input/results_target_docker/batch5"'

In [373]:
PATH_CLI = "/mnt/c/Users/alexi/Documents/data/images/cellpainting/cpg0016-jump/data/run_docker.sh"
BATCH_SIZE=100
nsample = f_selected_meta_full.shape[0]
# Ceiling division: no trailing empty batch when nsample is an exact
# multiple of BATCH_SIZE.
total_batch = (nsample+BATCH_SIZE-1)//BATCH_SIZE
# Mode "w" truncates a script left by a previous run; the file is opened once.
with open(PATH_CLI,"w") as f:
    for ibatch in range(1,(total_batch+1)):
        # 1-based inclusive bounds, as expected by CellProfiler's -f / -l.
        min_idx = (ibatch-1)*BATCH_SIZE+1
        max_idx = min(ibatch*BATCH_SIZE,nsample)
        # iloc is 0-based with an exclusive stop, hence min_idx-1. The
        # previous slice iloc[min_idx:max_idx] lost the first row of each batch.
        batch_tab = f_selected_meta_full.iloc[min_idx-1:max_idx]
        pmeta = meta_path_docker(ibatch)
        presult = result_path_docker(ibatch)
        # Creates the batch folder as well, which must exist before to_csv.
        os.makedirs(presult,exist_ok=True)
        # NOTE(review): this per-batch CSV is taken from f_selected_meta_full,
        # not from the Docker-path table, and the generated command reads the
        # shared Docker CSV instead - confirm which table is intended here.
        batch_tab.to_csv(pmeta,index=False)
        vcli = build_cli_docker(ibatch,idx1=min_idx,idx2=max_idx)
        # Launch in the background and wait after every five jobs.
        f.write(vcli+" &\n")
        if ibatch%5==0:
            f.write("wait\n")
    # Also wait for a last, incomplete group of background jobs.
    if total_batch%5!=0:
        f.write("wait\n")
        


In [330]:
# Number of rows and columns of the filtered table.
f_selected_meta_full.shape

(19810, 16)

In [None]:
# Display the DNA-channel image paths of the filtered table.
f_selected_meta_full.Image_FileName_DNA