## Curate metadata information on platemaps

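This notebook extracts plate, well, and perturbation metadata from the L1000 and Cell Painting level 4 compound replicate datasets and writes one platemap metadata table per assay.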

In [1]:
import pathlib
import pandas as pd

In [2]:
# Step 1: L1000
# Read the L1000 level 4 compound replicate profiles (gzip-compressed CSV).
file = "../L1000/L1000_lvl4_cpd_replicate_datasets/l1000_level4_cpd_replicates.csv.gz"
l1000_df = pd.read_csv(filepath_or_buffer=file)

# Report (rows, columns) and preview the first two profiles
print(l1000_df.shape)
l1000_df.head(n=2)

(27837, 988)


Unnamed: 0,replicate_id,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,205379_at,sig_id,pert_id,pert_idose,det_plate,det_well,dose,Metadata_broad_sample,pert_iname,moa
0,REP.A001_A549_24H_X1_B27:A03,0.3547,-0.494,-0.1721,-0.0339,-0.4355,1.8263,-0.1316,0.0853,-0.466,...,0.1046,REP.A001_A549_24H:A03,DMSO,-666,REP.A001_A549_24H_X1_B27|REP.A001_A549_24H_X2_...,A03,0,DMSO,DMSO,Control vehicle
1,REP.A001_A549_24H_X2_B27:A03,-0.0447,-1.639,-0.5276,-0.5092,-0.5733,0.2445,0.6159,-1.1273,2.825,...,-1.5836,REP.A001_A549_24H:A03,DMSO,-666,REP.A001_A549_24H_X1_B27|REP.A001_A549_24H_X2_...,A03,0,DMSO,DMSO,Control vehicle


In [3]:
# Extract out metadata information necessary for analysis.
# Each replicate_id looks like "<plate>:<well_position>"
# (e.g. "REP.A001_A549_24H_X1_B27:A03"). A vectorized split with expand=True
# avoids building one Series per row and keeps l1000_df's index, so the
# column-wise concat below aligns row for row.
metadata_plate_df = l1000_df.replicate_id.str.split(":", expand=True)
metadata_plate_df.columns = ["plate", "well_position"]

# The plate map name is taken as the first 17 characters of the plate name
# (e.g. "REP.A001_A549_24H" from "REP.A001_A549_24H_X1_B27").
# NOTE(review): this assumes every plate name shares that fixed-width prefix.
metadata_plate_df = metadata_plate_df.assign(
    plate_map=metadata_plate_df.plate.str[:17]
)

# Make sure each plate only has one of the same well (no duplicates)
assert (
    not metadata_plate_df.duplicated(subset=["plate", "well_position"]).any()
), "Duplicate plate/well_position pairs found in L1000 replicate_id"

# Columns kept in the output platemap metadata table
l1000_meta_cols = [
    "plate",
    "well_position",
    "plate_map",
    "replicate_id",
    "dose",
    "Metadata_broad_sample",
    "pert_iname",
    "moa",
]

# Attach the parsed plate/well columns to the profiles, keep only the
# metadata columns, and lowercase the perturbation name and MOA annotations.
l1000_metadata_df = (
    pd.concat([metadata_plate_df, l1000_df], axis="columns")
    .loc[:, l1000_meta_cols]
    .assign(
        pert_iname=lambda df: df.pert_iname.str.lower(),
        moa=lambda df: df.moa.str.lower(),
    )
)

# Output to file (create the output directory first so to_csv cannot fail
# on a fresh checkout; the .gz suffix makes pandas gzip the output)
file = pathlib.Path("data/L1000_platemap_metadata.tsv.gz")
file.parent.mkdir(parents=True, exist_ok=True)
l1000_metadata_df.to_csv(file, sep="\t", index=False)

print(l1000_metadata_df.shape)
l1000_metadata_df.head(2)

(27837, 8)


Unnamed: 0,plate,well_position,plate_map,replicate_id,dose,Metadata_broad_sample,pert_iname,moa
0,REP.A001_A549_24H_X1_B27,A03,REP.A001_A549_24H,REP.A001_A549_24H_X1_B27:A03,0,DMSO,dmso,control vehicle
1,REP.A001_A549_24H_X2_B27,A03,REP.A001_A549_24H,REP.A001_A549_24H_X2_B27:A03,0,DMSO,dmso,control vehicle


In [4]:
# Step 2: Cell Painting
# Read the Cell Painting level 4 compound replicate profiles (gzip-compressed
# CSV); low_memory=False makes pandas parse the file in a single pass.
file = "../cell_painting/cellpainting_lvl4_cpd_replicate_datasets/cp_level4_cpd_replicates.csv.gz"
cp_df = pd.read_csv(filepath_or_buffer=file, low_memory=False)

# Report (rows, columns) and preview the first two profiles
print(cp_df.shape)
cp_df.head(n=2)

(51833, 812)


Unnamed: 0,Metadata_broad_sample,Metadata_pert_id,Metadata_Plate,Metadata_Well,Metadata_broad_id,Metadata_moa,Metadata_dose_recode,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,...,Nuclei_Texture_SumVariance_Mito_5_0,Nuclei_Texture_SumVariance_RNA_20_0,Nuclei_Texture_Variance_DNA_20_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,broad_id,pert_iname,moa,replicate_name
0,DMSO,,SQ00015211,A01,,,0,2.366133,-0.032317,0.618638,...,-0.571713,0.757513,-0.63458,1.440005,-0.044817,0.264545,DMSO,DMSO,Control vehicle,replicate_0
1,DMSO,,SQ00015211,A02,,,0,0.376276,-1.135014,0.179721,...,-0.17745,-1.684778,-1.235524,1.958443,0.165417,-0.985233,DMSO,DMSO,Control vehicle,replicate_1


In [5]:
# Load the barcode -> platemap lookup from the lincs-cell-painting repository.
# The URL is pinned to a specific commit so the input does not change over time.
commit = "e9737c3e4e4443eb03c2c278a145f12efe255756"
cp_platemap_file = f"https://github.com/broadinstitute/lincs-cell-painting/raw/{commit}/metadata/platemaps/2016_04_01_a549_48hr_batch1/barcode_platemap.csv"
cp_meta_df = pd.read_csv(cp_platemap_file)

# Prefix every platemap column with "Metadata_"
cp_meta_df.columns = [f"Metadata_{x}" for x in cp_meta_df.columns]

# Columns kept in the output platemap metadata table
cp_meta_cols = [
    "Metadata_Assay_Plate_Barcode",
    "Metadata_Well",
    "Metadata_Plate_Map_Name",
    "replicate_name",
    "Metadata_dose_recode",
    "Metadata_broad_sample",
    "pert_iname",
    "moa",
]

# Merge: a right join keeps every profile row of cp_df and attaches the
# platemap columns whose barcode matches the profile's Metadata_Plate.
cp_metadata_df = (
    cp_meta_df
    .merge(
        cp_df,
        left_on=["Metadata_Assay_Plate_Barcode"],
        right_on="Metadata_Plate",
        how="right",
    )
    .loc[:, cp_meta_cols]
    .assign(
        pert_iname=lambda df: df.pert_iname.str.lower(),
        moa=lambda df: df.moa.str.lower(),
    )
)

# A barcode listed more than once in the platemap would silently duplicate
# profile rows in the merge; make sure the row count is unchanged.
assert cp_metadata_df.shape[0] == cp_df.shape[0], (
    "Merging the platemap changed the number of Cell Painting profiles"
)

# Output to file (create the output directory first so to_csv cannot fail
# on a fresh checkout; the .gz suffix makes pandas gzip the output)
file = pathlib.Path("data/CellPainting_platemap_metadata.tsv.gz")
file.parent.mkdir(parents=True, exist_ok=True)
cp_metadata_df.to_csv(file, sep="\t", index=False)

print(cp_metadata_df.shape)
cp_metadata_df.head(2)

(51833, 8)


Unnamed: 0,Metadata_Assay_Plate_Barcode,Metadata_Well,Metadata_Plate_Map_Name,replicate_name,Metadata_dose_recode,Metadata_broad_sample,pert_iname,moa
0,SQ00015211,A01,C-7161-01-LM6-027,replicate_0,0,DMSO,dmso,control vehicle
1,SQ00015211,A02,C-7161-01-LM6-027,replicate_1,0,DMSO,dmso,control vehicle
