## Describe profiling input data

In [1]:
import pathlib
import pandas as pd

## How many total compounds?

In [2]:
# Aligned MOA annotation table (sample IDs, compound names and MOA strings)
aligned_file = pathlib.Path("Profiles_level4/aligned_moa_CP_L1000.csv")
aligned_df = pd.read_csv(filepath_or_buffer=aligned_file)

# Report (rows, columns), then preview the leading records
print(aligned_df.shape)
aligned_df.head(n=5)

(1571, 4)


Unnamed: 0,Metadata_broad_sample,broad_id,pert_iname,moa
0,BRD-A00147595-001-01-5,BRD-A00147595,balaglitazone,Insulin sensitizer|PPAR receptor partial agonist
1,BRD-A00218260-001-03-4,BRD-A00218260,flutrimazole,Sterol demethylase inhibitor
2,BRD-A00376169-001-01-6,BRD-A00376169,kbg,Neprilysin inhibitor
3,BRD-A00546892-001-02-6,BRD-A00546892,biperiden,Acetylcholine receptor antagonist
4,BRD-A00938334-001-01-3,BRD-A00938334,drospirenone,Mineralocorticoid receptor antagonist


## How many total MOAs?

In [3]:
# Split each compound's pipe-delimited MOA annotation into one row per MOA,
# discard compounds that have no annotation, and lower-case the labels so the
# same MOA written with different capitalisation is counted only once.
moa_per_compound = (
    aligned_df.moa
    .str.split("|")
    .explode()
    .dropna()
    .str.lower()
)

# Count how many compound rows carry each MOA (most frequent first).
# The index and the count column are named explicitly instead of renaming
# whatever labels reset_index() happens to emit: pandas >= 2.0 changed those
# default labels, which made the previous "index" -> "moa" rename mislabel
# the columns.
moa_list = (
    moa_per_compound
    .value_counts()
    .rename_axis("moa")
    .reset_index(name="compound_count")
)

print(moa_list.moa.nunique())
moa_list.head()

554


Unnamed: 0,moa,compound_count
0,cyclooxygenase inhibitor,42
1,phosphodiesterase inhibitor,41
2,adrenergic receptor antagonist,40
3,serotonin receptor antagonist,38
4,dopamine receptor antagonist,35


## How many perturbations and compounds in common?

In [4]:
# Tab-separated, gzip-compressed table stored in the 6.paper_figures data folder
common_file = pathlib.Path(
    "..",
    "6.paper_figures",
    "data",
    "significant_compounds_by_threshold_both_assays.tsv.gz",
)
common_df = pd.read_csv(filepath_or_buffer=common_file, sep="\t")

# Note this includes dose information
print(common_df.shape)
common_df.head(n=2)

(7962, 10)


Unnamed: 0,compound,dose,median_replicate_score_cellpainting,median_replicate_score_l1000,pass_cellpainting_thresh,pass_l1000_thresh,pass_both,cell_painting_num_reproducible,l1000_num_reproducible,total_reproducible
0,17-hydroxyprogesterone-caproate,0.04 uM,0.060655,0.024738,True,False,False,5,3,8
1,2-iminobiotin,0.04 uM,0.0427,-0.133693,False,False,False,0,2,2


In [5]:
# What about only common compounds?
# Keep the first occurrence of every compound name, dropping repeated rows.
# (Despite the "_df" suffix, the result is a pandas Series.)
common_perts_df = common_df["compound"].drop_duplicates()
common_perts_df.shape

(1327,)

## How many MOAs in common?

In [6]:
# The number of MOAs in common is not recomputed here; the value is copied
# from the Consensus/Data_Type/1.MOA-MEDIAN notebooks.
# NOTE(review): 583 is larger than the 554 distinct MOAs counted earlier in
# this notebook (after splitting on "|" and lower-casing) - presumably the two
# counts define an MOA differently; confirm before quoting either number.
n_common_moas = 583
print(n_common_moas)

583


## How many plates and platemaps?

### L1000

In [7]:
# Tab-separated L1000 column metadata file
l1000_meta_file = pathlib.Path(
    "Profiles_level4/L1000/L1000_figshare_data/col_meta_level_3_REP.A_A549_only_n27837.txt"
)
l1000_meta_df = pd.read_csv(filepath_or_buffer=l1000_meta_file, sep="\t")

# Report (rows, columns), then preview the leading records
print(l1000_meta_df.shape)
l1000_meta_df.head(n=5)

(27837, 45)


Unnamed: 0,distil_id,bead_batch,bead_revision,bead_set,cell_id,count_cv,count_mean,det_mode,det_plate,det_well,...,x_mg_per_ml,x_mmoles_per_liter,x_purity,x_smiles,x_volume_ul,pert_plate,batch,nearest_dose,brew_prefix,group_id
0,REP.A001_A549_24H_X1_B27:A03,b27,r2,"dp52,dp53",A549,39,93,duo,REP.A001_A549_24H_X1_B27,A03,...,-666.0,-666.0,-666.0,-666,13,REP.A001,REP.A,-666.0,REP.A001_A549_24H,REP.A001_A549_24H:A03
1,REP.A001_A549_24H_X1_B27:A04,b27,r2,"dp52,dp53",A549,41,67,duo,REP.A001_A549_24H_X1_B27,A04,...,-666.0,-666.0,-666.0,-666,13,REP.A001,REP.A,-666.0,REP.A001_A549_24H,REP.A001_A549_24H:A04
2,REP.A001_A549_24H_X1_B27:A05,b27,r2,"dp52,dp53",A549,39,109,duo,REP.A001_A549_24H_X1_B27,A05,...,-666.0,-666.0,-666.0,-666,13,REP.A001,REP.A,-666.0,REP.A001_A549_24H,REP.A001_A549_24H:A05
3,REP.A001_A549_24H_X1_B27:A06,b27,r2,"dp52,dp53",A549,38,117,duo,REP.A001_A549_24H_X1_B27,A06,...,-666.0,-666.0,-666.0,-666,13,REP.A001,REP.A,-666.0,REP.A001_A549_24H,REP.A001_A549_24H:A06
4,REP.A001_A549_24H_X1_B27:A07,b27,r2,"dp52,dp53",A549,40,89,duo,REP.A001_A549_24H_X1_B27,A07,...,1.10546,10.0,100.0,NNC(N)=N,13,REP.A001,REP.A,10.0,REP.A001_A549_24H,REP.A001_A549_24H:A07


In [8]:
# L1000 plate maps
# Number of distinct pert_plate values; dropna=False keeps a missing value in
# the tally, exactly as taking the length of .unique() does.
l1000_meta_df["pert_plate"].nunique(dropna=False)

25

### Cell Painting

In [9]:
# Cell Painting sample-to-platemap table, fetched from a fixed commit of the
# lincs-cell-painting repository so the input does not change between runs
cp_platemap_file = "https://github.com/broadinstitute/lincs-cell-painting/raw/94bfaeeab0d107beac262b4307aa6e9b783625fa/metadata/platemaps/broad_sample_info.tsv"
cp_meta_df = pd.read_csv(filepath_or_buffer=cp_platemap_file, sep="\t")

# Report (rows, columns), then preview the leading records
print(cp_meta_df.shape)
cp_meta_df.head(n=5)

(1652, 4)


Unnamed: 0,broad_sample,broad_id,plate_map_name,solvent
0,,,C-7161-01-LM6-018,DMSO
1,BRD-A62035778-004-03-6,BRD-A62035778,C-7161-01-LM6-018,DMSO
2,BRD-K62196610-001-01-6,BRD-K62196610,C-7161-01-LM6-018,DMSO
3,BRD-K62277907-001-01-6,BRD-K62277907,C-7161-01-LM6-018,DMSO
4,BRD-A62071884-001-04-6,BRD-A62071884,C-7161-01-LM6-018,DMSO


In [10]:
# Cell Painting plate maps
# Number of distinct plate_map_name values; dropna=False keeps a missing value
# in the tally, exactly as taking the length of .unique() does.
cp_meta_df["plate_map_name"].nunique(dropna=False)

28

In [11]:
# Example platemap
eg_plate_file = "https://github.com/broadinstitute/lincs-cell-painting/raw/94bfaeeab0d107beac262b4307aa6e9b783625fa/metadata/platemaps/2016_04_01_a549_48hr_batch1/platemap/C-7161-01-LM6-001.txt"

# Read the tab-separated platemap and label wells that have no broad_sample
# entry as "DMSO"
eg_plate_df = (
    pd.read_csv(eg_plate_file, sep="\t")
    .assign(broad_sample=lambda plate: plate["broad_sample"].fillna("DMSO"))
)

print(eg_plate_df.shape)
eg_plate_df.head(n=2)

(384, 6)


Unnamed: 0,plate_map_name,well_position,broad_sample,mg_per_ml,mmoles_per_liter,solvent
0,C-7161-01-LM6-001,A01,DMSO,,,DMSO
1,C-7161-01-LM6-001,A02,DMSO,,,DMSO


In [12]:
# Five most frequent broad_sample labels on the example plate (wells per label)
eg_plate_df["broad_sample"].value_counts().head(n=5)

DMSO                      24
BRD-K60230970-001-10-0    12
BRD-K50691590-001-02-2    12
BRD-K27938825-001-02-4     6
BRD-K28542495-003-13-5     6
Name: broad_sample, dtype: int64

In [13]:
# Count the distinct samples on the example plate after excluding DMSO and the
# two samples that occupy 12 wells each in the counts above.
# NOTE(review): those two BRD IDs are presumably control compounds - confirm.
non_control_query = "broad_sample not in ['DMSO', 'BRD-K50691590-001-02-2', 'BRD-K60230970-001-10-0']"
eg_plate_df.query(non_control_query)["broad_sample"].nunique()

56