# Select data
Since we are only evaluating a subset of the data, we want to run less data through DeepProfiler (DP).
These are the same steps as in `baseline/02_clean_data`.

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [48]:
# Load the enriched image index that the selection steps below operate on.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, presumably a
# previously saved row index - confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer='enriched_index.csv')

In [49]:
# 0. Set the DMSO rows (dose 0) aside so they survive the filters below.
dmso_df = df[df["Metadata_dose_recode"] == 0]

# 1. + 2. Keep only dose 6 and discard compounds whose MOA is 'unknown'.
is_dose_six = df["Metadata_dose_recode"] == 6
has_known_moa = df["Metadata_moa"] != 'unknown'
treated_df = df[is_dose_six & has_known_moa]

# 3. Drop every MOA that is represented by exactly one compound:
#    count distinct (compound, MOA) pairs per MOA and keep the MOAs whose count is not 1.
compound_moa_pairs = treated_df.drop_duplicates(
    subset=['Metadata_broad_sample', 'Metadata_moa'],
    keep='first',
)
compounds_per_moa = compound_moa_pairs["Metadata_moa"].value_counts()
shared_moas = compounds_per_moa[compounds_per_moa != 1].index
treated_df = treated_df[treated_df["Metadata_moa"].isin(shared_moas)]

# Put the DMSO rows back in front of the filtered treatments.
df = pd.concat([dmso_df, treated_df], ignore_index=True)

In [None]:
# Since some pngs are missing, we need to get rid of the rows that reference them.
# The original cell was a free-text note ("delete row: ...") and not runnable Python;
# this performs the deletion it describes.
missing_image = 'SQ00015232/r06c07f01p01-ch1sk1fk1fl1.png'

# At this point the index still stores '.tiff' paths, so compare paths without
# their extension instead of the literal '.png' name.
missing_stem = os.path.splitext(missing_image)[0]

# The ch1 images are listed in the DNA column of the index.
dna_stems = df['DNA'].str.rsplit('.', n=1).str[0]
df = df[dna_stems != missing_stem].reset_index(drop=True)

In [50]:
# validate: print the number of rows per compound, then per MOA
for column in ("Metadata_broad_sample", "Metadata_moa"):
    print(df[column].value_counts())

DMSO                      29268
BRD-A09349126-001-10-7       45
BRD-A34006693-001-15-4       45
BRD-K24968862-001-01-0       45
BRD-K86797399-001-04-4       45
                          ...  
BRD-K42948882-305-01-1       18
BRD-A37959677-001-04-5       18
BRD-K43578482-001-01-4       18
BRD-K44273375-001-25-0       18
BRD-K42679050-001-02-1       18
Name: Metadata_broad_sample, Length: 1144, dtype: int64
unknown                                                              29268
phosphodiesterase inhibitor                                           1458
adrenergic receptor antagonist                                        1395
cyclooxygenase inhibitor                                              1187
histamine receptor antagonist                                         1098
                                                                     ...  
angiogenesis inhibitor|tumor necrosis factor production inhibitor       81
tricyclic antidepressant                                           

In [51]:
# Preview the first five rows of the selected subset.
df.head(n=5)

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_broad_sample,Metadata_moa,Metadata_mmoles_per_liter,Metadata_dose_recode,RNA,ER,AGP,Mito,DNA,Concentration,Treatment_ID,Compound,pert_iname,Treatment_Replicate,Treatment,Plate_Map_Name,Split
0,SQ00014812,A01,1,DMSO,unknown,0.0,0,SQ00014812/r01c01f01p01-ch3sk1fk1fl1.tiff,SQ00014812/r01c01f01p01-ch2sk1fk1fl1.tiff,SQ00014812/r01c01f01p01-ch4sk1fk1fl1.tiff,SQ00014812/r01c01f01p01-ch5sk1fk1fl1.tiff,SQ00014812/r01c01f01p01-ch1sk1fk1fl1.tiff,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
1,SQ00014812,A01,2,DMSO,unknown,0.0,0,SQ00014812/r01c01f02p01-ch3sk1fk1fl1.tiff,SQ00014812/r01c01f02p01-ch2sk1fk1fl1.tiff,SQ00014812/r01c01f02p01-ch4sk1fk1fl1.tiff,SQ00014812/r01c01f02p01-ch5sk1fk1fl1.tiff,SQ00014812/r01c01f02p01-ch1sk1fk1fl1.tiff,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
2,SQ00014812,A01,3,DMSO,unknown,0.0,0,SQ00014812/r01c01f03p01-ch3sk1fk1fl1.tiff,SQ00014812/r01c01f03p01-ch2sk1fk1fl1.tiff,SQ00014812/r01c01f03p01-ch4sk1fk1fl1.tiff,SQ00014812/r01c01f03p01-ch5sk1fk1fl1.tiff,SQ00014812/r01c01f03p01-ch1sk1fk1fl1.tiff,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
3,SQ00014812,A01,4,DMSO,unknown,0.0,0,SQ00014812/r01c01f04p01-ch3sk1fk1fl1.tiff,SQ00014812/r01c01f04p01-ch2sk1fk1fl1.tiff,SQ00014812/r01c01f04p01-ch4sk1fk1fl1.tiff,SQ00014812/r01c01f04p01-ch5sk1fk1fl1.tiff,SQ00014812/r01c01f04p01-ch1sk1fk1fl1.tiff,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
4,SQ00014812,A01,5,DMSO,unknown,0.0,0,SQ00014812/r01c01f05p01-ch3sk1fk1fl1.tiff,SQ00014812/r01c01f05p01-ch2sk1fk1fl1.tiff,SQ00014812/r01c01f05p01-ch4sk1fk1fl1.tiff,SQ00014812/r01c01f05p01-ch5sk1fk1fl1.tiff,SQ00014812/r01c01f05p01-ch1sk1fk1fl1.tiff,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training


In [26]:
# Write the subset index to disk without the DataFrame's row index.
# NOTE(review): this cell's execution count (26) is higher than that of the
# tiff -> png replacement cell further down (24), so the saved file may have been
# written after that step - confirm the intended order for a top-to-bottom run.
df.to_csv(path_or_buf='sub_index.csv', index=False)

This index file is now ready for DeepProfiler to run. The output can then be compared to the baseline results.

In [22]:
# Read the saved subset index back in for the checks that follow.
df = pd.read_csv(filepath_or_buffer='sub_index.csv')

In [9]:
# Distinct compounds annotated with the 'focal adhesion kinase inhibitor' MOA.
is_fak_inhibitor = df["Metadata_moa"] == 'focal adhesion kinase inhibitor'
df.loc[is_fak_inhibitor, "Metadata_broad_sample"].unique()

array(['BRD-K99545815-001-06-3', 'BRD-K43578482-001-01-4'], dtype=object)

In [7]:

# Map each MOA to the number of distinct compounds that carry it.
dic = {
    moa: len(df.loc[df["Metadata_moa"] == moa, "Metadata_broad_sample"].unique())
    for moa in df["Metadata_moa"].unique()
}

In [8]:
# Number of MOAs that are backed by exactly one compound.
sum(1 for n_compounds in dic.values() if n_compounds == 1)

1

In [24]:
# Point every channel column at the PNG version of its image instead of the TIFF.
# The result is assigned back to the frame rather than calling
# `df.<col>.replace(..., inplace=True)`: an in-place call on a column selected from
# the frame does not update the frame under pandas Copy-on-Write.
channel_cols = ['RNA', 'ER', 'AGP', 'Mito', 'DNA']

# Anchor on the trailing extension so a 'tiff' elsewhere in a path is left alone.
df[channel_cols] = df[channel_cols].replace(r'\.tiff$', '.png', regex=True)

# NOTE(review): `sub_index.csv` is written in an earlier cell; if the file handed to
# DeepProfiler must contain the .png paths, it has to be saved again after this
# step - confirm.

In [25]:
# Preview the first five rows to check the image paths now end in .png.
df.head(n=5)

Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_broad_sample,Metadata_moa,Metadata_mmoles_per_liter,Metadata_dose_recode,RNA,ER,AGP,Mito,DNA,Concentration,Treatment_ID,Compound,pert_iname,Treatment_Replicate,Treatment,Plate_Map_Name,Split
0,SQ00014812,A01,1,DMSO,unknown,0.0,0,SQ00014812/r01c01f01p01-ch3sk1fk1fl1.png,SQ00014812/r01c01f01p01-ch2sk1fk1fl1.png,SQ00014812/r01c01f01p01-ch4sk1fk1fl1.png,SQ00014812/r01c01f01p01-ch5sk1fk1fl1.png,SQ00014812/r01c01f01p01-ch1sk1fk1fl1.png,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
1,SQ00014812,A01,2,DMSO,unknown,0.0,0,SQ00014812/r01c01f02p01-ch3sk1fk1fl1.png,SQ00014812/r01c01f02p01-ch2sk1fk1fl1.png,SQ00014812/r01c01f02p01-ch4sk1fk1fl1.png,SQ00014812/r01c01f02p01-ch5sk1fk1fl1.png,SQ00014812/r01c01f02p01-ch1sk1fk1fl1.png,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
2,SQ00014812,A01,3,DMSO,unknown,0.0,0,SQ00014812/r01c01f03p01-ch3sk1fk1fl1.png,SQ00014812/r01c01f03p01-ch2sk1fk1fl1.png,SQ00014812/r01c01f03p01-ch4sk1fk1fl1.png,SQ00014812/r01c01f03p01-ch5sk1fk1fl1.png,SQ00014812/r01c01f03p01-ch1sk1fk1fl1.png,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
3,SQ00014812,A01,4,DMSO,unknown,0.0,0,SQ00014812/r01c01f04p01-ch3sk1fk1fl1.png,SQ00014812/r01c01f04p01-ch2sk1fk1fl1.png,SQ00014812/r01c01f04p01-ch4sk1fk1fl1.png,SQ00014812/r01c01f04p01-ch5sk1fk1fl1.png,SQ00014812/r01c01f04p01-ch1sk1fk1fl1.png,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
4,SQ00014812,A01,5,DMSO,unknown,0.0,0,SQ00014812/r01c01f05p01-ch3sk1fk1fl1.png,SQ00014812/r01c01f05p01-ch2sk1fk1fl1.png,SQ00014812/r01c01f05p01-ch4sk1fk1fl1.png,SQ00014812/r01c01f05p01-ch5sk1fk1fl1.png,SQ00014812/r01c01f05p01-ch1sk1fk1fl1.png,,0,DMSO,,1,DMSO@NA,C-7161-01-LM6-022,Training
