In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils.readProfiles import *

### Metadata column in each dataset to match perturbations across modalities

Table 1.

| Dataset               | perturbation match column<br/>CP | perturbation match column<br/>GE | Control perturbation  <br/>CP | Control perturbation<br/>GE |
| :-------------------- | :------------------------------- | :------------------------------- | :---------------------------- | :-------------------------- |
| CDRP-BBBC047-Bray     | Metadata_Sample_Dose             | pert_sample_dose                 | DMSO                          | DMSO                        |
| CDRPBIO-BBBC036-Bray  | Metadata_Sample_Dose             | pert_sample_dose                 | DMSO                          | DMSO                        |
| TA-ORF-BBBC037-Rohban | Metadata_broad_sample            | pert_id                          | DMSO                          | DMSO                        |
| LUAD-BBBC041-Caicedo  | x_mutation_status                | allele                           | DMSO                          | DMSO                        |
| LINCS-Pilot1          | Metadata_pert_id_dose            | pert_id_dose                     | DMSO                          | DMSO                        |


In [23]:
# Per-dataset lookup mirroring Table 1 above. Each value has the layout
#   [dataset name, [CP match column, GE match column], [[CP controls], [GE controls]]]
# The control lists are built inside the comprehension so every dataset gets
# its own independent list objects.
ds_info_dict = {
    short_name: [full_name, [cp_match_col, ge_match_col], [["DMSO"], ["DMSO"]]]
    for short_name, full_name, cp_match_col, ge_match_col in (
        ("CDRP", "CDRP-BBBC047-Bray", "Metadata_Sample_Dose", "pert_sample_dose"),
        ("CDRP-bio", "CDRPBIO-BBBC036-Bray", "Metadata_Sample_Dose", "pert_sample_dose"),
        ("TAORF", "TA-ORF-BBBC037-Rohban", "Metadata_broad_sample", "pert_id"),
        ("LUAD", "LUAD-BBBC041-Caicedo", "x_mutation_status", "allele"),
        ("LINCS", "LINCS-Pilot1", "Metadata_pert_id_dose", "pert_id_dose"),
    )
}
# Kept for reference (disabled): render the lookup as a markdown table.
# pd.DataFrame(ds_info_dict.values(), index=ds_info_dict.keys()).to_markdown(index=False)

### In this notebook you can find examples of how to:
- read replicate- or treatment-level profiles
- and match them across modalities



* Functions used in this notebook:

   - Read **treatment** level data
      - read_treatment_level_profiles
      
   - Read and match **treatment** level data
      - read_paired_treatment_level_profiles
      
   - Read **Replicate** level data
      - read_replicate_level_profiles
   
   - Read and match **Replicate** level data
      - read_paired_replicate_level_profiles


### User input parameters

In [17]:
####################### Root directories ###############################################
procProf_dir = "/home/ubuntu/datasetsbucket/Rosetta-GE-CP/"
# procProf_dir='/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/'

############################# Dataset ##################################################
# dataset options: 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'
dataset = "LUAD"

####################### Type of cell painting profile to read ##########################
# CP Profile Type options: 'augmented' , 'normalized', 'normalized_variable_selected'
profileType = "normalized_variable_selected"

############################ Filtering low quality samples option #######################
# filtering to compounds which have high replicates for both GE and CP datasets
# highRepOverlapEnabled=0
# 'highRepUnion','highRepOverlap'
filter_perts = "highRepUnion"


### Read Replicate level profiles

In [4]:
# Load the unpaired replicate-level profiles for both modalities.
#
# `dataset` is deliberately NOT re-assigned here: it comes from the
# "User input parameters" cell, so changing the dataset there is honoured by
# this cell and every cell after it instead of being silently reset.
#
# NOTE(review): presumably 1 selects the per-plate normalized profiles —
# confirm against utils.readProfiles.
per_plate_normalized_flag = 1

# The reader returns two pairs, one for Cell Painting (cp) and one for
# L1000 (l1k); each pair is unpacked into a data object and a feature object.
(cp_data_repLevel, cp_features), (
    l1k_data_repLevel,
    l1k_features,
) = read_replicate_level_profiles(
    procProf_dir, dataset, profileType, per_plate_normalized_flag
)


  l1k_data_repLevel=pd.read_csv(dataDir+'/L1000/replicate_level_l1k.csv.gz')


In [7]:
# luad_genes_to_inspect_ims=['PRKAG2','BNIP3L','NIPSNAP1','MYO10']
# cp_data_repLevel['x_mutation_status']

### Read and pair Replicate level profiles

In [None]:
# Read the replicate-level profiles of both modalities and pair them.
# NOTE(review): nRep is passed as the string "2", not an int — confirm this is
# the type read_paired_replicate_level_profiles expects.
nRep = "2"
mergedProfiles_repLevel, cp_features, l1k_features = read_paired_replicate_level_profiles(
    procProf_dir,
    dataset,
    profileType,
    nRep,
    filter_perts,
    per_plate_normalized_flag,
)


TAORF: Replicate Level Shapes (nSamples x nFeatures): cp:  1920 , 63 ,  l1k:  729 , 978
l1k n of rep:  2.0
cp n of rep:  5.0
CP: from  324  to  218
l1k: from  327  to  78
CP and l1k high rep union:  260


### Read treatment level profiles

In [21]:
# Load the unpaired treatment-level profiles; the reader returns one pair per
# modality (Cell Painting first, then L1000).
(cp_data_treatLevel, cp_features), (l1k_data_treatLevel, l1k_features) = (
    read_treatment_level_profiles(
        procProf_dir,
        dataset,
        profileType,
        filter_perts,
        per_plate_normalized_flag,
    )
)


TAORF: Replicate Level Shapes (nSamples x nFeatures): cp:  1920 , 63 ,  l1k:  729 , 978
l1k n of rep:  2.0
cp n of rep:  5.0
CP: from  324  to  218
l1k: from  327  to  78
CP and l1k high rep union:  260


### Read and pair treatment level profiles

In [22]:
# Read the treatment-level profiles of both modalities and pair them; the
# reader returns the merged profiles plus the CP and L1000 feature objects.
mergedProfiles_treatLevel, cp_features, l1k_features = read_paired_treatment_level_profiles(
    procProf_dir,
    dataset,
    profileType,
    filter_perts,
    per_plate_normalized_flag,
)


TAORF: Replicate Level Shapes (nSamples x nFeatures): cp:  1920 , 63 ,  l1k:  729 , 978
l1k n of rep:  2.0
cp n of rep:  5.0
CP: from  324  to  218
l1k: from  327  to  78
CP and l1k high rep union:  260
Treatment Level Shapes (nSamples x nFeatures+metadata): (224, 65) (148, 980) Merged Profiles Shape: (111, 1044)


In [40]:
# l1k_data_repLevel[ds_info_dict[dataset][1][1]].unique()
# cp_data_repLevel[ds_info_dict[dataset][1][0]].unique()

In [41]:
# per_plate_normalized_flag