In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils.readProfiles import *

In [4]:
# ls

### Metadata column in each dataset to match perturbations across modalities

Table 1.

| Dataset                  |  perturbation match column<br/>CP  | perturbation match column<br/>GE   | Control perturbation value <br/>CP/GE|
|:----------------------|:-----------------|:-----------------------------|:--------------|
| CDRP-BBBC047-Bray     |  Metadata_Sample_Dose | pert_sample_dose | negcon |
| CDRPBIO-BBBC036-Bray  | Metadata_Sample_Dose | pert_sample_dose | negcon |
| TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id        | negcon |
| LUAD-BBBC041-Caicedo  |  x_mutation_status | allele             | negcon|
| LINCS-Pilot1          | Metadata_pert_id_dose | pert_id_dose   | negcon |


In [4]:
# Lookup table keyed by the short dataset name used in this notebook.
# Each value has the layout [full dataset name, [CP match column, GE match column]],
# mirroring the "perturbation match column" entries of Table 1.
ds_info_dict = {
    short_name: [full_name, [cp_match_col, ge_match_col]]
    for short_name, full_name, cp_match_col, ge_match_col in (
        ('CDRP', 'CDRP-BBBC047-Bray', 'Metadata_Sample_Dose', 'pert_sample_dose'),
        ('CDRP-bio', 'CDRPBIO-BBBC036-Bray', 'Metadata_Sample_Dose', 'pert_sample_dose'),
        ('TAORF', 'TA-ORF-BBBC037-Rohban', 'Metadata_broad_sample', 'pert_id'),
        ('LUAD', 'LUAD-BBBC041-Caicedo', 'x_mutation_status', 'allele'),
        ('LINCS', 'LINCS-Pilot1', 'Metadata_pert_id_dose', 'pert_id_dose'),
    )
}
# Optional: render the mapping as a markdown table.
# pd.DataFrame(ds_info_dict.values(), index=ds_info_dict.keys()).to_markdown(index=False)

### In this notebook you can find examples of how to:
- read replicate- or treatment-level profiles
- and match them across modalities



* Functions used in this notebook:

   - Read **treatment** level data
      - read_treatment_level_profiles
      
   - Read and match **treatment** level data
      - read_paired_treatment_level_profiles
      
   - Read **Replicate** level data
      - read_replicate_level_profiles
   
   - Read and match **Replicate** level data
      - read_paired_replicate_level_profiles


### User input parameters

In [3]:
# ---------------------------------------------------------------------------
# Root directory passed as the first argument to every read_* call below.
# ---------------------------------------------------------------------------
# Alternative location kept for reference:
# procProf_dir='/home/ubuntu/datasetsbucket/Rosetta-GE-CP/'
procProf_dir = '/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/'

# ---------------------------------------------------------------------------
# Dataset to load.
# Options: 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'
# ---------------------------------------------------------------------------
dataset = 'LINCS'

# ---------------------------------------------------------------------------
# Type of Cell Painting (CP) profile to read.
# Options: 'augmented', 'normalized', 'normalized_variable_selected'
# ---------------------------------------------------------------------------
profileType = 'normalized_variable_selected'

# ---------------------------------------------------------------------------
# Filtering of low-quality samples.
# Per the original note: filter to compounds which have high replicates for
# both GE and CP datasets.
# Options for filter_perts: 'highRepUnion', 'highRepOverlap'
# Commented-out earlier switch: highRepOverlapEnabled=0
# ---------------------------------------------------------------------------
repCorrFilePath = './results/RepCor/RepCorrDF.xlsx'  # presumably the replicate-correlation spreadsheet; verify
filter_perts = 'highRepUnion'

# Bundled as [filter_perts, repCorrFilePath]; handed to the paired/treatment-level readers.
filter_repCorr_params = [filter_perts, repCorrFilePath]

### Read Replicate level profiles

In [10]:
# Load the replicate-level profiles of both modalities without pairing them.
# The call returns two pairs, unpacked here as (data, features) for CP and for L1000;
# the *_features values are presumably the feature-column names of each table (verify
# against utils.readProfiles).
dataset = 'LINCS'
per_plate_normalized_flag = 0  # this cell reads with the flag set to 0
(cp_data_repLevel, cp_features), (l1k_data_repLevel, l1k_features) = read_replicate_level_profiles(
    procProf_dir, dataset, profileType, per_plate_normalized_flag
)


  read_replicate_level_profiles(procProf_dir,dataset,profileType,per_plate_normalized_flag)


In [11]:
# Show the CP replicate-level columns whose name contains 'Metadata'.
metadata_mask = cp_data_repLevel.columns.str.contains('Metadata')
cp_data_repLevel.columns[metadata_mask]

Index(['Metadata_plate_map_name', 'Metadata_broad_sample',
       'Metadata_mg_per_ml', 'Metadata_mmoles_per_liter', 'Metadata_solvent',
       'Metadata_pert_id', 'Metadata_pert_mfc_id', 'Metadata_pert_well',
       'Metadata_pert_id_vendor', 'Metadata_cell_id',
       'Metadata_broad_sample_type', 'Metadata_pert_vehicle',
       'Metadata_pert_type', 'Metadata_broad_id', 'Metadata_InChIKey14',
       'Metadata_moa', 'Metadata_target', 'Metadata_broad_date',
       'Metadata_alternative_moa', 'Metadata_alternative_target',
       'Metadata_Plate', 'Metadata_Well', 'Metadata_Assay_Plate_Barcode',
       'Metadata_Plate_Map_Name', 'Metadata_Batch_Number',
       'Metadata_Batch_Date', 'Metadata_dose_recode', 'Metadata_pert_id_dose'],
      dtype='object')

### Read and pair Replicate level profiles

In [6]:
# Read the replicate-level profiles and pair them across CP and L1000.
# nRep is forwarded to the pairing routine; its exact meaning is defined in
# utils.readProfiles (TODO confirm).
nRep = 2
# The flag is switched to 1 here; the treatment-level cells below reuse this value.
per_plate_normalized_flag = 1
mergedProfiles_repLevel, cp_features, l1k_features = read_paired_replicate_level_profiles(
    procProf_dir,
    dataset,
    profileType,
    nRep,
    filter_repCorr_params,
    per_plate_normalized_flag,
)

  [cp_data_repLevel,cp_features], [l1k_data_repLevel,l1k_features] = read_replicate_level_profiles(dataset_rootDir,dataset,profileType,per_plate_normalized_flag);


LUAD: Replicate Level Shapes (nSamples x nFeatures): cp:  6144 , 291 ,  l1k:  4232 , 978
l1k n of rep:  8.0
cp n of rep:  8.0
CP: from  593  to  364
l1k: from  529  to  275
CP and l1k high rep union:  442


### Read treatment level profiles

In [7]:
# Load the treatment-level profiles of both modalities (not yet paired).
# The result is unpacked as one (data, features) pair for CP and one for L1000.
(cp_data_treatLevel, cp_features), (l1k_data_treatLevel, l1k_features) = read_treatment_level_profiles(
    procProf_dir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
)

  [cp_data_repLevel,cp_features], [l1k_data_repLevel,l1k_features] = read_replicate_level_profiles(dataset_rootDir,dataset,profileType,per_plate_normalized_flag);


LUAD: Replicate Level Shapes (nSamples x nFeatures): cp:  6144 , 291 ,  l1k:  4232 , 978
l1k n of rep:  8.0
cp n of rep:  8.0
CP: from  593  to  364
l1k: from  529  to  275
CP and l1k high rep union:  442


### Read and pair treatment level profiles

In [8]:
# Read the treatment-level profiles and pair them across CP and L1000.
# Returns the merged table together with the CP and L1000 feature collections.
mergedProfiles_treatLevel, cp_features, l1k_features = read_paired_treatment_level_profiles(
    procProf_dir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
)

  [cp_data_repLevel,cp_features], [l1k_data_repLevel,l1k_features] = read_replicate_level_profiles(dataset_rootDir,dataset,profileType,per_plate_normalized_flag);


LUAD: Replicate Level Shapes (nSamples x nFeatures): cp:  6144 , 291 ,  l1k:  4232 , 978
l1k n of rep:  8.0
cp n of rep:  8.0
CP: from  593  to  364
l1k: from  529  to  275
CP and l1k high rep union:  442
Treatment Level Shapes (nSamples x nFeatures+metadata): (440, 292) (411, 979) Merged Profiles Shape: (408, 1270)


In [40]:
# l1k_data_repLevel[ds_info_dict[dataset][1][1]].unique()
# cp_data_repLevel[ds_info_dict[dataset][1][0]].unique()

In [41]:
# per_plate_normalized_flag