In [16]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from cytominer_eval import evaluate_metrics

from utils import remove_empty_wells

# Global matplotlib defaults applied to every figure drawn after this cell.
# NOTE(review): assumes the "Roboto" font is installed on this machine — TODO confirm.
plt.rcParams["font.family"] = ["Roboto"]
plt.rcParams["font.size"] = 14

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def get_metacols(df):
    """Return the names of the metadata columns of `df` as a list.

    A column counts as metadata when its name starts with "Metadata_".
    The original column order is preserved.
    """
    return list(filter(lambda name: name.startswith("Metadata_"), df.columns))


def get_featurecols(df):
    """Return the names of the feature columns of `df` as a list.

    A feature column is any column that is not a metadata column, i.e. whose
    name does not start with "Metadata_". The prefix (including the trailing
    underscore) is the same one `get_metacols` tests, so the two helpers split
    the columns into two disjoint sets that together cover every column.
    The original column order is preserved.
    """
    return [c for c in df.columns if not c.startswith("Metadata_")]


def get_metadata(df):
    """Return the sub-frame of `df` holding only its metadata columns.

    The columns are the ones named by `get_metacols(df)`.
    """
    meta_names = get_metacols(df)
    return df[meta_names]


def get_featuredata(df):
    """Return the sub-frame of `df` holding only its feature columns.

    The columns are the ones named by `get_featurecols(df)`.
    """
    feature_names = get_featurecols(df)
    return df[feature_names]

def remove_negcon_empty_wells(df):
    """Return `df` without negative-control rows and without unlabelled rows.

    Rows whose `Metadata_pert_type` equals "negcon" are dropped, then rows
    whose `Metadata_broad_sample` is missing are dropped, and the index is
    renumbered from 0. The input frame is not modified.
    """
    without_negcon = df.query('Metadata_pert_type!="negcon"')
    with_sample = without_negcon.dropna(subset=["Metadata_broad_sample"])
    return with_sample.reset_index(drop=True)


def consensus(profiles_df, group_by_feature):
    """
    Computes the median consensus profiles.
    Parameters:
    -----------
    profiles_df: pandas.DataFrame
        dataframe of profiles
    group_by_feature: str
        Name of the metadata column whose values define the groups
    Returns:
    -------
    pandas.DataFrame holding, for each group, the metadata columns followed by
    the per-group median of every feature column. The metadata values of a
    group are those of the first row in which its key occurs, so metadata
    other than `group_by_feature` describes that single row only.
    """

    # One metadata row per group key; drop_duplicates keeps the first occurrence.
    group_metadata = get_metadata(profiles_df).drop_duplicates(subset=[group_by_feature])

    # Aggregate only the grouping key plus the feature columns.
    columns_to_aggregate = [group_by_feature, *get_featurecols(profiles_df)]
    grouped = profiles_df[columns_to_aggregate].groupby([group_by_feature])
    median_features = grouped.median().reset_index()

    # Re-attach the metadata to the aggregated features via the grouping key.
    return group_metadata.merge(median_features, on=group_by_feature)

In [17]:
# Load the profile table from parquet and report its (rows, columns) shape.
plate_profiles = pd.read_parquet("profiles/full_profiles_cc_adj_mean_corr_preprocessed.parquet")
print(plate_profiles.shape)

(79560, 1006)


In [18]:
# Keep only non-negative-control wells that carry a sample label, then discard
# the wells labelled "BAD CONSTRUCT".
plate_profiles = remove_negcon_empty_wells(plate_profiles)
plate_profiles = plate_profiles.query('Metadata_broad_sample!="BAD CONSTRUCT"').reset_index(drop=True)

# Drop every non-metadata column that contains at least one missing value.
# Columns are visited in frame order, so the printed list is deterministic.
has_nan = plate_profiles.isna().any().to_numpy()
features_to_remove = [col for col in plate_profiles.columns[has_nan] if not col.startswith('Metadata_')]
print(f'Removed nan features: {features_to_remove}')
plate_profiles = plate_profiles.drop(columns=features_to_remove)

# Collapse to one median profile per Metadata_broad_sample.
# NOTE(review): `consensus` keeps the metadata of the first row of each sample,
# so Metadata_Batch afterwards names only that row's batch even when the sample
# was profiled in several batches — confirm this is intended for the per-batch
# metrics computed further down.
plate_profiles = consensus(plate_profiles, group_by_feature="Metadata_broad_sample")
print(plate_profiles.shape)

Removed nan features: []
(79560, 1006)
(79560, 1006)


In [20]:
# List the distinct Metadata_Batch values recorded for each Metadata_broad_sample.
# NOTE(review): the saved output shows several batches for some samples, yet
# `consensus` leaves one row per sample — presumably this output predates the
# aggregation; confirm by re-running the notebook top to bottom.
plate_profiles.groupby('Metadata_broad_sample')['Metadata_Batch'].unique()

Metadata_broad_sample
ccsbBroad304_00001                                  [2021_06_21_Batch7]
ccsbBroad304_00002                                  [2021_06_07_Batch5]
ccsbBroad304_00003              [2021_08_30_Batch13, 2021_07_12_Batch8]
ccsbBroad304_00007                                 [2021_08_09_Batch11]
ccsbBroad304_00008                                 [2021_08_09_Batch11]
                                            ...                        
ccsbBroad304_16172                                 [2021_08_09_Batch11]
ccsbBroad304_99985    [2021_04_26_Batch1, 2021_05_31_Batch2, 2021_05...
ccsbBroad304_99988    [2021_04_26_Batch1, 2021_05_31_Batch2, 2021_05...
ccsbBroad304_99991    [2021_04_26_Batch1, 2021_05_31_Batch2, 2021_05...
ccsbBroad304_99994    [2021_04_26_Batch1, 2021_05_31_Batch2, 2021_05...
Name: Metadata_Batch, Length: 15096, dtype: object

In [5]:
# Pair-matching rules passed to `evaluate_metrics` as `replicate_groups`.
# NOTE(review): presumably positive pairs must share Metadata_Batch and negative
# pairs must differ in it (sameby/diffby convention) — confirm against the
# cytominer_eval / copairs documentation.
batch = {
    "pos_sameby": {"all": ["Metadata_Batch"], "any": []},
    "pos_diffby": {"all": [], "any": []},
    "neg_sameby": {"all": [], "any": []},
    "neg_diffby": {"all": ["Metadata_Batch"], "any": []},
}

# Per-metric settings passed to `evaluate_metrics` as `metrics_config`.
# NOTE(review): `null_size` is presumably the size of the null distribution
# behind the reported p_value, and `groupby_columns` the level at which mean_ap
# is reported — confirm against the library documentation.
batch_metrics_config = {
    "mean_ap": {
        "null_size": 10000,
        "groupby_columns": ["Metadata_Batch"],
    },
}

In [7]:
# Per-batch mean_ap computed on `plate_profiles`.
# Both column selections are anchored on the "Metadata_" prefix, so every column
# lands in exactly one of `features` / `meta_features` (DataFrame.filter applies
# the regex with re.search, hence the explicit "^").
og_same_pert_diff_well_results = evaluate_metrics(
    profiles=plate_profiles.reset_index(drop=True),
    features=plate_profiles.filter(regex="^(?!Metadata_)").columns,
    meta_features=plate_profiles.filter(regex="^Metadata_").columns,
    replicate_groups=batch,
    metrics_config=batch_metrics_config,
    use_copairs=True,
)


Calculating distances.


  0%|          | 0/481 [00:00<?, ?it/s]

  0%|          | 0/5214 [00:00<?, ?it/s]


Calculating metric: mean_ap


  0%|          | 0/12 [00:00<?, ?it/s]

In [9]:
# Write the 'mean_ap' table of the results to CSV, omitting the index column.
og_same_pert_diff_well_results['mean_ap'].to_csv('og_same_pert_diff_well_results.csv', index=False)

{'mean_ap':         Metadata_Batch   mean_ap   p_value  n_pos_pairs  n_total_pairs
 0    2021_04_26_Batch1  0.094604  0.402181       1340.0        15091.0
 1    2021_05_10_Batch3  0.101876  0.093445       1353.0        15091.0
 2    2021_05_17_Batch4  0.096965  0.098670       1356.0        15091.0
 3    2021_05_31_Batch2  0.087796  0.441406       1281.0        15091.0
 4    2021_06_07_Batch5  0.101314  0.321114       1355.0        15091.0
 5    2021_06_14_Batch6  0.072165  0.230521        981.0        15091.0
 6    2021_06_21_Batch7  0.094568  0.374052       1352.0        15091.0
 7    2021_07_12_Batch8  0.071704  0.186970       1016.0        15091.0
 8    2021_07_26_Batch9  0.095589  0.271857       1331.0        15091.0
 9   2021_08_02_Batch10  0.092846  0.236375       1342.0        15091.0
 10  2021_08_09_Batch11  0.093095  0.244925       1350.0        15091.0
 11  2021_08_30_Batch13  0.071610  0.223223       1023.0        15091.0}

In [None]:
# Reload the per-batch mean_ap table from the CSV written by the `to_csv` cell
# above, rather than keeping a hand-transcribed (and rounded) copy of its
# printed values that can silently drift from the real results.
data_dict = {
    'mean_ap': pd.read_csv('og_same_pert_diff_well_results.csv')
}

# Extracting the DataFrame
plate_map = data_dict['mean_ap']

print(plate_map)


In [11]:
# Read the gzipped CSV at outputs/batch-retrieval-mAP-transformed-profiles.csv.gz
# and display the resulting table.
batch_map = pd.read_csv("outputs/batch-retrieval-mAP-transformed-profiles.csv.gz")
batch_map

Unnamed: 0,Metadata_Batch,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold
0,Batch1,0.136287,3.578182,0.000778,3.109179,True,True
1,Batch10,0.114896,3.417108,0.000778,3.109179,True,True
2,Batch11,0.108473,3.135439,0.000778,3.109179,True,True
3,Batch13,0.082174,3.553396,0.000778,3.109179,True,True
4,Batch2,0.123372,3.191934,0.000778,3.109179,True,True
5,Batch3,0.170321,3.768751,0.000778,3.109179,True,True
6,Batch4,0.108745,3.719207,0.000778,3.109179,True,True
7,Batch5,0.117914,3.326284,0.000778,3.109179,True,True
8,Batch6,0.087164,3.109179,0.000778,3.109179,True,True
9,Batch7,0.115452,3.136314,0.000778,3.109179,True,True


In [25]:
# Load a second profile table from parquet and display it.
# NOTE(review): judging by the file name these are presumably Harmony
# batch-corrected profiles — confirm.
batch_profiles = pd.read_parquet("profiles/mean_corr_cc_adj_0016ef5983f418ad16bae5913eff53c3_harmony_corrected_orf.parquet")
batch_profiles

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,Metadata_broad_sample,Metadata_Name,Metadata_Vector,Metadata_Transcript,Metadata_Symbol,Metadata_NCBI_Gene_ID,...,X_harmony_0864,X_harmony_0865,X_harmony_0866,X_harmony_0867,X_harmony_0868,X_harmony_0869,X_harmony_0870,X_harmony_0871,X_harmony_0872,X_harmony_0873
0,source_4,BR00117035,M13,JCP2022_900257,ccsbBroad304_00280,ORF012864.1_TRC304.1,pLX_304,NM_001799.4,CDK7,1022,...,-0.000339,-0.000015,-0.000127,-0.000123,0.000108,0.000019,-0.000003,-0.000032,0.000008,-0.000002
1,source_4,BR00121560,E22,JCP2022_905644,ccsbBroad304_06036,ORF003887.1_TRC304.1,pLX_304,NM_001881.3,CREM,1390,...,-0.000082,-0.000085,-0.000104,-0.000012,-0.000084,-0.000047,0.000005,-0.000020,-0.000004,-0.000014
2,source_4,BR00121560,C24,JCP2022_905629,ccsbBroad304_06021,ORF014251.1_TRC304.1,pLX_304,NM_001300.6,KLF6,1316,...,-0.003821,0.000136,0.000204,0.000143,0.000229,-0.000084,-0.000002,0.000074,0.000018,0.000011
3,source_4,BR00121560,O11,JCP2022_905591,ccsbBroad304_05982,ORF010422.1_TRC304.1,pLX_304,NM_001265.6,CDX2,1045,...,-0.001694,0.000886,-0.000580,-0.001062,0.000156,0.000032,0.000029,-0.000175,0.000032,0.000004
4,source_4,BR00121560,A01,JCP2022_905588,ccsbBroad304_05979,ORF000779.1_TRC304.1,pLX_304,NM_001261.4,CDK9,1025,...,-0.005453,-0.001106,0.001056,-0.000973,-0.000360,0.000852,-0.000222,0.000016,0.000144,-0.000035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79426,source_4,BR00123536,K12,JCP2022_910410,ccsbBroad304_11164,ORF012646.1_TRC304.1,pLX_304,BC014413.1,STK10,6793,...,0.000645,-0.000105,-0.000090,-0.000108,0.000053,0.000022,0.000016,-0.000018,-0.000003,0.000002
79427,source_4,BR00123536,L20,JCP2022_910503,ccsbBroad304_11265,ORF016142.1_TRC304.1,pLX_304,NM_001363813.1,AXIN2,8313,...,0.000782,-0.000115,-0.000085,-0.000263,-0.000017,-0.000085,0.000034,-0.000013,-0.000014,-0.000004
79428,source_4,BR00123536,J12,JCP2022_910528,ccsbBroad304_11291,ORF012481.1_TRC304.1,pLX_304,NM_001005745.1,NUMB,8650,...,-0.000141,-0.000455,-0.000192,0.000051,0.000053,0.000081,0.000045,-0.000033,0.000021,-0.000007
79429,source_4,BR00123536,N15,JCP2022_910564,ccsbBroad304_11328,ORF012255.1_TRC304.1,pLX_304,NM_001034996.3,RPL14,9045,...,0.001127,0.000123,0.000003,-0.000099,-0.000062,-0.000022,-0.000025,-0.000015,-0.000029,-0.000005


In [26]:
# Keep only non-negative-control wells that carry a sample label, then discard
# the wells labelled "BAD CONSTRUCT".
batch_profiles = remove_negcon_empty_wells(batch_profiles)
batch_profiles = batch_profiles.query('Metadata_broad_sample!="BAD CONSTRUCT"').reset_index(drop=True)

# Drop every non-metadata column that contains at least one missing value.
# Columns are visited in frame order, so the printed list is deterministic.
has_nan = batch_profiles.isna().any().to_numpy()
features_to_remove = [col for col in batch_profiles.columns[has_nan] if not col.startswith('Metadata_')]
print(f'Removed nan features: {features_to_remove}')
batch_profiles = batch_profiles.drop(columns=features_to_remove)

# Collapse to one median profile per Metadata_broad_sample.
# NOTE(review): `consensus` keeps the metadata of the first row of each sample,
# so Metadata_Batch afterwards names only that row's batch even when the sample
# was profiled in several batches — confirm this is intended for the per-batch
# metrics computed further down.
batch_profiles = consensus(batch_profiles, group_by_feature="Metadata_broad_sample")
print(batch_profiles.shape)

Removed nan features: []
(15091, 893)


In [27]:
# Per-batch mean_ap computed on `batch_profiles`, using the same pairing rules
# and metric settings as the `plate_profiles` run.
# Both column selections are anchored on the "Metadata_" prefix, so every column
# lands in exactly one of `features` / `meta_features` (DataFrame.filter applies
# the regex with re.search, hence the explicit "^").
corr_batch_results = evaluate_metrics(
    profiles=batch_profiles.reset_index(drop=True),
    features=batch_profiles.filter(regex="^(?!Metadata_)").columns,
    meta_features=batch_profiles.filter(regex="^Metadata_").columns,
    replicate_groups=batch,
    metrics_config=batch_metrics_config,
    use_copairs=True,
)


Calculating distances.


  0%|          | 0/487 [00:00<?, ?it/s]

  0%|          | 0/5207 [00:00<?, ?it/s]


Calculating metric: mean_ap


  0%|          | 0/12 [00:00<?, ?it/s]

In [28]:
# Display the results returned by `evaluate_metrics` for the `batch_profiles` run.
corr_batch_results

{'mean_ap':         Metadata_Batch   mean_ap   p_value  n_pos_pairs  n_total_pairs
 0    2021_04_26_Batch1  0.133213  0.025140       1340.0        15090.0
 1    2021_05_10_Batch3  0.168917  0.021743       1353.0        15090.0
 2    2021_05_17_Batch4  0.123983  0.026963       1356.0        15090.0
 3    2021_05_31_Batch2  0.124148  0.054373       1281.0        15090.0
 4    2021_06_07_Batch5  0.115512  0.057164       1363.0        15090.0
 5    2021_06_14_Batch6  0.082965  0.043037        974.0        15090.0
 6    2021_06_21_Batch7  0.110466  0.061983       1352.0        15090.0
 7    2021_07_12_Batch8  0.109883  0.053144       1358.0        15090.0
 8    2021_07_26_Batch9  0.135782  0.046645       1331.0        15090.0
 9   2021_08_02_Batch10  0.111965  0.049271       1342.0        15090.0
 10  2021_08_09_Batch11  0.109300  0.054419       1350.0        15090.0
 11  2021_08_30_Batch13  0.080739  0.031585        679.0        15090.0}