# Applying mAP to Plate 4 data at Well level.

In [1]:
import sys
import pathlib
import pandas as pd

import copairs.map as map
from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features

sys.path.append("../")  # noqa
from src.utils import shuffle_features

In [2]:
# setting up paths
plate4_path = pathlib.Path("../data/Plate_4_sc_normalized.parquet")

# output paths
results_path = pathlib.Path("./results").resolve()
results_path.mkdir(exist_ok=True)

In [3]:
# loading in plate 4 normalized profile
plate4_df = pd.read_parquet(plate4_path)

# replacing None with "No Constructs"
plate4_df["Metadata_siRNA"].fillna("No Construct", inplace=True)
plate4_df.dropna(inplace=True)

# display dataframe with
print("shape:", plate4_df.shape)
plate4_df.head()

shape: (7265, 2321)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  plate4_df["Metadata_siRNA"].fillna("No Construct", inplace=True)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
2,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,0.041221,0.061693,-0.361728,-0.342859,-0.276131,-0.345279,-0.090203,-0.170966,-0.174635,-0.16101
3,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.298175,1.228147,-0.425678,-0.44448,-0.437382,-0.415659,-0.511531,-0.51589,-0.477327,-0.491683
4,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,-0.555394,-0.587789,-0.525197,-0.494342,-0.490506,-0.525199,-0.206447,-0.212056,-0.198931,-0.202032
5,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.868931,1.759154,0.273109,0.262805,0.23945,0.27538,-0.15337,-0.139313,0.031441,-0.10375
6,B,2,B2,12,111,NF1,WT,1000,No Construct,0,...,0.152067,0.194806,0.104127,0.10483,0.21834,0.111618,1.100564,0.990683,1.111314,1.07483


In [4]:
# aggregate dataset at the well level
agg_plate4_df = aggregate(plate4_df, strata=["Metadata_Well", "Metadata_siRNA"])

# splitting feature space
meta_features = infer_cp_features(agg_plate4_df, metadata=True)
cp_features = infer_cp_features(agg_plate4_df)

# extract siRNA perturbations
siRNAs = agg_plate4_df["Metadata_siRNA"].unique().tolist()

# display
print("aggregated profile shape", agg_plate4_df.shape)
print("siRNA types", agg_plate4_df["Metadata_siRNA"].unique().tolist())
print(
    f"Number of '{siRNAs[0]}' wells: ",
    agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNAs[0]].shape[0],
)
print(
    f"Number of '{siRNAs[1]}' wells: ",
    agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNAs[1]].shape[0],
)
print(
    f"Number of '{siRNAs[2]}' wells: ",
    agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNAs[2]].shape[0],
)
print(
    f"Number of '{siRNAs[3]}' wells: ",
    agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNAs[3]].shape[0],
)
agg_plate4_df.head()

aggregated profile shape (60, 2302)
siRNA types ['Scramble', 'No Construct', 'NF1 Target 1', 'NF1 Target 2']
Number of 'Scramble' wells:  15
Number of 'No Construct' wells:  15
Number of 'NF1 Target 1' wells:  15
Number of 'NF1 Target 2' wells:  15


Unnamed: 0,Metadata_Well,Metadata_siRNA,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,B10,Scramble,-0.046915,-0.019388,-0.036121,0.195996,-0.126334,0.076976,-0.081874,0.122849,...,-0.052212,-0.035408,-0.132622,-0.117142,-0.095358,-0.109026,-0.057774,-0.030383,-0.046199,-0.049982
1,B11,No Construct,-0.445319,-0.325957,0.208846,-0.014452,0.383183,-0.047739,0.281925,0.026885,...,-0.050258,-0.029858,-0.300654,-0.307184,-0.301677,-0.29319,-0.182431,-0.17818,-0.153776,-0.144413
2,B2,No Construct,-0.334183,-0.383919,-0.093761,0.216527,-0.147564,0.12894,-0.044953,0.239299,...,-0.117567,-0.124287,-0.29348,-0.278806,-0.305568,-0.304227,0.240663,0.242458,0.232652,0.227064
3,B3,Scramble,-0.101119,-0.288846,-0.009103,-0.350656,0.009891,-0.349132,0.100606,-0.427534,...,0.095398,0.148146,-0.286401,-0.284968,-0.294833,-0.304298,0.027029,0.022396,0.007678,0.008945
4,B4,Scramble,-0.215108,-0.189746,0.007108,-0.440481,0.055889,-0.346534,0.027213,-0.377904,...,0.142243,0.107748,-0.162868,-0.15091,-0.160018,-0.150965,0.031385,0.040455,0.033496,0.049061


## Applying mAP analysis with Well Level Profile

Parameter docs:
- **pos_samby**: Dictating comparison within the siRNA Group
- **pos_diffby**: Dictating differences of entries (in this case wells) comparing different wells within the same wells 
- **neg_sameby**: kept blank
- **neg_diffby**: Establishing which groups to compare with each other (control vs treatment)
- **null_size**:
- **batch_size**: Amount of calculations done per thread

In [5]:
# setting parameters for mAP
seed = 0
ref_siRNA = "No Construct"
pos_sameby = ["Metadata_siRNA"]
pos_diffby = ["Metadata_Well"]
neg_sameby = []
neg_diffby = ["Metadata_siRNA"]
null_size = (
    agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == ref_siRNA].shape[0] * 100
)
batch_size = 100

# generate a ref siRNA, this dataframe will be used to be compared across all siRNAs
ref_siRNA_df = agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == ref_siRNA]

### Running mAP with original dataset

In [6]:
# storing all mAP scores
map_results = []

for siRNA in siRNAs:
    # skipping ref to ref comparison
    if siRNA == ref_siRNA:
        continue

    # selecting 1 siRNA treatment
    siRNA_df = agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNA]

    # concat ref with selected siRNA wells
    concat_df = pd.concat([ref_siRNA_df, siRNA_df])

    # execute mAP, comparing the reference siRNA and selected siRNA
    # store into a list
    map_result = map.run_pipeline(
        meta=concat_df[meta_features],
        feats=concat_df[cp_features].values,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffled column
    map_result.insert(0, "shuffled", "Not Shuffled")

    # store to list
    map_results.append(map_result)

# convert mAP results to a dataframe
map_results = pd.concat(map_results)
map_results.to_csv("./results/well_AP_scores.csv", index=False)

map_results

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,shuffled,Metadata_Well,Metadata_siRNA,average_precision,p_value,n_pos_pairs,n_total_pairs
0,Not Shuffled,B11,No Construct,0.489049,0.687542,14,29
1,Not Shuffled,B2,No Construct,0.488716,0.687542,14,29
2,Not Shuffled,B5,No Construct,0.561324,0.361093,14,29
3,Not Shuffled,B8,No Construct,0.470221,0.758827,14,29
4,Not Shuffled,C11,No Construct,0.431633,0.896735,14,29
...,...,...,...,...,...,...,...
25,Not Shuffled,G5,NF1 Target 2,0.816024,0.001999,14,29
26,Not Shuffled,G6,NF1 Target 2,0.815095,0.001999,14,29
27,Not Shuffled,G7,NF1 Target 2,0.811615,0.002665,14,29
28,Not Shuffled,G8,NF1 Target 2,0.816497,0.001999,14,29


In [7]:
# aggregate values based on siRNA
agg_map_results = map.aggregate(map_results, sameby="Metadata_siRNA", threshold=0.05)
agg_map_results.to_csv("./results/well_mAP_scores.csv", index=False)
agg_map_results

Unnamed: 0,Metadata_siRNA,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold
0,NF1 Target 1,0.695844,1.355954,0.058747,1.231015,True,False
1,NF1 Target 2,0.673322,1.580286,0.052571,1.279256,True,False
2,No Construct,0.571846,0.732084,0.185317,0.732084,False,False
3,Scramble,0.733438,1.939302,0.046,1.337242,True,True


### Running mAP with shuffled feature space dataset

In [8]:
# storing all mAP scores
shuffled_map_results = []

for siRNA in siRNAs:
    # skipping ref to ref comparison
    if siRNA == ref_siRNA:
        continue

    # selecting 1 siRNA treatment
    siRNA_df = agg_plate4_df.loc[agg_plate4_df["Metadata_siRNA"] == siRNA]

    # concat ref with selected siRNA wells
    concat_df = pd.concat([ref_siRNA_df, siRNA_df])

    shuffled_concat_vales = shuffle_features(concat_df[cp_features].values, seed=0)

    # execute mAP, comparing the reference siRNA and selected siRNA
    # store into a list
    map_result = map.run_pipeline(
        meta=concat_df[meta_features],
        feats=shuffled_concat_vales,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffled column
    map_result.insert(0, "shuffled", "Features Shuffled")

    # store to list
    shuffled_map_results.append(map_result)

# convert mAP results to a dataframe
shuffled_map_results = pd.concat(shuffled_map_results)
shuffled_map_results.to_csv("./results/shuffled_well_AP_scores.csv", index=False)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
shuffled_agg_map = map.aggregate(
    shuffled_map_results, sameby="Metadata_siRNA", threshold=0.05
)
shuffled_agg_map.to_csv("./results/shuffled_well_mAP_scores.csv")