# Applying mAP to Plate 4 data at Single Cell level.

In [10]:
import sys
import pathlib
import pandas as pd

import copairs.map as map
from pycytominer.cyto_utils import infer_cp_features

sys.path.append("../")  # noqa
from src.utils import shuffle_features  # noqa

In [11]:
# setting up paths
plate4_path = pathlib.Path("../data/Plate_4_sc_normalized.parquet")

# output paths
results_path = pathlib.Path("./results").resolve()
results_path.mkdir(exist_ok=True)

In [13]:
# loading in plate 4 normalized profile
plate4_df = pd.read_parquet(plate4_path)

# replacing None with "No Constructs"
plate4_df["Metadata_siRNA"].fillna("No Construct", inplace=True)
plate4_df.dropna(inplace=True)

# display dataframe with
print("shape:", plate4_df.shape)
plate4_df.head()

shape: (7265, 2321)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  plate4_df["Metadata_siRNA"].fillna("No Construct", inplace=True)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
2,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,0.041221,0.061693,-0.361728,-0.342859,-0.276131,-0.345279,-0.090203,-0.170966,-0.174635,-0.16101
3,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.298175,1.228147,-0.425678,-0.44448,-0.437382,-0.415659,-0.511531,-0.51589,-0.477327,-0.491683
4,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,-0.555394,-0.587789,-0.525197,-0.494342,-0.490506,-0.525199,-0.206447,-0.212056,-0.198931,-0.202032
5,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.868931,1.759154,0.273109,0.262805,0.23945,0.27538,-0.15337,-0.139313,0.031441,-0.10375
6,B,2,B2,12,111,NF1,WT,1000,No Construct,0,...,0.152067,0.194806,0.104127,0.10483,0.21834,0.111618,1.100564,0.990683,1.111314,1.07483


In [14]:
# splitting feature space
meta_features = infer_cp_features(plate4_df, metadata=True)
cp_features = infer_cp_features(plate4_df)

# extract siRNA perturbations
siRNAs = plate4_df["Metadata_siRNA"].unique().tolist()

# display
print("aggregated profile shape", plate4_df.shape)
print("siRNA types", plate4_df["Metadata_siRNA"].unique().tolist())
print(
    f"Number of '{siRNAs[0]}' wells: ",
    plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNAs[0]].shape[0],
)
print(
    f"Number of '{siRNAs[1]}' wells: ",
    plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNAs[1]].shape[0],
)
print(
    f"Number of '{siRNAs[2]}' wells: ",
    plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNAs[2]].shape[0],
)
print(
    f"Number of '{siRNAs[3]}' wells: ",
    plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNAs[3]].shape[0],
)
plate4_df.head()

aggregated profile shape (7265, 2321)
siRNA types ['No Construct', 'Scramble', 'NF1 Target 1', 'NF1 Target 2']
Number of 'No Construct' wells:  1636
Number of 'Scramble' wells:  1808
Number of 'NF1 Target 1' wells:  1823
Number of 'NF1 Target 2' wells:  1998


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
2,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,0.041221,0.061693,-0.361728,-0.342859,-0.276131,-0.345279,-0.090203,-0.170966,-0.174635,-0.16101
3,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.298175,1.228147,-0.425678,-0.44448,-0.437382,-0.415659,-0.511531,-0.51589,-0.477327,-0.491683
4,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,-0.555394,-0.587789,-0.525197,-0.494342,-0.490506,-0.525199,-0.206447,-0.212056,-0.198931,-0.202032
5,B,2,B2,11,111,NF1,WT,1000,No Construct,0,...,1.868931,1.759154,0.273109,0.262805,0.23945,0.27538,-0.15337,-0.139313,0.031441,-0.10375
6,B,2,B2,12,111,NF1,WT,1000,No Construct,0,...,0.152067,0.194806,0.104127,0.10483,0.21834,0.111618,1.100564,0.990683,1.111314,1.07483


## Applying mAP analysis with Well Level Profile

Parameter docs:
- **pos_samby**: Dictating comparison within the siRNA Group
- **pos_diffby**: Dictating differences of entries (in this case wells) comparing different wells within the same wells 
- **neg_sameby**: kept blank
- **neg_diffby**: Establishing which groups to compare with each other (control vs treatment)
- **null_size**:
- **batch_size**: Amount of calculations done per thread

In [15]:
# setting parameters for mAP
seed = 0
ref_siRNA = "No Construct"
pos_sameby = ["Metadata_siRNA"]
pos_diffby = ["Metadata_Well"]
neg_sameby = []
neg_diffby = ["Metadata_siRNA"]
null_size = plate4_df.loc[plate4_df["Metadata_siRNA"] == ref_siRNA].shape[0] * 100
batch_size = 100

# generate a ref siRNA, this dataframe will be used to be compared across all siRNAs
ref_siRNA_df = plate4_df.loc[plate4_df["Metadata_siRNA"] == ref_siRNA]

### Running mAP with original dataset

In [16]:
# storing all mAP scores
map_results = []

for siRNA in siRNAs:
    # skipping ref to ref comparison
    if siRNA == ref_siRNA:
        continue

    # selecting 1 siRNA treatment
    siRNA_df = plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNA]

    # concat ref with selected siRNA wells
    concat_df = pd.concat([ref_siRNA_df, siRNA_df])

    # execute mAP, comparing the reference siRNA and selected siRNA
    # store into a list
    map_result = map.run_pipeline(
        meta=concat_df[meta_features],
        feats=concat_df[cp_features].values,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffled column
    map_result.insert(0, "shuffled", "Not Shuffled")

    # store to list
    map_results.append(map_result)

# convert mAP results to a dataframe
map_results = pd.concat(map_results)
map_results.to_csv("./results/sc_AP_scores.csv", index=False)

map_results

  0%|          | 0/27658 [00:00<?, ?it/s]

  0%|          | 0/29579 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

: 

In [7]:
# aggregate values based on siRNA
agg_map_results = map.aggregate(map_results, sameby="Metadata_siRNA", threshold=0.05)
agg_map_results.to_csv("./results/sc_mAP_scores.csv", index=False)
agg_map_results

Unnamed: 0,Metadata_siRNA,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold
0,NF1 Target 1,0.695844,1.355954,0.058747,1.231015,True,False
1,NF1 Target 2,0.673322,1.580286,0.052571,1.279256,True,False
2,No Construct,0.571846,0.732084,0.185317,0.732084,False,False
3,Scramble,0.733438,1.939302,0.046,1.337242,True,True


### Running mAP with shuffled feature space dataset

In [8]:
# storing all mAP scores
shuffled_map_results = []

for siRNA in siRNAs:
    # skipping ref to ref comparison
    if siRNA == ref_siRNA:
        continue

    # selecting 1 siRNA treatment
    siRNA_df = plate4_df.loc[plate4_df["Metadata_siRNA"] == siRNA]

    # concat ref with selected siRNA wells
    concat_df = pd.concat([ref_siRNA_df, siRNA_df])

    shuffled_concat_vales = shuffle_features(concat_df[cp_features].values, seed=0)

    # execute mAP, comparing the reference siRNA and selected siRNA
    # store into a list
    map_result = map.run_pipeline(
        meta=concat_df[meta_features],
        feats=shuffled_concat_vales,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffled column
    map_result.insert(0, "shuffled", "Features Shuffled")

    # store to list
    shuffled_map_results.append(map_result)

# convert mAP results to a dataframe
shuffled_map_results = pd.concat(shuffled_map_results)
shuffled_map_results.to_csv("./results/sc_well_AP_scores.csv", index=False)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
shuffled_agg_map = map.aggregate(
    shuffled_map_results, sameby="Metadata_siRNA", threshold=0.05
)
shuffled_agg_map.to_csv("./results/sc_well_mAP_scores.csv")