# Evaluate misclassified samples in their different feature spaces

We compare the clones with the highest proportion of incorrect predictions against those with the highest proportion of high-confidence, accurate predictions.

We compare Wildtype and Resistant clones separately, and then compare the feature spaces together.

In [1]:
import pathlib
import pandas as pd
import numpy as np
from scipy import stats

from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Destination of the per-feature KS test results table
output_ks_test_file = pathlib.Path("results").joinpath("ks_test_misclassified_differences.tsv")

In [3]:
# Define input paths
# Directories are composed with the pathlib "/" operator rather than by
# formatting a Path into a string and re-parsing it, so no path separator is
# hard-coded.
data_dir = pathlib.Path("..", "2.describe-data", "data", "merged")
signature_dir = pathlib.Path("..", "3.resistance-signature")

# Merged profiles before feature selection
profile_file = data_dir / "all_merged_profiles_before_feature_selection.csv.gz"
# Summary table of the bortezomib signature
bz_signature_file = (
    signature_dir / "results" / "signatures" / "signature_summary_bortezomib_signature.tsv.gz"
)
# Singscore accuracy summary
accuracy_summary_file = pathlib.Path("results", "singscore_accuracy_summary.tsv")

In [4]:
# Read the merged profiles; low_memory=False disables chunked parsing, which
# could otherwise infer mixed dtypes within a single column
profile_df = pd.read_csv(
    profile_file,
    low_memory=False,
)

# Report the table dimensions and preview the first three rows
print(profile_df.shape)
profile_df.head(n=3)

(3957, 3544)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_ID,Metadata_plate_filename,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,219905,B02,2021_03_03_Batch14,4269,2.5x10^3 cells/well,1.0,WT_parental,20210219.0,,,...,-1.007074,-1.010473,-1.018616,-1.038237,-1.005599,-1.015286,-1.020732,-1.009446,-1.019474,-1.012719
1,219905,B03,2021_03_03_Batch14,1688,2.5x10^3 cells/well,2.0,CloneA,20210219.0,,,...,-0.986309,-1.000248,-1.02219,-1.045947,-1.016479,-1.038013,-0.999631,-0.999447,-0.987759,-0.977462
2,219905,B04,2021_03_03_Batch14,2238,2.5x10^3 cells/well,3.0,CloneE,20210219.0,,,...,-1.371187,-1.358646,-1.353107,-1.355997,-1.357253,-1.352782,-1.360769,-1.364501,-1.366883,-1.360609


In [5]:
# Read the bortezomib signature summary table (tab-separated)
bz_sig_df = pd.read_csv(bz_signature_file, sep="\t")

# Keep the names of the features flagged in the `final_signature` column
bz_sig_features = bz_sig_df.loc[bz_sig_df["final_signature"], "features"].tolist()

# Report the table dimensions and the number of signature features, then preview
print(bz_sig_df.shape)
print(len(bz_sig_features))
bz_sig_df.head(n=5)

(782, 8)
45


Unnamed: 0,features,non_status_significant_exclude,batch_exclude,cell_count_exclude,non_specific_exclude,treatment_time_exclude,final_signature,dataset
0,Cells_AreaShape_Compactness,False,False,False,True,False,False,bortezomib
1,Cells_AreaShape_Eccentricity,True,False,True,True,False,False,bortezomib
2,Cells_AreaShape_Extent,False,False,False,True,False,False,bortezomib
3,Cells_AreaShape_FormFactor,False,False,True,True,False,False,bortezomib
4,Cells_AreaShape_MeanRadius,True,False,False,True,False,False,bortezomib


In [6]:
# Read the singscore accuracy summary table (tab-separated)
summary_df = pd.read_csv(
    accuracy_summary_file,
    sep="\t",
)

# Report the table dimensions and preview the first rows
print(summary_df.shape)
summary_df.head(n=5)

(46, 10)


Unnamed: 0,Metadata_clone_number,total_samples,completely_incorrect,high_confidence,accurate,incorrect,prop_completely_incorrect,prop_high_confidence,prop_accurate,prop_inaccurate
0,WT clone 15,16,15.0,0,0,16,0.9375,0.0,0.0,1.0
1,BZ006,11,9.0,0,0,11,0.818182,0.0,0.0,1.0
2,WT clone 10,16,12.0,0,0,16,0.75,0.0,0.0,1.0
3,WT clone 01,7,2.0,2,3,4,0.285714,0.285714,0.428571,0.571429
4,BZ007,11,2.0,8,8,3,0.181818,0.727273,0.727273,0.272727


In [7]:
# Select the three clones with the highest proportion of completely incorrect
# predictions (at least 75 percent in the summary loaded above).
# Sort explicitly so the selection does not depend on the row order of the
# summary file on disk.
incorrect_samples = (
    summary_df
    .sort_values(by="prop_completely_incorrect", ascending=False)
    .head(3)
    .Metadata_clone_number
    .tolist()
)

incorrect_samples

['WT clone 15', 'BZ006', 'WT clone 10']

In [8]:
# Rank clones by their proportion of high-confidence predictions (largest
# first) and keep the names of the top six
correct_samples = summary_df.sort_values(
    by="prop_high_confidence", ascending=False
)["Metadata_clone_number"].head(6).tolist()

correct_samples

['WT clone 02', 'WT clone 12', 'WT clone 13', 'WT clone 14', 'BZ003', 'BZ007']

In [9]:
# Hand-curated clone groupings: clone type -> prediction outcome -> clone names
sample_comparison_dict = {
    "wildtype": {
        "correct": [
            "WT clone 02",
            "WT clone 12",
            "WT clone 13",
            "WT clone 14",
        ],
        "incorrect": [
            "WT clone 15",
            "WT clone 10",
        ],
    },
    "resistant": {
        "correct": [
            "BZ003",
            "BZ007",
        ],
        "incorrect": [
            "BZ006",
        ],
    },
}

In [10]:
# Perform a two-sample KS test for each signature feature, comparing the
# correctly vs. incorrectly classified clones within each clone type

# Subset the signature features once per clone group: the selected rows depend
# only on the clone names, not on the feature being tested. Comprehension
# variables stay local, so the notebook-level `correct_samples` and
# `incorrect_samples` lists are not overwritten by this cell.
ks_group_values = {
    clone_type: {
        group_name: profile_df.loc[
            profile_df["Metadata_clone_number"].isin(group_clones), bz_sig_features
        ]
        for group_name, group_clones in clone_groups.items()
    }
    for clone_type, clone_groups in sample_comparison_dict.items()
}

# One record per (feature, clone type) pair, with features as the outer loop
ks_records = []
for sig_feature in bz_sig_features:
    for clone_type, group_values in ks_group_values.items():
        ks_stat, p_value = stats.ks_2samp(
            group_values["correct"][sig_feature].to_numpy(),
            group_values["incorrect"][sig_feature].to_numpy(),
        )
        ks_records.append([sig_feature, clone_type, ks_stat, p_value])

# Save results to file for downstream visualization
all_ks_results = pd.DataFrame(
    ks_records,
    columns=["feature", "clone_type", "ks_stat", "ks_pval"],
)

all_ks_results.to_csv(output_ks_test_file, sep="\t", index=False)

print(all_ks_results.shape)
all_ks_results.head()

(90, 4)


Unnamed: 0,feature,clone_type,ks_stat,ks_pval
0,Cells_AreaShape_Zernike_4_2,wildtype,0.17933,0.00446173
1,Cells_AreaShape_Zernike_4_2,resistant,0.705397,6.321256000000001e-22
2,Cells_Correlation_K_DNA_AGP,wildtype,0.214869,0.0003162436
3,Cells_Correlation_K_DNA_AGP,resistant,0.633651,2.2726640000000003e-17
4,Cells_Correlation_Manders_Mito_ER,wildtype,0.169118,0.00872904
