# Summary of Imaging Analyses Results

In [1]:
# imports
import os
import polars as pl
import pandas as pd
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from functools import reduce
import operator
from adjustText import adjust_text

In [2]:
# Plate-level batch name -> biological-batch name.
# Commented-out entries are kept as toggles; un-comment a pair to include it.
# NOTE(review): presumably each pair of plate batches forms the two biological
# replicates of one biological batch -- confirm against the pipeline config.
BATCH_LIST_DICT = {
    # "2024_01_23_Batch_7": "2024_02_Batch_7-8", 
    # "2024_02_06_Batch_8": "2024_02_Batch_7-8",
    # "2024_12_09_Batch_11": "2024_12_Batch_11-12", 
    # "2024_12_09_Batch_12": "2024_12_Batch_11-12",
    "2025_01_27_Batch_13": "2025_01_Batch_13-14", 
    "2025_01_28_Batch_14": "2025_01_Batch_13-14",
    # "2025_03_17_Batch_15": "2025_03_Batch_15-16", 
    # "2025_03_17_Batch_16": "2025_03_Batch_15-16"
}

# Inverse view of BATCH_LIST_DICT: biological-batch name -> tuple of its
# plate-level batches (in BATCH_LIST_DICT insertion order). Derived rather than
# hand-written so the two mappings cannot drift apart when batches are toggled.
BIO_REP_BATCHES_DICT = {
    bio_batch: tuple(
        batch for batch, parent in BATCH_LIST_DICT.items() if parent == bio_batch
    )
    for bio_batch in dict.fromkeys(BATCH_LIST_DICT.values())
}

# Directory holding the allele-collection metadata tables.
METADATA_INPUT = "../../../../1_allele_collection/3_outputs"
# Feature-set labels; these match the suffixes of the AUROC_* / Altered_* columns.
FEAT_SETS = ["DNA", "Mito", "GFP", "AGP", "Morph"]
# Root directory of the snakemake-pipeline result analyses.
CLASS_RES_OUTDIR = "../../3_outputs/1_snakemake_pipeline/3.smp_results_analyses"
# Directory of imaging inputs for the integrated assay analyses.
INTEGRATIVE_INPUT = "../../../../3_integrated_assay_analyses/1_inputs/imaging"

In [3]:
# Load the tab-separated ClinVar/gnomAD annotation table and append three
# short-named copies of existing columns (orf_id, mut_id, gene_allele).
clinvar_tsv = f"{METADATA_INPUT}/varchamp_clinvar_gnomad.tsv"
clin_var_df = pl.read_csv(
    clinvar_tsv,
    separator="\t",
    infer_schema_length=10000,
).with_columns(
    orf_id=pl.col("orf_id_wt"),
    mut_id=pl.col("mutation_id_old"),
    gene_allele=pl.col("gene_variant"),
)
clin_var_df.head()

symbol,ensembl_gene_id,orf_id_wt,mutation_id_old,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,gene_variant,chr_num,nuc_loc,ref_allele,…,ChromosomeAccession,Chromosome,Start,Stop,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity,SCVsForAggregateGermlineClassification,SCVsForAggregateSomaticClinicalImpact,SCVsForAggregateOncogenicityClassification,clinvar_nt_change,clinvar_aa_change,RefSeq_mRNA,StarStatus,clinvar_clnsig_clean,chr,chr_pos_38,ref_right,alt_right,gnomad_af,orf_id,mut_id,gene_allele
str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,i64,f64,str,…,str,str,i64,i64,str,str,i64,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,i64,i64,str
"""GBA1""","""ENSG00000177628""",2,6,"""CCSBVarC000001""","""ALE0000584""","""NC_000001.11:155240033:C:G""","""160G>C""","""Val54Leu""","""RC4""","""RC4_Mut_GDEh1026""","""H01""","""GDEhDisVCh_40054""","""F12""","""2""","""RC4_Mut_GDDh1026""","""H01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Val54Leu""",1,155240033.0,"""C""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,6,"""GBA1_Val54Leu"""
"""GBA1""","""ENSG00000177628""",2,73,"""CCSBVarC000002""","""ALE00000002""","""NC_000001.11:155238225:G:A""","""670C>T""","""Leu224Phe""","""RC4""","""RC4_Mut_GDEh1026""","""E01""","""GDEhDisVCh_40054""","""C12""","""2""","""RC4_Mut_GDDh1026""","""E01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""1""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Leu224Phe""",1,155238225.0,"""G""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,73,"""GBA1_Leu224Phe"""
"""GBA1""","""ENSG00000177628""",2,113,"""CCSBVarC000003""","""ALE00000003""","""NC_000001.11:155237453:C:T""","""887G>A""","""Arg296Gln""","""RC4""","""RC4_Mut_GDEh1026""","""F01""","""GDEhDisVCh_40054""","""D12""","""2""","""RC4_Mut_GDDh1026""","""F01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""7""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Arg296Gln""",1,155237453.0,"""C""",…,"""NC_000001.11""","""1""",155237453.0,155237453.0,"""1q22""","""criteria provided, multiple su…",15.0,"""-""","""N""","""ClinGen:CA221417,UniProtKB:P04…",3.0,4328.0,155237453.0,"""C""","""T""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000232587|SCV000321701|SCV0…","""-""","""-""","""887G>A ""","""Arg296Gln""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,113,"""GBA1_Arg296Gln"""
"""GBA1""","""ENSG00000177628""",2,231,"""CCSBVarC000004""","""ALE00000004""","""NC_000001.11:155235252:A:G""","""1448T>C""","""Leu483Pro""","""RC4""","""RC4_Mut_GDEh1026""","""G01""","""GDEhDisVCh_40054""","""E12""","""2""","""RC4_Mut_GDDh1026""","""G01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Leu483Pro""",1,155235252.0,"""A""",…,"""NC_000001.11""","""1""",155235252.0,155235252.0,"""1q22""","""criteria provided, multiple su…",36.0,"""-""","""Y""","""UniProtKB:P04062#VAR_003321,OM…",3.0,4288.0,155235252.0,"""A""","""G""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000111209|SCV000491300|SCV0…","""-""","""-""","""1448T>C ""","""Leu483Pro""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,231,"""GBA1_Leu483Pro"""
"""GBA1""","""ENSG00000177628""",2,213510,"""CCSBVarC003869""","""ALE00003869""","""NC_000001.11:155239934:G:A""","""259C>T""","""Arg87Trp""","""CEGS2""","""CegsMutGDEh1035""","""B03""","""GDEhDisVCh_40054""","""B02""","""2""","""CegsMutGDDh1035""","""B03""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Arg87Trp""",1,155239934.0,"""G""",…,"""NC_000001.11""","""1""",155239934.0,155239934.0,"""1q22""","""criteria provided, multiple su…",13.0,"""-""","""N""","""ClinGen:CA253098,UniProtKB:P04…",3.0,4321.0,155239934.0,"""G""","""A""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000697586|SCV001422687|SCV0…","""-""","""-""","""259C>T ""","""Arg87Trp""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,213510,"""GBA1_Arg87Trp"""


In [4]:
# Columns carried forward from the annotation table into the imaging summary.
priority_col = [
    "orf_id",
    "mut_id",
    "symbol",
    "aa_change",
    "gene_allele",
    "gene_variant",
    "ensembl_gene_id",
    "clinvar_clnsig_clean",
    "gnomad_af",
    "StarStatus",
]
# Keep only those columns, drop rows with no mut_id, and cast both id columns
# to 64-bit integers.
clin_var_df = (
    clin_var_df
    .select(priority_col)
    .filter(pl.col("mut_id").is_not_null())
    .with_columns(pl.col("mut_id", "orf_id").cast(pl.Int64))
)

In [5]:
# Result folders for the single biological batch summarised in this notebook.
bio_batch = "2025_01_Batch_13-14"
abundance_change_dir = f"{CLASS_RES_OUTDIR}/2.cell_count_abundance_change/{bio_batch}"
classification_dir = f"{CLASS_RES_OUTDIR}/3.classification_analyses/{bio_batch}"

# Well-level cell-count changes; the paired-t columns get a "cc" tag so they do
# not clash with the identically named protein-abundance columns.
cell_count_summary = pl.read_csv(
    f"{abundance_change_dir}/well-level_cell-count_changes.csv"
).rename(
    {
        "U2OS_paired_t_stat": "U2OS_cc_t_stat",
        "U2OS_paired_t_pval": "U2OS_cc_t_pval",
        "Variant": "gene_allele",
    }
)

# Well-level protein-abundance changes; same renaming scheme with an "abun" tag.
prot_abund_summary = pl.read_csv(
    f"{abundance_change_dir}/well-level_prot-abundance_changes.csv"
).rename(
    {
        "U2OS_paired_t_stat": "U2OS_abun_t_stat",
        "U2OS_paired_t_pval": "U2OS_abun_t_pval",
        "Variant": "gene_allele",
    }
)

# Per-allele classification (AUROC) summary; displayed as the cell output.
auroc_summary_df = pl.read_csv(
    f"{classification_dir}/imaging_analyses_classification_summary.csv"
)
auroc_summary_df

gene_allele,Metadata_Bio_Batch,AUROC_BioRep1_Morph,AUROC_BioRep1_AGP,AUROC_BioRep1_GFP,AUROC_BioRep1_Mito,AUROC_BioRep1_DNA,AUROC_BioRep2_Morph,AUROC_BioRep2_AGP,AUROC_BioRep2_GFP,AUROC_BioRep2_Mito,AUROC_BioRep2_DNA,AUROC_Mean_Morph,AUROC_Mean_AGP,AUROC_Mean_GFP,AUROC_Mean_Mito,AUROC_Mean_DNA,Altered_95th_perc_Morph,Altered_95th_perc_AGP,Altered_95th_perc_GFP,Altered_95th_perc_Mito,Altered_95th_perc_DNA,Altered_99th_perc_Morph,Altered_99th_perc_AGP,Altered_99th_perc_GFP,Altered_99th_perc_Mito,Altered_99th_perc_DNA,Gene
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""SDHD_Gly106Val""","""2025_01_Batch_13-14""",0.778163,0.834339,0.799601,0.858283,0.697296,0.755168,0.807092,0.769505,0.793184,0.672306,0.766665,0.820715,0.784553,0.825733,0.684801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""SDHD"""
"""SOS2_Val869Ile""","""2025_01_Batch_13-14""",0.660541,0.981839,0.996189,0.981104,0.698522,0.799511,0.987445,0.997277,0.993171,0.713993,0.730026,0.984642,0.996733,0.987137,0.706257,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,"""SOS2"""
"""RET_Thr338Ile""","""2025_01_Batch_13-14""",0.917492,0.710512,0.659498,0.59265,0.8981,0.920531,0.902788,0.843026,0.920583,0.829956,0.919012,0.80665,0.751262,0.756617,0.864028,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,"""RET"""
"""CCM2_Arg389Trp""","""2025_01_Batch_13-14""",0.879888,0.891073,0.906692,0.907211,0.793835,0.680867,0.848538,0.898244,0.820994,0.59598,0.780377,0.869806,0.902468,0.864102,0.694907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""CCM2"""
"""SHOC2_Ser4Arg""","""2025_01_Batch_13-14""",0.917976,0.918094,0.87941,0.91422,0.866539,0.675518,0.808099,0.742499,0.738974,0.640452,0.796747,0.863096,0.810955,0.826597,0.753496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""SHOC2"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""CCM2_Glu264Asp""","""2025_01_Batch_13-14""",0.904895,0.90019,0.867517,0.888666,0.73532,0.862908,0.901495,0.79773,0.794693,0.796924,0.883902,0.900843,0.832624,0.841679,0.766122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""CCM2"""
"""CCM2_Lys30Glu""","""2025_01_Batch_13-14""",0.84455,0.870647,0.875369,0.870079,0.817052,0.743932,0.775511,0.816843,0.777453,0.733565,0.794241,0.823079,0.846106,0.823766,0.775309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""CCM2"""
"""BRCA1_Thr37Lys""","""2025_01_Batch_13-14""",0.817409,0.909922,0.875146,0.8792,0.667051,0.936526,0.874551,0.911929,0.954337,0.939591,0.876968,0.892236,0.893538,0.916768,0.803321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""BRCA1"""
"""BRAF_Phe468Ser""","""2025_01_Batch_13-14""",0.587999,0.684204,0.669576,0.693123,0.545225,0.583917,0.695054,0.676646,0.699388,0.55726,0.585958,0.689629,0.673111,0.696256,0.551242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""BRAF"""


In [6]:
# Combine the three imaging read-outs into one table keyed by (Gene, gene_allele).
# Full joins keep an allele that appears in any of the three tables, and
# coalesce=True merges the join-key columns instead of emitting *_right copies.
# Every row is then flagged as image_assayed, and the ClinVar/gnomAD annotation
# columns are attached with a left join so rows without an annotation are kept.
# NOTE(review): if gene_allele is not unique in clin_var_df, the left join will
# duplicate imaging rows -- confirm uniqueness upstream.
allele_keys = ["Gene", "gene_allele"]
imaging_summary = (
    cell_count_summary
    .join(prot_abund_summary, on=allele_keys, how="full", coalesce=True)
    .join(auroc_summary_df, on=allele_keys, how="full", coalesce=True)
    .with_columns(pl.lit(True).alias("image_assayed"))
    .join(clin_var_df, on=["gene_allele"], how="left")
)

# Write to the shared integrative-input directory (same location as before, now
# taken from the config constant) and create it first so a fresh checkout does
# not fail on a missing folder.
os.makedirs(INTEGRATIVE_INPUT, exist_ok=True)
imaging_summary.write_csv(
    f"{INTEGRATIVE_INPUT}/imaging_analyses_summary_clinvar.tsv",
    separator="\t",
)