In [8]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook
import black
import jupyter_black

jupyter_black.load(
    lab=False,
    line_length=90,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.preprocessing as sp
import pickle
import matplotlib.pyplot as plt
import scipy
import os
import time
from datetime import date
# import dask.dataframe as dd
# import dask.config
# dask.config.set({"distributed.scheduler.allowed-failures": 10})
today = date.today()
import os
import sys
sys.path.insert(0, '/home/ubuntu/workspace_SingleCell/SingleCell_Morphological_Analysis/') 
from singlecell.read import read_single_cell_sql
from singlecell.preprocess import handle_nans, extract_cpfeature_names,find_highly_correlated_features
from singlecell.visualize import visualize_n_SingleCell, cluster
from singlecell.process import statistical_tests,precision_recall
from singlecell.preprocess.filter_out_edge_single_cells import edgeCellFilter
from singlecell.save.save_pandas_dfs import saveDF_to_CSV_GZ_no_timestamp
from singlecell.preprocess.control_for_cellcount import control_feature_y_for_variable_x
from singlecell.process.replicate_correlation import replicate_null_corr_coefs
from singlecell.process import normalize_funcs
import scipy.stats as ss
import json

# %matplotlib inline  

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
# Function to calculate U statistics and p-values using Mann-Whitney U test for each feature
def calculate_stats(ds_params,feature_list, gene_list, df_genes, p_value):
    df_p_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)
    df_u_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)

    gene_counter = 0

    for feat in feature_list:
        list_p = []
        list_u = []
        for gene in gene_list:
            gene_counter += 1
            # Print status every 10 features
            if gene_counter / len(gene_list) % 10 == 0:
                print(
                    f"now calculating feature number {gene_counter/len(gene_list)}"
                )
                
            query_str = f'{ds_params["untreated_key_val"][0]} == "{ds_params["untreated_key_val"][1]}"'
            
            u, p = ss.mannwhitneyu(
                df_genes.query(
                    query_str
                )[feat],
                df_genes.query(
                    "Metadata_Symbol == @gene"
                )[feat],
                method="asymptotic",
            )

            list_p.append(p)
            list_u.append(u)

        df_p_values_feature[feat] = list_p
        df_u_values_feature[feat] = list_u

    df_u_values_feature = df_u_values_feature.apply(pd.to_numeric)
    df_p_values_feature = df_p_values_feature.apply(pd.to_numeric)

    df_p_values_feature.loc["sig_gene_count"] = 0

    for i in range(len(df_p_values_feature.columns)):
        count = 0
        for j in range(len(df_p_values_feature.index) - 1):
            if df_p_values_feature.iloc[j, i] <= p_value:
                count += 1
        df_p_values_feature.iloc[len(df_p_values_feature.index) - 1, i] = count

    return df_u_values_feature, df_p_values_feature


import numpy as np

def calculate_stats_2(ds_params,feature_list, gene_list, df_genes, p_value):
    df_p_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)
    df_u_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)

    # Query is done once
    untreated_df = df_genes.query(f'{ds_params["untreated_key_val"][0]} == "{ds_params["untreated_key_val"][1]}"')
    
    for feat in feature_list:
        list_p = []
        list_u = []
        for i, gene in enumerate(gene_list):
            # Print status every 10 features
#             if (i + 1) % 10 == 0:
#                 print(
#                     f"now calculating feature number {i+1}"
#                 )
            
            gene_df = df_genes.query("Metadata_Symbol == @gene")[feat]
            
            u, p = ss.mannwhitneyu(
                untreated_df[feat],
                gene_df,
                method="asymptotic"
            )

            list_p.append(p)
            list_u.append(u)

        df_p_values_feature[feat] = list_p
        df_u_values_feature[feat] = list_u

    df_u_values_feature = df_u_values_feature.apply(pd.to_numeric)
    df_p_values_feature = df_p_values_feature.apply(pd.to_numeric)

    # Vectorized comparison of p-values
    df_p_values_feature.loc["sig_gene_count"] = (df_p_values_feature <= p_value).sum(axis=0)

    return df_u_values_feature, df_p_values_feature


In [None]:
calculate_stats

In [3]:
for profile_file, cell_line, condition in file_list:
    print (f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join("../Profile_Aggregation/outputs",profile_file))
    df = df[df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]
    
    print ("Converting feature strings to ints")
    df_int_feats,features_dic_reverse = features_to_ints(df)
    
    describe_expression(df, cell_line)

    df_genes = df_int_feats.query("Metadata_Foci_Barcode_MatchedTo_GeneCode != 'nontargeting'")
    genes = list(df_genes.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())
    features_int = list(df_genes.columns)[2:]

    # Perform the statistics calculations for each feature
    print (f"Calculating statistics for {profile_file}")
    df_u_values , df_p_values = calculate_stats(features_int, genes, df_int_feats, p_value)

    # Return the ints to feature strings
    print (f'Converting ints to feature strings')
    df_p_values = df_p_values.rename(columns=features_dic_reverse)
    df_u_values = df_u_values.rename(columns=features_dic_reverse)
    df_p_values = df_p_values.reset_index(level=0).rename(columns={'index':'Gene'})
    df_u_values = df_u_values.reset_index(level=0).rename(columns={'index':'Gene'})
    if condition:
        df_p_values.to_csv(os.path.join(output_folder,f'{cell_line}_{condition}_significant_features_mann_whitney_p_values.csv.gz'),index=False)
        df_u_values.to_csv(os.path.join(output_folder,f'{cell_line}_{condition}_significant_features_mann_whitney_u_values.csv.gz'),index=False)
    else:
        df_p_values.to_csv(os.path.join(output_folder,f'{cell_line}_significant_features_mann_whitney_p_values.csv.gz'),index=False)
        df_u_values.to_csv(os.path.join(output_folder,f'{cell_line}_significant_features_mann_whitney_u_values.csv.gz'),index=False)


NameError: name 'file_list' is not defined

In [10]:
########################## Project root directory and path to results ########################
mito_project_root_dir = "/home/ubuntu/bucket/projects/2016_08_01_RadialMitochondriaDistribution_donna/"
save_results_dir = mito_project_root_dir + "/workspace/results/jump_fq/"


import pandas as pd
from sqlalchemy import create_engine
from functools import reduce
import gc


def read_per_well_data(
    input_data_dir,
    annot,
    prof_workspace_folder_name="profiles",
    fformat=".parquet",
):
    batches = annot["Batch"].unique()

    df_agg_all_batches_ls = []
    for b in batches:
        print(b)
        #         if "Metadata_Source" in annot.columns:
        source_str = annot.loc[
            annot["Batch"] == b, "Metadata_Source"
        ].unique()[0]
        #             print(source_str)
        profile_path = (
            input_data_dir
            + source_str
            + "/workspace/"
            + prof_workspace_folder_name
            + "/"
        )
        #         else:
        #             profile_path = input_data_dir + "/workspace/profiles/"

        df_sag_ls = []
        plates_exist = os.listdir(profile_path + b)
        plates_meta = annot.loc[annot["Batch"] == b, "Metadata_Plate"].unique()
        plates = set(plates_meta) & set(plates_exist)
        for p in plates:
            print(p)

            fileName = profile_path + b + "/" + p + "/" + p + fformat
            #             print(fileName)
            if os.path.exists(fileName):
                if fformat == ".parquet":
                    sc_df = pd.read_parquet(fileName)
                elif fformat in [".csv", ".csv.gz"]:
                    sc_df = pd.read_csv(fileName)

                #         per_site_aggregate=sc_df.groupby(['Metadata_Well','Metadata_Site']).mean()[feature_list+['Count_Cells']].reset_index()
                sc_df["Metadata_Batch"] = b
                sc_df["Metadata_Plate"] = p
                df_sag_ls.append(sc_df)
                del sc_df
                gc.collect()
            else:
                print(fileName, " not exists")

        if df_sag_ls:
            df_sag = pd.concat(df_sag_ls, axis=0)
            df_agg_all_batches_ls.append(df_sag)

    df_agg_all_batches = pd.concat(df_agg_all_batches_ls, axis=0)
    return df_agg_all_batches


lincs_meta_cols = [
    "Metadata_broad_sample",
    "Metadata_dose_recode",
    "Metadata_pert_id",
    "Metadata_pert_mfc_id",
    "Metadata_InChIKey14",
    "Metadata_pert_type",
    "Metadata_moa",
    "Metadata_target",
    "Metadata_pert_id_dose",
    "Metadata_pert_name",
]

# lincs_meta_cols=['Metadata_broad_sample','Metadata_dose_recode','Metadata_pert_id','Metadata_pert_mfc_id',\
# 'Metadata_InChIKey14','Metadata_pert_type','Metadata_pert_id_dose']

cdrp_meta_cols = [
    "Metadata_broad_sample",
    "Metadata_mmoles_per_liter2",
    "Metadata_pert_id",
    "Metadata_Sample_Dose",
    "Metadata_moa",
]
jumporf_meta_cols = ["Symbol", "broad_sample"]
jumpcrispr_meta_cols = ["Metadata_NCBI_Gene_ID", "Metadata_Symbol"]
jumpcompound_meta_cols = ["Metadata_InChIKey", "Metadata_InChI"]
taorf_meta_cols = [
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_broad_sample",
    "Metadata_moa",
]


# jump_orf_params={'profiles_path':"/home/ubuntu/jumpbucket/projects/2021_04_26_Production/workspace/backend/",\
#                  'meta_cols':jumporf_meta_cols,\
#                  'pert_col':'broad_sample',\
#                  'target_features_list':target_features_list_orf_cdrp
#                 }

jump_orf_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0016-jump/",
    "prof_workspace_folder_name": "profiles",
    "pformat": ".parquet",
    "meta_cols": jumporf_meta_cols,
    "pert_col": "Metadata_JCP2022",
    "untreated_key_val": ["Metadata_pert_type", "negcon"],
}

cdrp_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0012-wawer-bioactivecompoundprofiling/broad/workspace/backend/",
    "meta_cols": cdrp_meta_cols,
    "pert_col": "Metadata_Sample_Dose",
}
# /home/ubuntu/bucket/projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace/backend/
# https://cellpainting-gallery.s3.amazonaws.com/cpg0004-lincs/broad/workspace/backend/2016_04_01_a549_48hr_batch1/SQ00014812
lincs_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0004-lincs/",
    "prof_workspace_folder_name": "backend",
    "pformat": ".csv",
    "meta_cols": lincs_meta_cols,
    "pert_col": "Metadata_pert_id_dose",
    "untreated_key_val": ["Metadata_pert_type", "control"],
}

jump_crispr_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0016-jump/",
    "prof_workspace_folder_name": "profiles",
    "pformat": ".parquet",
    "meta_cols": jumpcrispr_meta_cols,
    "pert_col": "Metadata_JCP2022",
    "untreated_key_val": ["Metadata_Symbol", "non-targeting"],
}

jump_compound_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0016-jump/",
    "prof_workspace_folder_name": "profiles",
    "pformat": ".parquet",
    "meta_cols": jumpcompound_meta_cols,
    "pert_col": "Metadata_JCP2022",
    "untreated_key_val": ["Metadata_JCP2022", "JCP2022_999999"],
}

ta_orf_params = {
    "profiles_path": "/home/ubuntu/gallery/cpg0017-rohban-pathways/",
    "prof_workspace_folder_name": "profiles",
    "pformat": ".csv.gz",
    "meta_cols": taorf_meta_cols,
    "pert_col": "Metadata_broad_sample",
    "untreated_key_val": ["Metadata_pert_type", "Untreated"],
}

ds_info_dict = {
    "jump_orf": jump_orf_params,
    "CDRP": cdrp_params,
    "lincs": lincs_params,
    "lincs_g": lincs_params,
    "jump_crispr": jump_crispr_params,
    "jump_compound": jump_compound_params,
    "taorf": ta_orf_params,
}
# 'broad_sample', 'pert_type', 'control_type'

# results=annot[['Symbol','broad_sample', 'pert_type', 'control_type']].drop_duplicates().reset_index(drop=True)

# dataset='CDRP';dataset_meta_hue='Metadata_moa'
# dataset='lincs';dataset_meta_hue='Metadata_moa'
# dataset='jump_orf';dataset_meta_hue='Symbol'

In [15]:
df_rep_level_scaled[ds_info_dict[dataset]["untreated_key_val"][0]]

0           trt
1           trt
2           trt
3           trt
4           trt
          ...  
86690       trt
86691    poscon
86692    poscon
86693    poscon
86694    poscon
Name: Metadata_pert_type, Length: 86695, dtype: object

In [16]:
datasets = ["jump_orf", "jump_crispr"]

symbol_col = {
    "jump_crispr": "Metadata_Symbol",
    "jump_orf": "Metadata_Symbol",
    "taorf": "Metadata_gene_name",
}

# Define p-value to use for significance
p_value = 0.001

df_ls = []
# names=[]
ds_symbols = {}
ds_features = {}
for dataset in datasets:
    #     names.append(dataset)
    annot = pd.read_csv(
        mito_project_root_dir
        + "/workspace/metadata/preprocessed/annot_"
        + dataset
        + ".csv",
        dtype={"Metadata_Plate": str},
    )
    if "Metadata_Source" not in annot.columns:
        annot["Metadata_Source"] = "broad"
    # target_features_list = ds_info_dict[dataset]["target_features_list"]
    # sources = ['source_5']
    sources = annot["Metadata_Source"].unique()
    for si in sources:
        file_path = (
            save_results_dir
            + "/preprocessed_data/"
            + dataset
            + "_df_rep_level_scaled_"
            + si
            + ".csv"
        )
        if os.path.exists(file_path):
            df_rep_level_scaled = pd.read_csv(file_path)

            ds_symbols[dataset] = (
                df_rep_level_scaled[symbol_col[dataset]].unique().tolist()
            )

            (
                cp_features,
                cp_features_analysis_0,
            ) = extract_cpfeature_names.extract_cpfeature_names(
                df_rep_level_scaled
            )

            
            sdfdsfds
            
            
            df_genes = df_rep_level_scaled[df_rep_level_scaled[ds_info_dict[dataset]["untreated_key_val"][0]]!= \
                                                 ds_info_dict[dataset]["untreated_key_val"][1]]
            
            genes = list(df_genes[~df_genes['Metadata_Symbol'].isnull()].Metadata_Symbol.unique())
            df_u_values , df_p_values = calculate_stats(ds_info_dict[dataset],cp_features_analysis_0, genes, df_rep_level_scaled, p_value)
#             features_int = list(df_genes.columns)[2:]            
            
            
#             ds_features[dataset] = cp_features_analysis_0

#             #             df_rep_level_scaled['']

#             #         batch_names_cols = DS_X_featureQ_rank_df.columns[
#             #             DS_X_featureQ_rank_df.columns.str.contains("^average_std_score_")
#             #         ].tolist()
#             df_ls.append(df_rep_level_scaled)

SyntaxError: invalid syntax (1829159743.py, line 54)

In [61]:
df_rep_level_scaled.groupby('Metadata_Symbol').size().describe()

count    12602.000000
mean         6.815664
std         25.090888
min          4.000000
25%          5.000000
50%          5.000000
75%          5.000000
max       2026.000000
dtype: float64

In [55]:
df_rep_level_scaled[df_rep_level_scaled['Metadata_Symbol'].isnull()]#['Metadata_Symbol']

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Metadata_NCBI_Gene_ID,Metadata_Taxon_ID,Metadata_Gene_Description,Metadata_Prot_Match,Metadata_Insert_Length,Metadata_pert_type,Metadata_PlateType,Batch,batch_plate,ctrl_well
182,source_4,BR00117040,H20,-0.212082,-0.212089,599.10,615.10,495.24,507.47,546.50,...,,,,,,trt,ORF,2021_04_26_Batch1,2021_04_26_Batch1-BR00117040,False
221,source_4,BR00117040,J13,-0.211685,-0.211802,552.94,604.83,444.98,495.96,498.60,...,,,,,,trt,ORF,2021_04_26_Batch1,2021_04_26_Batch1-BR00117040,False
260,source_4,BR00117040,L05,-0.212521,-0.212281,575.23,588.39,473.17,484.91,523.82,...,,,,,,trt,ORF,2021_04_26_Batch1,2021_04_26_Batch1-BR00117040,False
470,source_4,BR00121564,E14,-0.212795,-0.212522,524.06,540.83,415.76,431.37,469.37,...,,,,,,trt,ORF,2021_04_26_Batch1,2021_04_26_Batch1-BR00121564,False
517,source_4,BR00121564,G13,-0.213007,-0.212722,584.10,569.90,477.85,461.69,530.62,...,,,,,,trt,ORF,2021_04_26_Batch1,2021_04_26_Batch1-BR00121564,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85002,source_4,BR00126709,G09,-0.229396,-0.228730,595.14,595.86,483.84,480.86,538.90,...,,,,,,trt,ORF,2021_08_23_Batch12,2021_08_23_Batch12-BR00126709,False
85370,source_4,BR00126710,G09,-0.229524,-0.228817,590.17,582.52,476.66,468.95,532.84,...,,,,,,trt,ORF,2021_08_23_Batch12,2021_08_23_Batch12-BR00126710,False
85738,source_4,BR00126706,G09,-0.229305,-0.228588,580.49,572.01,466.02,457.71,523.03,...,,,,,,trt,ORF,2021_08_23_Batch12,2021_08_23_Batch12-BR00126706,False
86119,source_4,BR00126716,G21,-0.201646,-0.200887,609.91,593.20,499.07,484.10,554.21,...,,,,,,trt,ORF,2021_08_23_Batch12,2021_08_23_Batch12-BR00126716,False


In [None]:
%%time
genes = list(df_genes[~df_genes['Metadata_Symbol'].isnull()].Metadata_Symbol.unique())
df_u_values , df_p_values = calculate_stats_3(ds_info_dict[dataset],cp_features_analysis_0, genes, df_rep_level_scaled, p_value)


In [71]:
len(cp_features_analysis_0)

3439

In [70]:
%%time
genes = list(df_genes[~df_genes['Metadata_Symbol'].isnull()].Metadata_Symbol.unique())
df_u_values , df_p_values = calculate_stats_2(ds_info_dict[dataset],cp_features_analysis_0[:3], genes, df_rep_level_scaled, p_value)


CPU times: user 48min 57s, sys: 1.64 s, total: 48min 59s
Wall time: 48min 46s


In [65]:
len(genes)

12598

In [52]:
            genes = list(df_genes[~df_genes['Metadata_Symbol'].isnull()].Metadata_Symbol.unique())
            df_u_values , df_p_values = calculate_stats(ds_info_dict[dataset],cp_features_analysis_0, genes, df_rep_level_scaled, p_value)


now calculating feature number 10.0


RecursionError: maximum recursion depth exceeded in comparison

In [51]:
len(genes0)

12599

In [48]:
import math
genes0 = list(df_genes.Metadata_Symbol.unique())
genes = [x for x in genes0 if not math.isnan(x)]

TypeError: must be real number, not str

In [39]:
ds_params=ds_info_dict[dataset]
feature_list=cp_features_analysis_0
gene_list=genes
df_p_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)
df_u_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)

gene_counter = 0

for feat in feature_list:
    list_p = []
    list_u = []
    for gene in gene_list:
        gene_counter += 1
        # Print status every 10 features
        if gene_counter / len(gene_list) % 10 == 0:
            print(
                f"now calculating feature number {gene_counter/len(gene_list)}"
            )

        query_str = f'{ds_params["untreated_key_val"][0]} == "{ds_params["untreated_key_val"][1]}"'

        u, p = ss.mannwhitneyu(
            df_rep_level_scaled.query(
                query_str
            )[feat],
            df_rep_level_scaled.query(
                "Metadata_Symbol == @gene"
            )[feat],
        )

        list_p.append(p)
        list_u.append(u)

    df_p_values_feature[feat] = list_p
    df_u_values_feature[feat] = list_u

df_u_values_feature = df_u_values_feature.apply(pd.to_numeric)
df_p_values_feature = df_p_values_feature.apply(pd.to_numeric)

df_p_values_feature.loc["sig_gene_count"] = 0

for i in range(len(df_p_values_feature.columns)):
    count = 0
    for j in range(len(df_p_values_feature.index) - 1):
        if df_p_values_feature.iloc[j, i] <= p_value:
            count += 1
    df_p_values_feature.iloc[len(df_p_values_feature.index) - 1, i] = count


ValueError: `x` and `y` must be of nonzero size.

In [46]:
gene_list.index(gene)

170

In [43]:
            df_rep_level_scaled.query(
                "Metadata_Symbol == @gene"
            )[feat],

(Series([], Name: Cytoplasm_Correlation_Correlation_AGP_Mito, dtype: float64),)

In [45]:
gene_list

['PCDHA8',
 'DNM2',
 'RHBDF1',
 'EPB41L1',
 'FSTL5',
 'RNF19A',
 'TXNDC16',
 'XRN2',
 'PALD1',
 'TRIM28',
 'GTF3C3',
 'PLEKHG2',
 'AGO1',
 'PCDHGB2',
 'FPGT-TNNI3K',
 'MICALL2',
 'TLR5',
 'DLG3',
 'ENPP1',
 'PHKB',
 'SCYL2',
 'eGFP',
 'ARHGAP45',
 'AFF2',
 'MAGEE1',
 'CLMN',
 'BICRAL',
 'PIK3CD',
 'SART3',
 'KCNT2',
 'MAN2B2',
 'BMS1',
 'COBL',
 'PPP1R13B',
 'A2M',
 'TSC2',
 'ARHGAP6',
 'ITSN2',
 'LRIG1',
 'LTN1',
 'CACHD1',
 'KANK1',
 'PFAS',
 'RAPGEF6',
 'TACC3',
 'EXOC1',
 'KIFC2',
 'MARCHF6',
 'RASA3',
 'LGR6',
 'XPC',
 'ANO6',
 'CDAN1',
 'SMCR8',
 'SSH1',
 'DNM1',
 'SGSM2',
 'ESCO1',
 'ABTB2',
 'RBM25',
 'ARHGAP12',
 'LIG1',
 'AP2A1',
 'MORC3',
 'LPIN3',
 'CCDC180',
 'ZNF512B',
 'KRBA1',
 'PLXNA2',
 'WDR11',
 'DICER1',
 'MAML2',
 'ANKRD26',
 'EPHA1',
 'LTBP2',
 'CHL1',
 'SPATA31E1',
 'PATJ',
 'SFSWAP',
 'NOL8',
 'BLM',
 'SMG5',
 'SNCAIP',
 'IFT140',
 'STAG3',
 'TUBGCP5',
 'E2F7',
 'EGFR',
 'SLC4A3',
 'TECPR2',
 'PPP1R13L',
 'SLC9A5',
 'AXDND1',
 'MIOS',
 'PRPF40B',
 'RAB3GAP1',
 '

In [44]:
gene

nan

In [67]:
# df_rep_level_scaled.query(
#                 query_str
#             )

In [38]:

df_u_values , df_p_values = calculate_stats(ds_info_dict[dataset],cp_features_analysis_0, genes, df_rep_level_scaled, p_value)

ValueError: `x` and `y` must be of nonzero size.

In [31]:
            df_u_values , df_p_values = calculate_stats(ds_info_dict[dataset],cp_features_analysis_0, genes, df_rep_level_scaled, p_value)


ValueError: `x` and `y` must be of nonzero size.

In [25]:
df_genes["Metadata_Symbol"]

0         PCDHA8
1           DNM2
2         RHBDF1
3        EPB41L1
4          FSTL5
          ...   
86690      SRPK1
86691       eGFP
86692       eGFP
86693       eGFP
86694       eGFP
Name: Metadata_Symbol, Length: 82803, dtype: object

In [23]:
ds_info_dict[dataset][pert_col]

{'profiles_path': '/home/ubuntu/gallery/cpg0016-jump/',
 'prof_workspace_folder_name': 'profiles',
 'pformat': '.parquet',
 'meta_cols': ['Symbol', 'broad_sample'],
 'pert_col': 'Metadata_JCP2022',
 'untreated_key_val': ['Metadata_pert_type', 'negcon']}

In [64]:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def calculate_stats_for_gene(params):
    gene, untreated_df, feat, df_genes = params
    gene_df = df_genes.query("Metadata_Symbol == @gene")[feat]
    u, p = ss.mannwhitneyu(
        untreated_df[feat],
        gene_df,
        method="asymptotic"
    )
    return p, u

def calculate_stats_3(ds_params, feature_list, gene_list, df_genes, p_value):
    df_p_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)
    df_u_values_feature = pd.DataFrame(index=gene_list, columns=feature_list)

    # Query is done once
    untreated_df = df_genes.query(f'{ds_params["untreated_key_val"][0]} == "{ds_params["untreated_key_val"][1]}"')

    for feat in feature_list:
        with Pool() as p:
            results = p.map(calculate_stats_for_gene, [(gene, untreated_df, feat, df_genes) for gene in gene_list])
        
        p_values, u_values = zip(*results)
        df_p_values_feature[feat] = p_values
        df_u_values_feature[feat] = u_values

    df_u_values_feature = df_u_values_feature.apply(pd.to_numeric)
    df_p_values_feature = df_p_values_feature.apply(pd.to_numeric)

    # Vectorized comparison of p-values
    df_p_values_feature.loc["sig_gene_count"] = (df_p_values_feature <= p_value).sum(axis=0)

    return df_u_values_feature, df_p_values_feature