In [1]:
from proxbias.depmap.process import compute_monte_carlo_stats
from proxbias.depmap.load import get_depmap_data
from proxbias.utils.data_utils import get_cancer_gene_lists
from proxbias.metrics import genome_proximity_bias_score
from proxbias.depmap.constants import *

In [2]:
import os
import sys

import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import pickle
import ast
import random

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the DepMap tables through the project loader; the second returned value is discarded.
# NOTE(review): rnai_release="" presumably skips loading an RNAi release — confirm against get_depmap_data.
crispr_effect, _, cnv_data, mutation_data = get_depmap_data(rnai_release="")
# Cancer gene lists derived from the row labels (index) of the CRISPR effect table.
oncogenes, tsgs = get_cancer_gene_lists(crispr_effect.index)

CRISPRGeneEffect.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsCNGene.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsSomaticMutations.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!


In [4]:
%%time
# Observed (unscrambled) Monte Carlo run for TP53 in "lof" search mode, scored with
# genome_proximity_bias_score; every column of crispr_effect is offered as a candidate model.
# Note - n_workers should likely be around half the number of CPUs
res = compute_monte_carlo_stats(
    genes_of_interest=["TP53"],
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_iterations=100,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=4,
    cnv_cutoffs=(1.5, 2.5)
)
res.head()

Stats for TP53 computed in 179.95777010917664 - diff is 0.025881591000000204, 247 wt and 344 lof
CPU times: user 272 ms, sys: 159 ms, total: 432 ms
Wall time: 3min 1s


Unnamed: 0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_models,n_test,n_wt
TP53,"[0.67920954, 0.6675941400000001, 0.66288408, 0...",0.672657,"[0.6480423999999999, 0.6497969, 0.64489834, 0....",0.646775,0.025882,lof,197,344,247


In [5]:
#write a function that performs a t-test for the bootstrap estimates
def perform_t_test(df: pd.DataFrame) -> pd.DataFrame:
    """Run an independent two-sample t-test on the bootstrap estimates of every row.

    Parameters
    ----------
    df : pd.DataFrame
        One row per gene (gene labels in the index). Must contain the columns
        'test_stats' and 'wt_stats', each holding a sequence of numbers.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) in which the former index is
        the leading column 'gene', plus float columns 't_stat' and 'p_value'
        from ``scipy.stats.ttest_ind(test_stats, wt_stats)``.
    """
    out = df.reset_index()
    # reset_index inserts the old index as the first column; rename that column
    # by position so the result is called 'gene' even when the index had a name
    # (the previous {'index': 'gene'} mapping silently did nothing in that case).
    out = out.rename(columns={out.columns[0]: 'gene'})

    t_values = []
    p_values = []
    for test_stats, wt_stats in zip(out['test_stats'], out['wt_stats']):
        t_stat, p_value = stats.ttest_ind(test_stats, wt_stats)
        t_values.append(float(t_stat))
        p_values.append(float(p_value))

    # Build the columns in one go with an explicit float dtype. Initialising
    # them with None produced object columns, which ignore pandas' float
    # display format and are awkward for later numeric work.
    out['t_stat'] = pd.Series(t_values, index=out.index, dtype=float)
    out['p_value'] = pd.Series(p_values, index=out.index, dtype=float)
    return out
# Global pandas display option: floats are rendered in scientific notation with 10 decimals
# for every DataFrame shown from here on (needed to read very small p-values).
pd.options.display.float_format = '{:.10e}'.format
# t-test of the TP53 bootstrap estimates (test vs. wild-type) from the observed run.
ttest=perform_t_test(res)
ttest

Unnamed: 0,gene,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_models,n_test,n_wt,t_stat,p_value
0,TP53,"[0.67920954, 0.6675941400000001, 0.66288408, 0...",0.672656648,"[0.6480423999999999, 0.6497969, 0.64489834, 0....",0.646775057,0.025881591,lof,197,344,247,31.865258665,6.9978804145e-80


In [6]:
### write a function to scramble a given column's values
def scramble_column(df: pd.DataFrame, column_name: str = 'ModelID', seed: int = None) -> pd.DataFrame:
    """Return a copy of ``df`` with the values of one column randomly permuted.

    If ``column_name`` is not one of the frame's columns, the index labels are
    permuted instead. When ``seed`` is given, the module-level ``random``
    generator is re-seeded with it before shuffling (a global side effect);
    otherwise the generator's current state is used. The input frame is left
    untouched.
    """
    result = df.copy()
    targets_column = column_name in result.columns

    # Pick what gets permuted: the named column when present, else the index.
    source = result[column_name] if targets_column else result.index
    permuted = source.tolist()

    if seed is not None:
        random.seed(seed)
    random.shuffle(permuted)

    # Write the permuted labels back to wherever they came from.
    if targets_column:
        result[column_name] = permuted
    else:
        result.index = permuted

    return result

### write a function to scramble column names
def scramble_column_names(df: pd.DataFrame, seed: int = None) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are randomly permuted.

    The data stay in place; only the labels move, so each column of values
    ends up under a different (randomly chosen) label. When ``seed`` is given,
    the module-level ``random`` generator is re-seeded with it first (a global
    side effect). The input frame is not modified.
    """
    if seed is not None:
        random.seed(seed)

    # Permute a plain list of the labels, then attach it to a fresh copy.
    shuffled_labels = [*df.columns]
    random.shuffle(shuffled_labels)

    result = df.copy()
    result.columns = shuffled_labels
    return result


# Permute the column labels of cnv_data once with a fixed seed; the values stay in place,
# so each column of copy-number values is now filed under a different label.
scrambled_cnv_data = scramble_column_names(cnv_data, seed=60)
scrambled_cnv_data


Unnamed: 0,ACH-000349,ACH-002151,ACH-000137,ACH-002045,ACH-001125,ACH-001704,ACH-000443,ACH-001611,ACH-002311,ACH-002271,...,ACH-001524,ACH-002163,ACH-001548,ACH-002288,ACH-000756,ACH-001691,ACH-002660,ACH-002251,ACH-002212,ACH-001032
FAM87B,1.0169170477e+00,1.0252245991e+00,9.5921757667e-01,1.0048317811e+00,6.7810472391e-01,9.1587460915e-01,7.2648453903e-01,5.4303496554e-01,1.2173390166e+00,9.8155907240e-01,...,1.0654596811e+00,4.9212646253e-01,1.0094517821e+00,7.4180710958e-01,9.7031360077e-01,8.9828802085e-01,1.0507889978e+00,7.9915153620e-01,5.1559136246e-01,9.9457908628e-01
LINC01128,1.0169170477e+00,1.0252245991e+00,9.5921757667e-01,1.0048317811e+00,6.7810472391e-01,9.1587460915e-01,7.2648453903e-01,5.4303496554e-01,1.2173390166e+00,9.8155907240e-01,...,1.0654596811e+00,4.9212646253e-01,1.0094517821e+00,7.4180710958e-01,9.7031360077e-01,8.9828802085e-01,1.0507889978e+00,7.9915153620e-01,5.1559136246e-01,9.9457908628e-01
AL669831.7,1.0169170477e+00,1.0252245991e+00,9.5921757667e-01,1.0048317811e+00,6.7810472391e-01,9.1587460915e-01,7.2648453903e-01,5.4303496554e-01,1.2173390166e+00,9.8155907240e-01,...,1.0654596811e+00,4.9212646253e-01,1.0094517821e+00,7.4180710958e-01,9.7031360077e-01,8.9828802085e-01,1.0507889978e+00,7.9915153620e-01,5.1559136246e-01,9.9457908628e-01
FAM41C,1.0169170477e+00,1.0252245991e+00,9.5921757667e-01,1.0048317811e+00,6.7810472391e-01,9.1587460915e-01,7.2648453903e-01,5.4303496554e-01,1.2173390166e+00,9.8155907240e-01,...,1.0654596811e+00,4.9212646253e-01,1.0094517821e+00,7.4180710958e-01,9.7031360077e-01,8.9828802085e-01,1.0507889978e+00,7.9915153620e-01,5.1559136246e-01,9.9457908628e-01
LINC02593,1.0169170477e+00,1.0252245991e+00,9.5921757667e-01,1.0048317811e+00,6.7810472391e-01,9.1587460915e-01,7.2648453903e-01,5.4303496554e-01,1.2173390166e+00,9.8155907240e-01,...,1.0654596811e+00,4.9212646253e-01,1.0094517821e+00,7.4180710958e-01,9.7031360077e-01,8.9828802085e-01,1.0507889978e+00,7.9915153620e-01,5.1559136246e-01,9.9457908628e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RPS4Y2,4.8776224620e-01,,6.1954286145e-01,,,,3.8199403096e-01,,3.1531506384e-01,,...,6.1911400499e-01,4.8265492322e-01,8.0005114246e-03,4.1293704615e-09,1.9180347438e-09,8.5632945408e-10,1.5183989018e-09,4.8946174085e-01,4.8621927776e-01,1.8121702185e-07
PRORY,4.8776224620e-01,,6.1954286145e-01,,,,3.8199403096e-01,,3.1531506384e-01,,...,6.1911400499e-01,4.8265492322e-01,8.1190857660e-03,4.1293704615e-09,1.9180347438e-09,8.5632945408e-10,1.5183989018e-09,4.8946174085e-01,4.8621927776e-01,5.7013514982e-10
TTTY13,4.8776224620e-01,,6.1954286145e-01,,,,3.8199403096e-01,,3.1531506384e-01,,...,6.1911400499e-01,4.8265492322e-01,8.1190857660e-03,4.1293704615e-09,1.9180347438e-09,8.5632945408e-10,1.5183989018e-09,4.8946174085e-01,4.8621927776e-01,5.7013514982e-10
TTTY5,4.8776224620e-01,,6.1954286145e-01,,,,3.8199403096e-01,,3.1531506384e-01,,...,6.1911400499e-01,4.8265492322e-01,8.1190857660e-03,4.1293704615e-09,1.9180347438e-09,8.5632945408e-10,1.5183989018e-09,4.8946174085e-01,4.8621927776e-01,5.7013514982e-10


In [7]:
#Test compute_monte_carlo_stats on just one scrambled version of the cnv data
# (single sanity-check run with the label-permuted CNV table, TP53 / "lof" as in the observed run).
# NOTE(review): cnv_cutoffs is not passed here, unlike the unscrambled call earlier in the
# notebook, so the library default applies — confirm it equals (1.5, 2.5).
scrambled_res = compute_monte_carlo_stats(
    genes_of_interest=["TP53"],
    dependency_data=crispr_effect,
    cnv_data=scrambled_cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_iterations=100,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=4
)

#add on a t-test
null_ttest=perform_t_test(scrambled_res)
null_ttest

Stats for TP53 computed in 169.3167359828949 - diff is 0.030261239999999967, 203 wt and 343 lof


Unnamed: 0,gene,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_models,n_test,n_wt,t_stat,p_value
0,TP53,"[0.6804448, 0.66608688, 0.6760476799999999, 0....",0.677064276,"[0.6486923400000001, 0.6459512, 0.64140104, 0....",0.646803036,0.03026124,lof,162,343,203,34.832795561,2.1996272262e-86


In [8]:
#write a function that creates a permuted null across many shufflings of the data
def permute_null_t_test(mutation_data,
                        crispr_effect,
                        cnv_data,
                        n_shuffles,
                        n_min_cell_lines=25,
                        genes=None,
                        candidate_models=None,
                        eval_kwargs=None,
                        n_workers=4,
                        **kwargs):
    """Build an empirical null by re-running the Monte Carlo statistics on label-permuted CNV data.

    The statistics are computed once on the real ``cnv_data`` (seed 42, tagged
    ``result_type == "observed"``) and then ``n_shuffles`` times on copies of
    ``cnv_data`` whose column labels were permuted with ``scramble_column_names``
    (tagged ``"empirical_null"``). Every run is passed through ``perform_t_test``.

    Parameters
    ----------
    mutation_data, crispr_effect, cnv_data
        Forwarded to ``compute_monte_carlo_stats`` as ``mutation_data``,
        ``dependency_data`` and ``cnv_data``.
    n_shuffles : int
        Number of label permutations. Each one uses a distinct random seed.
    n_min_cell_lines : int
        Forwarded to ``compute_monte_carlo_stats``.
    genes : list of str, optional
        Genes of interest; defaults to ``["TP53"]``.
    candidate_models : list, optional
        Defaults to the columns of the ``crispr_effect`` *argument*.
    eval_kwargs : dict, optional
        Keyword arguments for ``genome_proximity_bias_score``; defaults to
        ``{"n_samples": 500, "n_trials": 200, "return_samples": False}``.
    n_workers : int
        Forwarded to ``compute_monte_carlo_stats``.
    **kwargs
        Any other keyword arguments for ``compute_monte_carlo_stats``
        (e.g. ``n_iterations``, ``search_mode``).

    Returns
    -------
    pd.DataFrame
        Observed and null rows concatenated and sorted by ``["gene", "seed"]``,
        with added columns 'seed', 'result_type', 't_stat' and 'p_value'.
    """
    # Resolve defaults at call time. The old signature evaluated
    # list(crispr_effect.columns) once, at definition time, against the *global*
    # crispr_effect, so subsetted inputs silently got the full model list.
    if genes is None:
        genes = ["TP53"]
    if candidate_models is None:
        candidate_models = list(crispr_effect.columns)
    if eval_kwargs is None:
        eval_kwargs = {"n_samples": 500, "n_trials": 200, "return_samples": False}

    def run_once(cnv_table, seed, result_type):
        """Run one Monte Carlo pass on ``cnv_table``, tag it, and append the t-test."""
        run_stats = compute_monte_carlo_stats(
            genes_of_interest=genes,
            dependency_data=crispr_effect,
            cnv_data=cnv_table,
            mutation_data=mutation_data,
            n_min_cell_lines=n_min_cell_lines,
            candidate_models=candidate_models,
            eval_function=genome_proximity_bias_score,
            eval_kwargs=eval_kwargs,
            seed=seed,
            n_workers=n_workers,
            **kwargs
        )
        run_stats['seed'] = seed
        run_stats['result_type'] = result_type
        return perform_t_test(run_stats)

    # Observed result first, so it leads the concatenated frame.
    frames = [run_once(cnv_data, 42, "observed")]

    # Sample seeds without replacement: independent randint draws can repeat,
    # and a repeated seed yields an identical permutation, i.e. a duplicated
    # null sample. The pool stays 1..1000 unless more shuffles are requested.
    seed_pool = range(1, max(1000, n_shuffles) + 1)
    random_seeds = random.sample(seed_pool, n_shuffles)

    for seed in random_seeds:
        # The same seed drives both the label permutation and the Monte Carlo run.
        scrambled_cnv_data = scramble_column_names(cnv_data, seed=seed)
        frames.append(run_once(scrambled_cnv_data, seed, "empirical_null"))

    # Concatenate once at the end instead of growing a frame inside the loop.
    permuted_null_t_test = pd.concat(frames, ignore_index=True).sort_values(["gene", "seed"])
    return permuted_null_t_test

# Quick smoke test: only 2 label permutations; n_iterations=5 is not a named parameter of
# permute_null_t_test, so it travels through **kwargs to compute_monte_carlo_stats.
permuted_null = permute_null_t_test(mutation_data,
                                    crispr_effect,
                                    cnv_data,
                                    n_shuffles=2,
                                    n_iterations=5)

permuted_null

Stats for TP53 computed in 10.370299816131592 - diff is 0.024002160000000217, 247 wt and 344 lof
Stats for TP53 computed in 10.477968215942383 - diff is 0.03716588399999998, 212 wt and 340 lof
Stats for TP53 computed in 10.100435018539429 - diff is 0.008891980000000022, 207 wt and 334 lof


Unnamed: 0,gene,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_models,n_test,n_wt,seed,result_type,t_stat,p_value
0,TP53,"[0.67920954, 0.6675941400000001, 0.66288408, 0...",0.670896316,"[0.6480423999999999, 0.6497969, 0.64489834, 0....",0.646894156,0.02400216,lof,197,344,247,42,observed,7.0818078592,0.00010384359753
1,TP53,"[0.6868486999999999, 0.68389126, 0.68368798, 0...",0.679781696,"[0.6387305999999999, 0.6413215, 0.6456061, 0.6...",0.642615812,0.037165884,lof,169,340,212,162,empirical_null,10.078445565,8.0069183366e-06
2,TP53,"[0.65127026, 0.6587487999999999, 0.65673102, 0...",0.657049232,"[0.65033158, 0.6480188800000001, 0.65058179999...",0.648157252,0.00889198,lof,165,334,207,920,empirical_null,3.6267606108,0.0067164726142


In [9]:
# Rescale the CNV values so that a value of 1 in the 2**x - 1 scale maps to copy number 2.
# NOTE(review): assumes cnv_data stores log2(relative copy number + 1) — confirm against the DepMap release notes.
cnv_data_trans = 2 * (np.power(2, cnv_data) - 1)

# A gene is called copy-number altered when its value falls outside [1.75, 2.25].
cnv_calls = (cnv_data_trans < 1.75) | (cnv_data_trans > 2.25)
# Fraction of rows (genes) called altered, per column (cell line), highest first.
cnv_call_pers = cnv_calls.mean(axis=0).sort_values(ascending=False)
# Cell lines with calls in fewer than 1% of genes. There are only 21 of these cell lines,
# so these weren't used for the driver analysis.
least_cnv_cl_01 = sorted(set(cnv_call_pers[cnv_call_pers < 0.01].index))

# Cell lines present in all three tables (CNV, CRISPR effect, mutations).
all_columns = set(cnv_data.columns).intersection(set(crispr_effect.columns)).intersection(set(mutation_data['ModelID']))

# TP53 copy-loss cell lines: TP53 copy number <= 1.5. Read the single TP53 row directly
# instead of transposing the whole matrix.
tp53_copy_number = cnv_data_trans.loc["TP53"]
tp53_loss_cell_lines = set(tp53_copy_number[tp53_copy_number <= 1.5].index).intersection(all_columns)

# Everything else in the shared set is treated as TP53 wild-type
# (this includes lines whose TP53 copy number is missing).
tp53_wt_cell_lines = all_columns - tp53_loss_cell_lines

# Sort when converting to lists: iteration order of a set of strings changes between
# kernel sessions (hash randomisation), which would reorder the columns below and the
# candidate-model lists built from them, making seeded runs irreproducible.
tp53_copy_loss = sorted(tp53_loss_cell_lines)
tp53_wt = sorted(tp53_wt_cell_lines)

# Subset the mutation and CRISPR tables by TP53 status
p53_lof_mutation_data = mutation_data[mutation_data['ModelID'].isin(tp53_copy_loss)]
p53_lof_crispr_effect = crispr_effect[tp53_copy_loss]

p53_wt_mutation_data = mutation_data[mutation_data['ModelID'].isin(tp53_wt)]
p53_wt_crispr_effect = crispr_effect[tp53_wt]

In [10]:
#write a function that either reads null from disc or creates it
def load_or_create_permuted_null(file_name, mutation_data, crispr_effect, cnv_data, search_mode, genes=None, fixed_cell_line_sampling=True, n_shuffles=50, n_iterations=128):
    """Load a cached permuted-null table from ``file_name``, or compute and cache it.

    Parameters
    ----------
    file_name : str
        Path of the tab-separated cache file.
    mutation_data, crispr_effect, cnv_data
        Inputs forwarded to ``permute_null_t_test`` when the cache is missing.
    search_mode : str
        Forwarded to ``permute_null_t_test`` (e.g. "lof" or "amp").
    genes : list of str, optional
        Genes of interest; defaults to
        ``["CDKN2A", "CDKN2B", "CDKN2C", "BTG2", "MDM4", "MDM2"]``.
    fixed_cell_line_sampling : bool
        Forwarded to ``permute_null_t_test``.
    n_shuffles : int
        Number of label permutations for the empirical null (20-50 is probably plenty).
    n_iterations : int
        Number of bootstraps per run.

    Returns
    -------
    pd.DataFrame
        The permuted-null table. The cache is keyed on ``file_name`` only: an
        existing file is returned as-is even if the other arguments differ from
        those used to create it. List-valued columns such as 'test_stats' and
        'wt_stats' come back from the cache as their string representation.
    """
    if os.path.exists(file_name):
        # The file was written with its index as the first column; read it back
        # as the index so a cache hit has the same columns as a fresh
        # computation (otherwise a stray "Unnamed: 0" column appears).
        return pd.read_csv(file_name, sep="\t", index_col=0)

    if genes is None:
        genes = ["CDKN2A", "CDKN2B", "CDKN2C", "BTG2", "MDM4", "MDM2"]

    permuted_null = permute_null_t_test(
        mutation_data,
        crispr_effect,
        cnv_data,
        n_shuffles=n_shuffles,
        genes=genes,
        n_iterations=n_iterations,
        search_mode=search_mode,
        n_min_cell_lines=25,
        fixed_cell_line_sampling=fixed_cell_line_sampling,
        candidate_models=list(crispr_effect.columns),
        cnv_cutoffs=(1.5, 2.5),
        filter_amp=True,
        eval_kwargs={"n_samples": 20, "n_trials": 200, "return_samples": False}
    )
    permuted_null.to_csv(file_name, sep="\t")
    return permuted_null

In [11]:
# Empirical null for TP53 alone on the full (unsubsetted) tables, "lof" search mode,
# with fixed_cell_line_sampling switched off; read from the named file when it already exists.
p53_only_permuted_null = load_or_create_permuted_null(
    "p53_only_permuted_null.txt",
    mutation_data,
    crispr_effect,
    cnv_data,
    search_mode="lof",
    genes=["TP53"],
    fixed_cell_line_sampling=False)

Stats for TP53 computed in 219.64018893241882 - diff is 0.02676416015625005, 247 wt and 344 lof


  statistic = -2 * np.sum(np.log(pvalues))


Stats for TP53 computed in 210.52769112586975 - diff is 0.011777343749999947, 210 wt and 337 lof
Stats for TP53 computed in 209.9442868232727 - diff is 0.04990244140624989, 217 wt and 306 lof


In [None]:
# Empirical null for the function's default gene list in "lof" search mode, using the
# mutation and CRISPR tables restricted to the p53 wild-type cell lines (cnv_data is passed whole).
p53_wt_permuted_null_LOF = load_or_create_permuted_null(
    "p53_wt_permuted_null_LOF_genes.txt",
    p53_wt_mutation_data,
    p53_wt_crispr_effect,
    cnv_data,
    search_mode="lof"
)

In [None]:
# Same as the "lof" run on the p53 wild-type tables, but in "amp" search mode
# and cached under its own file name.
p53_wt_permuted_null_AMP = load_or_create_permuted_null(
    "p53_wt_permuted_null_AMP_genes.txt",
    p53_wt_mutation_data,
    p53_wt_crispr_effect,
    cnv_data,
    search_mode="amp"
)

In [None]:
# Empirical null for the function's default gene list in "lof" search mode, using the
# mutation and CRISPR tables restricted to the TP53 copy-loss cell lines (cnv_data is passed whole).
p53_lof_permuted_null_LOF = load_or_create_permuted_null(
    "p53_lof_permuted_null_LOF_genes.txt",
    p53_lof_mutation_data,
    p53_lof_crispr_effect,
    cnv_data,
    search_mode="lof"
)

In [None]:
# Same as the "lof" run on the TP53 copy-loss tables, but in "amp" search mode
# and cached under its own file name.
p53_lof_permuted_null_AMP = load_or_create_permuted_null(
    "p53_lof_permuted_null_AMP_genes.txt",
    p53_lof_mutation_data,
    p53_lof_crispr_effect,
    cnv_data,
    search_mode="amp"
)

### Compute Empirical Null for LoF Drivers on P53 WT

In [None]:
# Display the observed + empirical-null table for the p53 wild-type "lof" run.
p53_wt_permuted_null_LOF

In [None]:
def plot_empirical_null_histogram(df, gene_column='gene'):
    """
    Plot histograms of 'empirical_null' diff values faceted by gene,
    with the observed value marked by a vertical red line in each facet.

    Only genes that have an 'observed' row are plotted. If a gene has several
    'observed' rows, the first one is used.

    Parameters:
    - df: pandas DataFrame with columns 'result_type', 'diff' and `gene_column`
    - gene_column: name of the column to use for grouping (default: 'gene')

    Returns:
    - None (displays the plot)

    Raises:
    - ValueError: if no gene has both an 'observed' row and 'empirical_null' rows
    """
    result_type = df['result_type']

    # Observed 'diff' per gene, indexed by gene label (first occurrence wins).
    observed_diffs = (
        df.loc[result_type == 'observed']
        .dropna(subset=[gene_column])
        .drop_duplicates(subset=gene_column)
        .set_index(gene_column)['diff']
    )

    # Keep only the null rows of genes that also have an observed value.
    df_empirical_null = df.loc[
        (result_type == 'empirical_null') & df[gene_column].isin(observed_diffs.index)
    ]

    genes = list(df_empirical_null[gene_column].unique())
    if not genes:
        raise ValueError(
            "Nothing to plot: no gene has both 'observed' and 'empirical_null' rows."
        )

    # Pass the facet order explicitly so the axes below are guaranteed to line
    # up with `genes`, rather than relying on seaborn's implicit ordering.
    g = sns.FacetGrid(df_empirical_null, col=gene_column, col_order=genes, col_wrap=3, height=4)

    # Plot histogram for each gene
    g.map(plt.hist, 'diff', bins=20, color='skyblue', edgecolor='black')

    # Add a vertical red line for the "observed" value in each facet
    for ax, gene in zip(g.axes.flat, genes):
        ax.axvline(x=observed_diffs.loc[gene], color='red', linestyle='--', linewidth=1)
        ax.set_xlabel('Empirical Null Difference in BM Means')

    # Adjust layout on the grid's own figure (not whatever pyplot considers current)
    g.set_titles("{col_name}")
    g.fig.subplots_adjust(top=0.85)
    g.fig.suptitle('Histogram of Empirical Null Test Condition vs. WT Condition BM Differences \n with Observed Result Marked in Red')
    plt.show()

# Plot the empirical-null 'diff' distributions for the p53 wild-type "lof" run,
# one facet per gene, with the observed value marked in red.
plot_empirical_null_histogram(p53_wt_permuted_null_LOF, gene_column='gene')
