In [1]:
from proxbias.depmap.process import compute_monte_carlo_stats
from proxbias.depmap.load import get_depmap_data
from proxbias.utils.data_utils import get_cancer_gene_lists
from proxbias.metrics import genome_proximity_bias_score

In [12]:
import os
import sys

import numpy as np
import pandas as pd
import scipy as sp
import pickle
import ast

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the DepMap tables used by the rest of the notebook. get_depmap_data
# returns four objects; the second one is discarded via `_`.
# NOTE(review): rnai_release="" presumably disables loading of RNAi data -
# confirm against get_depmap_data.
crispr_effect, _, cnv_data, mutation_data = get_depmap_data(rnai_release="")
# Derive two gene lists from the CRISPR effect index.
# NOTE(review): `tsgs` presumably stands for tumor-suppressor genes; neither
# list is referenced by the other visible cells.
oncogenes, tsgs = get_cancer_gene_lists(crispr_effect.index)

CRISPRGeneEffect.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsCNGene.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsSomaticMutations.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!


In [3]:
%%time
# Note - n_workers should likely be around half the number of CPUs
res = compute_monte_carlo_stats(
    genes_of_interest=["TP53"],
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_iterations=100,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=4,
)

Stats for TP53 computed in 181.38685202598572 - diff is 0.025881591000000204, 247 wt and 344 lof
CPU times: user 292 ms, sys: 167 ms, total: 459 ms
Wall time: 3min 3s


In [5]:
# Preview the first rows of the Monte Carlo result table.
res.head()

Unnamed: 0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_models,n_test,n_wt
TP53,"[0.67920954, 0.6675941400000001, 0.66288408, 0...",0.672657,"[0.6480423999999999, 0.6497969, 0.64489834, 0....",0.646775,0.025882,lof,197,344,247


In [17]:
from scipy import stats

# Read TP53 bootstrap results from CSV and label the unnamed first column "gene".
# NOTE(review): this file is not written by any visible cell, and `tp53_res`
# is not used afterwards - the t-test below is run on `res`. Confirm whether
# this line is still needed.
tp53_res = pd.read_csv("TP53_lof_bootstrap_results.csv").rename(columns={"Unnamed: 0": "gene"})

def perform_t_test(df: pd.DataFrame) -> pd.DataFrame:
    """Run an independent two-sample t-test on every row of ``df``.

    For each row, the values in the ``test_stats`` column are compared with
    the values in the ``wt_stats`` column using ``scipy.stats.ttest_ind``
    with its default settings.

    Args:
        df: Frame with ``test_stats`` and ``wt_stats`` columns. Each cell
            holds a sequence of numbers, or the string form of a Python list
            of numbers (which is what a list-valued column becomes after a
            CSV round trip). If a ``gene`` column is present it labels the
            row; otherwise the row's index label is used.

    Returns:
        A frame with one row per input row and the columns ``gene``,
        ``t_stat`` and ``p_value``. Empty input gives an empty frame that
        still has these three columns.

    Raises:
        KeyError: If ``test_stats`` or ``wt_stats`` is missing.
        ValueError, SyntaxError: If a string cell is not a valid Python
            literal.
    """
    # Local import keeps the function independent of notebook cell order.
    import ast

    def _as_sequence(cell):
        # A string cell is a list that was serialised to text (e.g. by
        # to_csv / read_csv); literal_eval decodes it without executing code.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    # Decide once how rows are labelled instead of re-checking per row.
    has_gene_column = 'gene' in df.columns

    results = []
    for index, row in df.iterrows():
        test_stats = _as_sequence(row['test_stats'])
        wt_stats = _as_sequence(row['wt_stats'])

        t_stat, p_value = stats.ttest_ind(test_stats, wt_stats)

        results.append({
            'gene': row['gene'] if has_gene_column else index,
            't_stat': t_stat,
            'p_value': p_value
        })

    # Explicit columns keep the output schema stable even when df has no rows.
    return pd.DataFrame(results, columns=['gene', 't_stat', 'p_value'])

# Per-gene t-test on the Monte Carlo result table `res`.
ttest = perform_t_test(res)
ttest

Unnamed: 0,gene,t_stat,p_value
0,TP53,31.865259,6.99788e-80


In [21]:
import random
def scramble_column(df: pd.DataFrame, column_name: str = 'ModelID', seed: "int | None" = None) -> pd.DataFrame:
    """Return a copy of ``df`` with the values of one column randomly permuted.

    The set of values in ``column_name`` is preserved; only their assignment
    to rows changes. All other columns and the input frame itself are left
    untouched.

    Args:
        df: Input frame; it is copied, never modified.
        column_name: Name of the column to permute.
        seed: Optional seed. When given, the permutation is reproducible and
            drawn from a private generator, so the global ``random`` state is
            not consumed. When ``None`` (the default), the module-level
            ``random`` state is used, exactly as before.

    Returns:
        A new frame identical to ``df`` except for the permuted column.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    scrambled_df = df.copy()

    # Work on a plain list so the shuffle is independent of the index.
    column_values = scrambled_df[column_name].tolist()

    # A seeded run gets its own generator; an unseeded run keeps using the
    # shared module-level generator.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(column_values)

    # Positional assignment: the shuffled list replaces the column row by row.
    scrambled_df[column_name] = column_values

    return scrambled_df

# Build the mutation table with its default column ('ModelID') shuffled across rows.
scrambled_mutation_data = scramble_column(mutation_data)
scrambled_mutation_data

Unnamed: 0,VariantInfo,HugoSymbol,ModelID
0,MISSENSE,C1QTNF12,ACH-001841
1,MISSENSE,CASZ1,ACH-000379
2,SILENT,CASZ1,ACH-000983
3,MISSENSE,PRAMEF13,ACH-000960
4,MISSENSE,PRAMEF18,ACH-002127
...,...,...,...
1387890,MISSENSE,F9,ACH-000789
1387891,SILENT,MAGEC3,ACH-001632
1387892,MISSENSE,SPANXN3,ACH-000550
1387893,MISSENSE,MT-ND5,ACH-000913


In [22]:
# Same Monte Carlo run as for `res`, but fed the scrambled mutation table.
scrambled_res = compute_monte_carlo_stats(
    # --- what is being tested ---
    genes_of_interest=["TP53"],
    search_mode="lof",
    candidate_models=list(crispr_effect.columns),
    # --- input tables (mutation table is the scrambled copy) ---
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=scrambled_mutation_data,
    # --- scoring function and its settings ---
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    # --- run size / parallelism ---
    n_iterations=100,
    n_workers=4,
)

Stats for TP53 computed in 211.00881266593933 - diff is -0.004651769999999944, 345 wt and 344 lof


In [23]:
# t-test on the scrambled-mutation run.
# NOTE(review): the recorded output for this run is t ~ -8.0 with p ~ 1e-13,
# i.e. the scrambled run also comes out nominally significant. Consider
# comparing effect sizes (the `diff` column) rather than relying on the
# p-value alone - confirm the intended interpretation.
null_ttest = perform_t_test(scrambled_res)
null_ttest

Unnamed: 0,gene,t_stat,p_value
0,TP53,-8.004021,9.99885e-14
