## Frequency EDA

Testing whether the expression distributions of the test and validation genes differ significantly between the unperturbed data and each knockout condition.
If we find a knockout condition where the distributions are different
and the test gene's mean expression approaches `0`,
then that condition might be a candidate sampling pool for making distributions.

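The comparison uses `scipy.stats.ttest_ind` with `equal_var=False` (Welch's t-test, which does not assume equal population variances); see the `equal_var` parameter in https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html#scipy-stats-ttest-ind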

### Load useful libraries

In [6]:
import scanpy as sc
import scipy
import pandas as pd

from warnings import filterwarnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation and numerical warnings raised by the
# statistics below. Consider narrowing it to specific categories.
filterwarnings('ignore')

# scanpy session settings: logging verbosity, a header listing the installed
# package versions (useful for reproducibility), and figure defaults.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.0 statsmodels==0.13.5 pynndescent==0.5.8


### Read data

In [7]:
def adata_f():
    """Read the training dataset from disk and return it.

    Loads ``code/sc_training.h5ad`` with scanpy and stores a copy of the
    main matrix ``X`` in the ``normalized_logcounts`` layer, so the values
    can later be retrieved by layer name.

    Every call re-reads the file and returns a fresh object.
    """
    adata = sc.read_h5ad('code/sc_training.h5ad')
    # Copy so the layer does not alias X.
    adata.layers['normalized_logcounts'] = adata.X.copy()
    return adata

def data_to_df(data):
    """Return the ``normalized_logcounts`` layer of ``data`` as a DataFrame.

    ``data`` must expose ``to_df(layer=...)``, as the object returned by
    ``adata_f`` does.
    """
    layer_name = "normalized_logcounts"
    frame = data.to_df(layer=layer_name)
    return frame

def unperturbed_data(data):
    """Return the expression table restricted to unperturbed observations.

    Selects the rows of ``data_to_df(data)`` whose ``obs['condition']``
    label equals ``'Unperturbed'``.
    """
    labels = data.obs['condition']
    is_unperturbed = labels == 'Unperturbed'
    keep_index = labels[is_unperturbed].index

    expression = data_to_df(data)
    return expression.filter(keep_index, axis=0)

def conditioned_data(data, condition):
    """Return the expression table restricted to one condition.

    Selects the rows of ``data_to_df(data)`` whose ``obs['condition']``
    label equals ``condition``.
    """
    labels = data.obs['condition']
    matches = labels == condition
    keep_index = labels[matches].index

    expression = data_to_df(data)
    return expression.filter(keep_index, axis=0)

def conditions(data):
    """Return the distinct values of ``data.obs['condition']``."""
    labels = data.obs['condition']
    return labels.unique()

def experiments():
    """Return the set of condition labels other than ``'Unperturbed'``.

    Loads the dataset through ``adata_f`` on every call. Raises ``KeyError``
    if the dataset has no ``'Unperturbed'`` condition.
    """
    knockouts = set(conditions(adata_f()))
    # The control label is not an experiment.
    knockouts.remove('Unperturbed')
    return knockouts

def table_means(table, cols):
    """Return the per-column mean and standard deviation of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Table whose columns are summarised.
    cols : sequence of column labels, or None
        Columns to restrict the summary to. An empty sequence or ``None``
        means "use every column".

    Returns
    -------
    tuple
        ``(table.mean(), table.std())`` computed over the selected columns.
    """
    # Compare the length by value. The previous `is not 0` was an identity
    # test against an int literal, which only works through CPython's
    # small-int caching and triggers a SyntaxWarning on Python 3.8+.
    if cols is not None and len(cols) != 0:
        table = table[cols]
    return table.mean(), table.std()

# Genes of interest, split into the two groups referred to in the notebook
# introduction (test genes and validation genes).
test_genes = [
    "Ets1",
    "Fosb",
    "Mafk",
    "Stat3",
]
validation_genes = [
    "Aqr",
    "Bach2",
    "Bhlhe40",
]

def unperturbed_stats():
    """Return (mean, std) of the test and validation genes in unperturbed cells.

    Loads the dataset through ``adata_f`` on every call.
    """
    genes = test_genes + validation_genes
    unperturbed = unperturbed_data(adata_f())
    return table_means(unperturbed, genes)

def condition_stats(condition):
    """Return (mean, std) of the column named ``condition`` within that condition.

    The rows are the observations labelled ``condition``; the only column
    summarised is the one with the same name, so ``condition`` must also be
    a column of the expression table.
    """
    subset = conditioned_data(adata_f(), condition)
    selected_columns = [condition]
    return table_means(subset, selected_columns)

def stats_between_condition_and_unperturbed(condition):
    """Compare one gene's expression in unperturbed cells against every experiment.

    Parameters
    ----------
    condition : str
        Despite its name, this is the *gene* (expression-table column) to
        analyse, e.g. ``'Aqr'``. A ``KeyError`` is raised if the column does
        not exist.

    Returns
    -------
    pandas.DataFrame
        One row per experiment (every condition label except
        ``'Unperturbed'``) with columns:

        * ``mean``, ``std`` - the gene's mean and standard deviation among
          that experiment's observations;
        * ``statistic``, ``pvalue`` - result of
          ``scipy.stats.ttest_ind(unperturbed, experiment, equal_var=False)``
          (Welch's t-test). A positive statistic means the unperturbed mean
          is the larger one.
    """
    # Read the dataset once and reuse it for every experiment. Loading it
    # inside the loop re-read the h5ad file once per experiment.
    adata = adata_f()
    unpert_data = unperturbed_data(adata)[condition]

    row_labels = []
    rows = []
    for experiment in experiments():
        experiment_data = conditioned_data(adata, experiment)[condition]
        test_results = scipy.stats.ttest_ind(unpert_data, experiment_data, equal_var=False)

        row_labels.append(experiment)
        rows.append([
            experiment_data.mean(),
            experiment_data.std(),
            test_results.statistic,
            test_results.pvalue,
        ])

    # Build the table directly in its final orientation (experiments as rows).
    return pd.DataFrame(rows, index=row_labels, columns=['mean', 'std', 'statistic', 'pvalue'])

## Input: a held-out or test gene name
## Summary: for each experiment, report the mean and std of the input gene
## in the form of a table.
## TODO: filter out experiments that aren't statistically different from the
## unperturbed case. Then we can inspect the remaining experiments
## and try to find knockouts that bring the mean closer to 0.

In [8]:
# Scratch calls used while exploring the helpers (kept for reference, not executed):
#adata = adata_f()
#adata = data_to_df(adata)
#table_means(unperturbed_data(adata_f()), test_genes)
#set(conditions(adata_f()))
#unperturbed_stats()
#df_cond = pd.DataFrame(adata_f().obs['condition'])
#df_cond
# Per-experiment mean/std and Welch t-test of the gene 'Aqr' against unperturbed cells.
stats_between_condition_and_unperturbed('Aqr')

Unnamed: 0,mean,std,statistic,pvalue
Runx3,0.058418,0.184735,0.993283,0.346370
Eomes,0.104666,0.252893,0.187459,0.853800
Il12rb1,0.126749,0.222726,-1.237844,0.216017
Zfp292,0.121745,0.274124,-0.329258,0.742163
Hmgb2,0.162969,0.257277,-1.746091,0.083966
...,...,...,...,...
Litaf,0.098460,0.222431,1.491743,0.136518
Foxo1,0.099023,0.208666,0.787603,0.432934
Tpt1,0.111989,0.199206,0.113687,0.910419
Oxnad1,0.182861,0.287509,-1.765412,0.082680


In [10]:
# Number of experiments, i.e. distinct condition labels excluding 'Unperturbed'.
len(experiments())

66