## Frequency EDA

Testing whether the distributions of unperturbed data and knockout data are statistically different for the test and validation genes.
If we find a knockout condition where the two distributions differ
and the mean expression of the test gene approaches `0`,
then that condition might be a candidate sampling pool for making distributions.

### Load useful libraries

In [6]:
import scanpy as sc
import scipy
import pandas as pd

from warnings import filterwarnings
# Silence every Python warning for the rest of the session.
filterwarnings('ignore')

# scanpy logging verbosity; NOTE(review): level 3 presumably enables hints — confirm against scanpy docs.
sc.settings.verbosity = 3
# Print the versions of scanpy and its main dependencies (shown in the cell output below).
sc.logging.print_header()
# Default figure parameters: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.0 statsmodels==0.13.5 pynndescent==0.5.8


### Read data

In [54]:
def adata_f(path='sc_training.h5ad'):
    """Read the training data from disk and expose its matrix as a named layer.

    Every call re-reads the file, so callers that need the data several
    times should keep the returned object instead of calling this again.

    Args:
        path: Location of the ``.h5ad`` file. Defaults to
            ``'sc_training.h5ad'`` so existing zero-argument calls are
            unaffected.

    Returns:
        The object returned by ``sc.read_h5ad`` with an additional
        ``'normalized_logcounts'`` layer holding a copy of ``X``.
    """
    data = sc.read_h5ad(path)
    # Store a copy so the layer is independent of later changes to ``X``.
    data.layers['normalized_logcounts'] = data.X.copy()
    return data

def data_to_df(data):
    """Return the ``normalized_logcounts`` layer of ``data`` via ``data.to_df``."""
    layer_name = "normalized_logcounts"
    frame = data.to_df(layer=layer_name)
    return frame

def unperturbed_data(data):
    """Return the expression rows whose ``condition`` label is ``'Unperturbed'``.

    The row labels are looked up in ``data.obs['condition']`` and then used
    to select rows from the table produced by ``data_to_df``.
    """
    labels = data.obs['condition']
    is_unperturbed = labels == 'Unperturbed'
    wanted_rows = labels[is_unperturbed].index

    expression = data_to_df(data)
    return expression.filter(wanted_rows, axis=0)

def conditioned_data(data, condition):
    """Return the expression rows whose ``condition`` label equals ``condition``.

    Args:
        data: Object providing ``obs['condition']`` and accepted by ``data_to_df``.
        condition: Label to match against ``data.obs['condition']``.
    """
    labels = data.obs['condition']
    matching_rows = labels[labels == condition].index

    expression = data_to_df(data)
    return expression.filter(matching_rows, axis=0)

def conditions(data):
    """Return the distinct values found in ``data.obs['condition']``."""
    condition_column = data.obs['condition']
    return condition_column.unique()

def experiments():
    """Return the set of condition labels other than ``'Unperturbed'``.

    The data is loaded with ``adata_f()`` on every call. A ``KeyError`` is
    raised if ``'Unperturbed'`` is not among the labels.
    """
    loaded = adata_f()
    knockout_labels = set(conditions(loaded))
    # ``remove`` (not ``discard``) so a missing control group is reported loudly.
    knockout_labels.remove('Unperturbed')
    return knockout_labels

def table_means(table, cols):
    """Return the per-column mean and standard deviation of ``table``.

    Args:
        table: A pandas DataFrame.
        cols: Column labels to restrict the computation to. ``None`` or an
            empty sequence means "use every column".

    Returns:
        A ``(mean, std)`` tuple as produced by ``table.mean()`` and
        ``table.std()``.
    """
    # Compare by value: ``is not 0`` tests object identity, which only works
    # by accident of integer caching and triggers a SyntaxWarning.
    # ``len(...)`` is used rather than truthiness so array-like ``cols`` work.
    if cols is not None and len(cols) > 0:
        table = table[cols]
    return table.mean(), table.std()

# Genes treated as the "test" set in this analysis.
test_genes = [
    "Ets1",
    "Fosb",
    "Mafk",
    "Stat3",
]
# Genes treated as the "validation" set in this analysis.
validation_genes = [
    "Aqr",
    "Bach2",
    "Bhlhe40",
]

def unperturbed_stats():
    """Return ``(mean, std)`` of the test and validation genes in unperturbed rows."""
    genes_of_interest = test_genes + validation_genes
    baseline = unperturbed_data(adata_f())
    return table_means(baseline, genes_of_interest)

def condition_stats(condition):
    """Return ``(mean, std)`` of column ``condition`` within the rows labelled ``condition``.

    The same name is used both as the condition label to select rows and as
    the column to summarise.
    """
    rows_for_condition = conditioned_data(adata_f(), condition)
    summarised_columns = [condition]
    return table_means(rows_for_condition, summarised_columns)

def stats_between_experiments_and_unperturbed(condition):
    """Compare one gene's expression in every experiment against the unperturbed rows.

    For each condition label other than ``'Unperturbed'`` this computes the
    mean and standard deviation of column ``condition`` and runs Welch's
    t-test (``equal_var=False``) with the unperturbed values as the first
    sample, so a positive statistic means the unperturbed mean is higher.

    Args:
        condition: Name of the column (gene) to compare.

    Returns:
        A DataFrame indexed by experiment label with the columns
        ``'mean'``, ``'std'``, ``'statistic'`` and ``'pvalue'``. Rows follow
        the order in which the labels first appear in the data.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to make ``scipy.stats`` available as an attribute.
    from scipy import stats as scipy_stats

    # Load the file once; previously it was re-read for every experiment.
    data = adata_f()
    unpert_data = unperturbed_data(data)[condition]

    # A list (not a set) keeps the row order stable between runs.
    knockouts = [label for label in conditions(data) if label != 'Unperturbed']

    means = []
    stds = []
    statistics = []
    pvalues = []
    for experiment in knockouts:
        experiment_data = conditioned_data(data, experiment)[condition]
        means.append(experiment_data.mean())
        stds.append(experiment_data.std())

        test_results = scipy_stats.ttest_ind(unpert_data, experiment_data, equal_var=False)
        statistics.append(test_results.statistic)
        pvalues.append(test_results.pvalue)

    return pd.DataFrame(
        {'mean': means, 'std': stds, 'statistic': statistics, 'pvalue': pvalues},
        index=knockouts,
    )

def experiments_connected_to_condition(condition, pvalue = 0.05):
    """Return the experiments that significantly lower the given gene.

    Rows from ``stats_between_experiments_and_unperturbed(condition)`` are
    kept when their p-value is at most ``pvalue`` and their statistic is
    positive.

    Args:
        condition: Name of the column (gene) to analyse.
        pvalue: Inclusive upper bound on the p-value of the rows kept.
    """
    table = stats_between_experiments_and_unperturbed(condition)
    is_significant = table['pvalue'] <= pvalue
    is_positive = table['statistic'] > 0
    return table[is_significant & is_positive]

## Input: a held-out or test gene name.
## Summary: for each experiment, report the mean and std of the input gene
## in the form of a table. Experiments that aren't statistically
## different from the unperturbed case are filtered out. We can then inspect
## the table and try to find knockouts that bring the mean closer to 0.

In [55]:
experiments_connected_to_condition('Aqr')

Unnamed: 0,mean,std,statistic,pvalue
Tbx21,0.0,0.0,35.964568,5.272099e-252
Crem,0.092276,0.213689,2.888923,0.003945495
Egr1,0.08745,0.196261,2.193383,0.02915753
Tcf3,0.075167,0.168784,2.275131,0.02517378


In [56]:
experiments_connected_to_condition('Bach2')

Unnamed: 0,mean,std,statistic,pvalue
Tbx21,0.0,0.0,18.416083,2.5387110000000002e-73
Hif1a,0.032368,0.123687,2.253428,0.0244331
Stat4,0.00678,0.037135,5.217717,7.30511e-06
Arid5b,0.025771,0.10036,5.869557,4.566689e-09
Zeb2,0.026996,0.123181,3.575069,0.0003613671
Tox2,0.024655,0.105943,6.791898,1.181264e-11
Rps6,0.0,0.0,18.416083,2.5387110000000002e-73
Dvl2,0.035112,0.129077,2.509317,0.01212575


In [57]:
experiments_connected_to_condition('Bhlhe40')

Unnamed: 0,mean,std,statistic,pvalue
Hif1a,1.015748,0.521279,3.132636,0.001785148
Arid5b,0.965486,0.50285,8.840025,1.262089e-18
Batf,0.351554,0.444193,4.03275,0.009902928
Zeb2,0.965038,0.538449,5.849132,6.290361e-09
Rela,0.931651,0.648445,2.230837,0.02807145
Dvl2,0.992108,0.520352,6.505996,8.511121e-11
Sox4,1.011131,0.520184,2.426951,0.01567804


In [58]:
experiments_connected_to_condition('Ets1')

Unnamed: 0,mean,std,statistic,pvalue
Nr4a2,0.558102,0.424003,4.496182,7.796397e-06
Nr3c1,0.493852,0.40175,6.574002,1.343794e-10
Arid5b,0.506235,0.373989,12.897494,1.505429e-37
Sub1,0.484603,0.464401,3.750917,0.0002568453
Ikzf3,0.532851,0.429557,5.416889,8.242354e-08
Tox2,0.577297,0.414737,6.511863,7.806109e-11
Prdm1,0.576235,0.45853,2.503418,0.01265479
Sp140,0.44072,0.464967,2.418768,0.02135373
Ldhb,0.521945,0.428573,3.658962,0.0003197123
Foxm1,0.495445,0.511259,2.064467,0.04361154


In [59]:
experiments_connected_to_condition('Fosb')

Unnamed: 0,mean,std,statistic,pvalue
Runx3,0.0,0.0,22.575824,1.595712e-107
Tbx21,0.0,0.0,22.575824,1.595712e-107
Rad21,0.0,0.0,22.575824,1.595712e-107
Hif1a,0.046253,0.155234,2.989608,0.002858527
Arid5b,0.031397,0.114299,9.256861,2.722823e-20
Arid4b,0.050296,0.175196,2.086515,0.03722055
Batf,0.0,0.0,22.575824,1.595712e-107
Zeb2,0.050905,0.15909,2.450729,0.01437786
Ikzf3,0.042602,0.145387,3.457086,0.0005731988
Tox2,0.04485,0.1476,5.745544,9.461158e-09


In [60]:
experiments_connected_to_condition('Mafk')

Unnamed: 0,mean,std,statistic,pvalue
Eomes,0.012506,0.050025,2.1318,0.04906084
Tbx21,0.0,0.0,19.264881,8.014496e-80
Dkk3,0.009156,0.063437,3.236242,0.002112664
Yy1,0.0,0.0,19.264881,8.014496e-80
Batf,0.0,0.0,19.264881,8.014496e-80
Id3,0.016719,0.074081,2.051942,0.04562498
Klf2,0.0,0.0,19.264881,8.014496e-80
Rps6,0.0,0.0,19.264881,8.014496e-80
Dvl2,0.032587,0.114516,2.167232,0.03026277
Sp140,0.009635,0.055348,3.034083,0.004530051


In [61]:
experiments_connected_to_condition('Stat3')

Unnamed: 0,mean,std,statistic,pvalue
Il12rb1,1.053009,0.533647,4.871935,1.255577e-06
Tbx21,0.662224,0.046427,14.447332,0.03364274
Nr4a2,0.993946,0.514508,7.335612,4.957991e-13
Hif1a,1.083836,0.488034,3.237687,0.001247866
Satb1,1.094992,0.568521,2.043566,0.0414349
Nr4a3,1.089278,0.556842,2.477624,0.01345219
P2rx7,0.88497,0.621416,2.777435,0.008116681
Fzd6,1.035934,0.559837,2.798747,0.005601534
Crem,1.099808,0.593333,2.157528,0.03121243
Tox2,1.083316,0.481356,6.234451,4.730794e-10


1.1490169