## Frequency EDA

Testing whether the distributions of the unperturbed data and the knockout data are statistically different for the test and validation genes.
If we find a knockout condition where the two distributions differ
and the test gene approaches `0`,
then that condition might be a candidate sampling pool for building distributions.

### Load useful libraries

In [1]:
import scanpy as sc
import pandas as pd

from warnings import filterwarnings
filterwarnings('ignore')

# Raise scanpy's logging verbosity (NOTE(review): level 3 presumably enables hint messages - confirm against the scanpy docs).
sc.settings.verbosity = 3
# Print the versions of scanpy and its dependencies; they are recorded in this cell's output.
sc.logging.print_header()
# Default figure appearance for plots produced later in the notebook.
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.8.0 pandas==1.5.1 scikit-learn==1.1.2 statsmodels==0.13.2 python-igraph==0.10.2 pynndescent==0.5.7


### Read data

In [31]:
def adata_f(path='../sc_training.h5ad'):
    """Read the training data and expose ``X`` under a named layer.

    Parameters
    ----------
    path : str, optional
        Location of the ``.h5ad`` file to read. Defaults to the training
        file this notebook was written against, so existing zero-argument
        calls keep working.

    Returns
    -------
    The object returned by ``sc.read_h5ad``, with an additional
    ``'normalized_logcounts'`` layer holding a copy of ``X``.

    Notes
    -----
    Every call re-reads the file from disk and returns a fresh object.
    When several helpers need the same data, load it once and pass it on.
    """
    data = sc.read_h5ad(path)
    # Store a copy (not a reference) so later changes to X do not leak into the layer.
    # NOTE(review): presumably X already holds normalized log counts, as the layer name suggests - confirm.
    data.layers['normalized_logcounts'] = data.X.copy()
    return data

def data_to_df(data):
    """Convert the ``'normalized_logcounts'`` layer of ``data`` to a DataFrame.

    ``data`` must provide a ``to_df(layer=...)`` method; its result is
    returned unchanged.
    """
    layer_name = "normalized_logcounts"
    expression_frame = data.to_df(layer=layer_name)
    return expression_frame

def unperturbed_data(data):
    """Return the expression rows of the cells whose condition is ``'Unperturbed'``.

    Parameters
    ----------
    data
        Object with an ``obs['condition']`` column and a
        ``'normalized_logcounts'`` layer (as produced by ``adata_f``).

    Returns
    -------
    DataFrame restricted to the unperturbed cells.
    """
    # Same selection as any other condition; reuse the shared helper rather
    # than keeping a second copy of the filtering logic.
    return conditioned_data(data, 'Unperturbed')

def conditioned_data(data, condition):
    """Return the expression rows of the cells labelled with ``condition``.

    Parameters
    ----------
    data
        Object with an ``obs['condition']`` column and a
        ``'normalized_logcounts'`` layer (as produced by ``adata_f``).
    condition : str
        Value of ``obs['condition']`` to select.

    Returns
    -------
    DataFrame containing only the rows of the matching cells, in their
    original order. Empty (but with all columns) when nothing matches.
    """
    # Boolean mask over the cells. It is converted to a plain array so the
    # selection is positional and does not go through label lookups.
    # Assumes the frame from data_to_df has one row per entry of data.obs,
    # in the same order - TODO confirm against AnnData.to_df.
    is_condition = (data.obs['condition'] == condition).to_numpy()
    return data_to_df(data).loc[is_condition]

def conditions(data):
    """Return the distinct values found in ``data.obs['condition']``.

    The values are returned in order of first appearance, as produced by
    the column's ``unique()`` method.
    """
    condition_column = data.obs['condition']
    return condition_column.unique()

def experiments():
    """Return the set of conditions in the training data, excluding the control.

    Loads the data via ``adata_f()`` and collects every distinct condition.

    Raises
    ------
    KeyError
        If ``'Unperturbed'`` is not among the conditions.
    """
    knockouts = {label for label in conditions(adata_f())}
    # The control is not an experiment; remove() deliberately fails loudly if it is absent.
    knockouts.remove('Unperturbed')
    return knockouts

def table_means(table, cols):
    """Return the per-column mean and standard deviation of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to summarise.
    cols : sequence of str or None
        Columns to restrict the summary to. An empty sequence (or ``None``)
        means "use every column".

    Returns
    -------
    tuple
        ``(means, stds)`` as returned by ``table.mean()`` and ``table.std()``.

    Raises
    ------
    KeyError
        If a requested column is missing from ``table``.
    """
    # Compare by value: the previous `is not 0` tested object identity, which
    # only worked through CPython's small-int caching and triggers a
    # SyntaxWarning on Python 3.8+.
    if cols is not None and len(cols) != 0:
        table = table[cols]
    return table.mean(), table.std()

# Genes referred to as the "test" set in this notebook.
test_genes = [
    "Ets1",
    "Fosb",
    "Mafk",
    "Stat3",
]
# Genes referred to as the "validation" set in this notebook.
validation_genes = [
    "Aqr",
    "Bach2",
    "Bhlhe40",
]

def unperturbed_stats():
    """Return ``(means, stds)`` of the test and validation genes in unperturbed cells.

    The data is loaded via ``adata_f()`` on every call.
    """
    baseline_table = unperturbed_data(adata_f())
    genes_of_interest = test_genes + validation_genes
    return table_means(baseline_table, genes_of_interest)

def condition_stats(condition):
    """Return ``(means, stds)`` of the column named ``condition`` among the cells with that condition.

    The data is loaded via ``adata_f()`` on every call. The condition name
    doubles as the column to summarise.
    """
    knockout_table = conditioned_data(adata_f(), condition)
    target_columns = [condition]
    return table_means(knockout_table, target_columns)

def stats_between_condition_and_unperturbed(condition):
    """Compare the ``condition`` column between unperturbed and conditioned cells.

    Parameters
    ----------
    condition : str
        Value of ``obs['condition']`` to compare against the control. It is
        also used as the name of the column whose statistics are reported.

    Returns
    -------
    tuple
        ``((baseline_mean, baseline_std), (condition_mean, condition_std))``
        for the column named ``condition``.

    Raises
    ------
    KeyError
        If there is no column named ``condition`` in the expression table.
    """
    # Load once and share between both selections; neither helper mutates it.
    data = adata_f()
    # Compute the baseline for the condition column itself. The previous
    # version looked it up in unperturbed_stats(), which only covers the
    # test/validation genes and therefore raised KeyError for any other condition.
    baseline_means, baseline_std = table_means(unperturbed_data(data), [condition])
    condition_means, condition_std = table_means(conditioned_data(data, condition), [condition])
    return (
        (baseline_means[condition], baseline_std[condition]),
        (condition_means[condition], condition_std[condition]),
    )


In [34]:
# Load the data, then rebind `adata` to its DataFrame view of the 'normalized_logcounts' layer.
# NOTE(review): after the next line `adata` is a DataFrame, not the loaded object - a distinct name would be clearer.
adata = adata_f()
adata = data_to_df(adata)
# Scratch checks kept from exploration (not executed):
#table_means(unperturbed_data(adata_f()), test_genes)
#set(conditions(adata_f()))
#unperturbed_stats()
# The loaded object was overwritten above, so the file is read again here to reach `.obs`.
df_cond = pd.DataFrame(adata_f().obs['condition'])
# Display the per-cell condition labels.
df_cond

Unnamed: 0,condition
053l1_AAACCTGAGATGTCGG-1,Unperturbed
053l1_AAACCTGAGCAACGGT-1,Tox2
053l1_AAACCTGAGTACGACG-1,Tpt1
053l1_AAACCTGAGTCGTTTG-1,Tox2
053l1_AAACCTGAGTGAAGAG-1,Tcf7
...,...
053l4_TTTGTCATCAGGTTCA-1,Tox2
053l4_TTTGTCATCAGTGTTG-1,Dvl2
053l4_TTTGTCATCCTCGCAT-1,Zeb2
053l4_TTTGTCATCTTCAACT-1,Sox4


In [4]:
# Display the summary of the loaded object (dimensions, obs columns, layers).
adata_f()

AnnData object with n_obs × n_vars = 28697 × 15077
    obs: 'gRNA_maxID', 'state', 'condition', 'lane'
    layers: 'rawcounts', 'normalized_logcounts'