In [53]:
import numpy as np
import pandas as pd
from itertools import product
from scipy.stats import nbinom, gamma, poisson
import scipy.integrate as integrate
from tqdm import tqdm
from univariate_tada import *

# Smoke Test
# Load the tab-delimited mutation table for the smoke test.
# (read_csv with sep="\t" is the same parser that read_table uses by default.)
tada_file = "TADA_smoke_test_counts.txt"
tada_data = pd.read_csv(tada_file, sep="\t")

# Number of families, and of case and control samples, included in the analysis
n_family, n_case, n_ctrl = 4500, 1000, 3000

# One-row table of sample sizes, one column per count type ('dn', 'ca', 'cn').
# The family count is added to both the case and the control sample sizes.
data = {'dn': [n_family], 'ca': [n_family + n_case], 'cn': [n_family + n_ctrl]}
n = pd.DataFrame(data)

# Both classes point at the very same sample-size table object.
sample_counts = dict.fromkeys(('cls1', 'cls2'), n)

# Create the mutational data used by TADA: one table per class, indexed by
# gene id, with columns
#   'dn' = dn.<cls>
#   'ca' = trans.<cls>  + case.<cls>
#   'cn' = ntrans.<cls> + ctrl.<cls>
gene_ids = tada_data['gene.id']

cls1_ca = tada_data['trans.cls1'] + tada_data['case.cls1']
cls1_cn = tada_data['ntrans.cls1'] + tada_data['ctrl.cls1']
cls1_counts = pd.DataFrame({'dn': tada_data['dn.cls1'], 'ca': cls1_ca, 'cn': cls1_cn})
cls1_counts.index = gene_ids

cls2_ca = tada_data['trans.cls2'] + tada_data['case.cls2']
cls2_cn = tada_data['ntrans.cls2'] + tada_data['ctrl.cls2']
cls2_counts = pd.DataFrame({'dn': tada_data['dn.cls2'], 'ca': cls2_ca, 'cn': cls2_cn})
cls2_counts.index = gene_ids

# Per-class count tables, keyed by class name.
tada_counts = {'cls1': cls1_counts, 'cls2': cls2_counts}

# Set up mutation rates: one column per class, taken from the mut.<cls> columns
mu_columns = {'cls1': tada_data['mut.cls1'], 'cls2': tada_data['mut.cls2']}
mu = pd.DataFrame(mu_columns)

# De novo only flag per class (TRUE/FALSE); False for both here because we do
# not want to restrict ourselves to de novo only analyses
denovo_only = pd.DataFrame({cls_name: [False] for cls_name in ('cls1', 'cls2')})

# Set up parameters
cls1_params = pd.DataFrame({'gamma.mean.dn': [20.0],
                            'beta.dn': [1],
                            'gamma.mean.CC': [2.3],
                            'beta.CC': [4.0],
                            'rho1': [0.1],
                            'nu1': [100],
                            'rho0': [0.1],
                            'nu0': [100]})

cls2_params = pd.DataFrame({'gamma.mean.dn': [4.7],
                            'beta.dn': [1],
                            'gamma.mean.CC': [1.0],
                            'beta.CC': [1000],
                            'rho1': [0.15],
                            'nu1': [100],
                            'rho0': [0.15],
                            'nu0': [100]})

hyperpar = {'cls1': cls1_params, 'cls2': cls2_params}
# Running TADA
# The observed-data run and the null run take the same inputs, so the keyword
# arguments are collected once and unpacked into both calls.
tada_inputs = dict(
    tada_counts=tada_counts,
    sample_counts=sample_counts,
    mu=mu,
    hyperpar=hyperpar,
    denovo_only=denovo_only,
)
re_TADA = TADA(**tada_inputs)

# Bayesian FDR control on the combined Bayes factors
re_TADA['qval'] = Bayesian_FDR(re_TADA['BF.total'], pi0=0.95)

# Run permutation to get the null distributions, then use them to turn the
# observed Bayes factors into p-values
re_TADA_null = TADAnull(**tada_inputs, n_rep=100)
re_TADA['pval'] = bayesFactor_pvalue(re_TADA['BF.total'], re_TADA_null['BF_null.total'])

# Top 10 genes based on BF.total (largest first)
ranked_by_bf = re_TADA.sort_values(by='BF.total', ascending=False)
re_TADA = ranked_by_bf.head(10)
print(re_TADA)

CALCULATION OF TADA TEST STATISTICS
checking the input for consistent variable names
working on :: cls1


  0%|          | 0/10 [00:00<?, ?it/s]


working on :: cls2


  0%|          | 0/10 [00:00<?, ?it/s]


CALCULATION OF TADA TEST STATISTICS UNDER THE NULL HYPOTHESIS
working on creating DN table for :: cls1
working on creating DN table for :: cls2
working on creating null data for :: cls1
Progress: 0/10
working on creating null data for :: cls2
Progress: 0/10
                  cls1           cls2      BF.total          qval    pval
gene.id                                                                  
GENE956   2.933410e+05  591202.755311  1.734240e+11  1.095581e-10  0.0005
GENE2602  5.819384e+07       2.822945  1.642780e+08  5.788358e-08  0.0005
GENE4190  1.606000e+05       3.088818  4.960642e+05  1.280526e-05  0.0005
GENE2864  3.175408e+03      20.399743  6.477751e+04  8.291037e-05  0.0005
GENE2403  2.513713e+01      20.738377  5.213032e+02  7.099416e-03  0.0005
GENE1227  9.443343e+00      22.591615  2.133404e+02  1.954561e-02  0.0005
GENE1904  9.510587e+00      18.631253  1.771942e+02  3.058807e-02  0.0005
GENE1896  1.613211e+02       0.832099  1.342352e+02  4.226361e-02  0.0005
GE

In [38]:
# Demo Test
# Load the tab-delimited mutation table for the demo data set.
# (read_csv with sep="\t" is the same parser that read_table uses by default.)
tada_file = "TADA_demo_counts_de-novo_and_inherited.txt"
tada_data = pd.read_csv(tada_file, sep="\t")

# Number of families, and of case and control samples, included in the analysis
n_family, n_case, n_ctrl = 4500, 1000, 3000

# One-row table of sample sizes, one column per count type ('dn', 'ca', 'cn').
# The family count is added to both the case and the control sample sizes.
data = {'dn': [n_family], 'ca': [n_family + n_case], 'cn': [n_family + n_ctrl]}
n = pd.DataFrame(data)

# Both classes point at the very same sample-size table object.
sample_counts = dict.fromkeys(('cls1', 'cls2'), n)

# Create the mutational data used by TADA: one table per class, indexed by
# gene id, with columns
#   'dn' = dn.<cls>
#   'ca' = trans.<cls>  + case.<cls>
#   'cn' = ntrans.<cls> + ctrl.<cls>
gene_ids = tada_data['gene.id']

cls1_ca = tada_data['trans.cls1'] + tada_data['case.cls1']
cls1_cn = tada_data['ntrans.cls1'] + tada_data['ctrl.cls1']
cls1_counts = pd.DataFrame({'dn': tada_data['dn.cls1'], 'ca': cls1_ca, 'cn': cls1_cn})
cls1_counts.index = gene_ids

cls2_ca = tada_data['trans.cls2'] + tada_data['case.cls2']
cls2_cn = tada_data['ntrans.cls2'] + tada_data['ctrl.cls2']
cls2_counts = pd.DataFrame({'dn': tada_data['dn.cls2'], 'ca': cls2_ca, 'cn': cls2_cn})
cls2_counts.index = gene_ids

# Per-class count tables, keyed by class name.
tada_counts = {'cls1': cls1_counts, 'cls2': cls2_counts}

# Set up mutation rates: one column per class, taken from the mut.<cls> columns
mu_columns = {'cls1': tada_data['mut.cls1'], 'cls2': tada_data['mut.cls2']}
mu = pd.DataFrame(mu_columns)

# De novo only flag per class (TRUE/FALSE); False for both here because we do
# not want to restrict ourselves to de novo only analyses
denovo_only = pd.DataFrame({cls_name: [False] for cls_name in ('cls1', 'cls2')})

# Set up parameters
cls1_params = pd.DataFrame({'gamma.mean.dn': [20.0],
                            'beta.dn': [1],
                            'gamma.mean.CC': [2.3],
                            'beta.CC': [4.0],
                            'rho1': [0.1],
                            'nu1': [100],
                            'rho0': [0.1],
                            'nu0': [100]})

cls2_params = pd.DataFrame({'gamma.mean.dn': [4.7],
                            'beta.dn': [1],
                            'gamma.mean.CC': [1.0],
                            'beta.CC': [1000],
                            'rho1': [0.15],
                            'nu1': [100],
                            'rho0': [0.15],
                            'nu0': [100]})

hyperpar = {'cls1': cls1_params, 'cls2': cls2_params}

# Running TADA
# Compute the TADA statistics for the demo data; the result is kept in
# re_TADA for the follow-up FDR / p-value cell.
re_TADA = TADA(
    tada_counts=tada_counts,
    sample_counts=sample_counts,
    mu=mu,
    hyperpar=hyperpar,
    denovo_only=denovo_only,
)


CALCULATION OF TADA TEST STATISTICS
checking the input for consistent variable names
working on :: cls1


  0%|          | 0/5000 [00:00<?, ?it/s]
  2%|▏         | 100/5000 [00:00<00:07, 638.61it/s]
  4%|▍         | 200/5000 [00:00<00:01, 4261.79it/s]
  6%|▌         | 300/5000 [00:00<00:00, 6980.54it/s]
  8%|▊         | 400/5000 [00:00<00:00, 6449.25it/s]
 10%|█         | 500/5000 [00:00<00:00, 6811.32it/s]
 12%|█▏        | 600/5000 [00:00<00:00, 4409.77it/s]
 14%|█▍        | 700/5000 [00:00<00:00, 4817.71it/s]
 16%|█▌        | 800/5000 [00:00<00:00, 6685.18it/s]
 18%|█▊        | 900/5000 [00:00<00:00, 7565.12it/s]
 20%|██        | 1000/5000 [00:00<00:00, 10463.21it/s]
 22%|██▏       | 1100/5000 [00:00<00:00, 19020.92it/s]
 24%|██▍       | 1200/5000 [00:00<00:00, 16180.32it/s]
 26%|██▌       | 1300/5000 [00:00<00:00, 9403.38it/s]
 28%|██▊       | 1400/5000 [00:00<00:00, 26060.71it/s]
 30%|███       | 1500/5000 [00:00<00:00, 17788.10it/s]
 32%|███▏      | 1600/5000 [00:00<00:00, 13458.38it/s]
 34%|███▍      | 1700/5000 [00:00<00:00, 22678.69it/s]
 36%|███▌      | 1800/5000 [00:00<00:01, 181

working on :: cls2


  0%|          | 0/5000 [00:00<?, ?it/s]
  2%|▏         | 100/5000 [00:00<00:07, 617.72it/s]
  4%|▍         | 200/5000 [00:00<00:04, 1177.75it/s]
  6%|▌         | 300/5000 [00:00<00:02, 2093.44it/s]
  8%|▊         | 400/5000 [00:00<00:00, 6812.17it/s]
 10%|█         | 500/5000 [00:00<00:00, 5949.57it/s]
 12%|█▏        | 600/5000 [00:00<00:01, 4175.74it/s]
 14%|█▍        | 700/5000 [00:00<00:01, 2538.94it/s]
 16%|█▌        | 800/5000 [00:00<00:00, 5353.05it/s]
 18%|█▊        | 900/5000 [00:00<00:00, 9145.20it/s]
 20%|██        | 1000/5000 [00:00<00:00, 10078.56it/s]
 22%|██▏       | 1100/5000 [00:00<00:00, 6956.64it/s]
 24%|██▍       | 1200/5000 [00:00<00:00, 12550.22it/s]
 26%|██▌       | 1300/5000 [00:00<00:00, 7267.50it/s]
 28%|██▊       | 1400/5000 [00:00<00:00, 10577.12it/s]
 30%|███       | 1500/5000 [00:00<00:00, 10304.76it/s]
 32%|███▏      | 1600/5000 [00:00<00:00, 16077.68it/s]
 34%|███▍      | 1700/5000 [00:00<00:00, 19947.56it/s]
 36%|███▌      | 1800/5000 [00:00<00:00, 2650

In [46]:
# Post-process the TADA results computed in the previous cell.
#
# This cell reads the full per-gene table `re_TADA` and only ever ADDS columns
# to it ('qval', 'pval'). The top-10 view is stored under its own name so that
# re-running this cell does not shrink `re_TADA` to 10 rows and then recompute
# q-values / p-values on that truncated table.

# Bayesian FDR control
re_TADA['qval'] = Bayesian_FDR(re_TADA['BF.total'], pi0=0.95)

# Run permutation to get the null distributions to use for calculating p-values for TADA
# NOTE(review): the null run is presumably stochastic; consider fixing a random
# seed for reproducible p-values — confirm which RNG TADAnull draws from.
re_TADA_null = TADAnull(tada_counts=tada_counts, sample_counts=sample_counts, mu=mu, hyperpar=hyperpar, denovo_only=denovo_only, n_rep=100)
re_TADA['pval'] = bayesFactor_pvalue(re_TADA['BF.total'], re_TADA_null['BF_null.total'])

# Top 10 genes based on BF.total (largest first); `re_TADA` itself keeps every gene
re_TADA_top10 = re_TADA.sort_values(by='BF.total', ascending=False).head(10)
print(re_TADA_top10)

CALCULATION OF TADA TEST STATISTICS UNDER THE NULL HYPOTHESIS
working on creating DN table for :: cls1
working on creating DN table for :: cls2
working on creating null data for :: cls1
Progress: 0/5000
Progress: 100/5000
Progress: 200/5000
Progress: 300/5000
Progress: 400/5000
Progress: 500/5000
Progress: 600/5000
Progress: 700/5000
Progress: 800/5000
Progress: 900/5000
Progress: 1000/5000
Progress: 1100/5000
Progress: 1200/5000
Progress: 1300/5000
Progress: 1400/5000
Progress: 1500/5000
Progress: 1600/5000
Progress: 1700/5000
Progress: 1800/5000
Progress: 1900/5000
Progress: 2000/5000
Progress: 2100/5000
Progress: 2200/5000
Progress: 2300/5000
Progress: 2400/5000
Progress: 2500/5000
Progress: 2600/5000
Progress: 2700/5000
Progress: 2800/5000
Progress: 2900/5000
Progress: 3000/5000
Progress: 3100/5000
Progress: 3200/5000
Progress: 3300/5000
Progress: 3400/5000
Progress: 3500/5000
Progress: 3600/5000
Progress: 3700/5000
Progress: 3800/5000
Progress: 3900/5000
Progress: 4000/5000
Progre