In [28]:
from typing import Any, Tuple, Optional
import numpy as np
import pandas as pd
from itertools import product
import torch
import collections
from scipy.stats import nbinom, gamma, poisson, hypergeom
import scipy.integrate as integrate
import os
import copy
import multiprocessing
import uuid
from math import exp, log

import matplotlib.pyplot as plt
from tqdm import tqdm

# Based on original TADA software (He et al. 2013) and rewrite (Klei 2015)

# Set seed for reproducibility: this seeds NumPy's legacy global RNG, which is
# the generator behind the np.random.poisson / np.random.hypergeometric draws
# made in permute_gene below.
np.random.seed(100)

def TADA(tada_counts, sample_counts, mu, hyperpar, denovo_only, mu_frac=1, pi_gene=1):
    """Genome-wide application of TADA for K classes of variants.

    Parameters
    ----------
    tada_counts : dict of pandas.DataFrame
        One frame per variant class with the count columns 'dn', 'ca' and 'cn'
        (de novo, case and control counts), one row per gene.
    sample_counts : dict of pandas.DataFrame
        One frame per variant class with the total sample counts for 'dn'
        (# trios), 'ca' (# cases + # trios) and 'cn' (# controls + # trios).
    mu : pandas.DataFrame
        One column of per-gene mutation rates per variant class.
    hyperpar : dict of pandas.DataFrame
        One frame per variant class with entries gamma.mean.dn, beta.dn,
        gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0.
    denovo_only : pandas.DataFrame
        One Boolean per variant class: True to use only the de novo counts,
        False to combine de novo and case-control counts.
    mu_frac : scalar or pandas.DataFrame, default 1
        Fraction of the mutation rate to use per variant class. A scalar is
        expanded to a one-row frame with one column per class.
    pi_gene : scalar or pandas.DataFrame, default 1
        Estimated fraction of causal variants per gene and class, used to set
        the gene-specific case-control relative risk. A scalar is expanded to
        a (genes x classes) frame.

    Returns
    -------
    pandas.DataFrame
        One row per gene with the Bayes factor of each variant class plus
        their product in the column 'BF.total'.
    str
        When the class names or the count columns are inconsistent, an error
        message is returned (not raised); callers relying on that keep working.
    """
    required_columns = ('dn', 'ca', 'cn')

    # Make sure every list and dataframe has the same elements
    print("CALCULATION OF TADA TEST STATISTICS")
    print("checking the input for consistent variable names")
    mutation_types = list(tada_counts.keys())
    n_mutation = len(mutation_types)
    n_gene = len(tada_counts[mutation_types[0]])

    # Expand scalar mu_frac / pi_gene so they can be indexed per variant class
    if not isinstance(mu_frac, pd.DataFrame):
        mu_frac = pd.DataFrame([[mu_frac] * n_mutation], columns=mutation_types)

    if not isinstance(pi_gene, pd.DataFrame):
        pi_gene = pd.DataFrame(np.full((n_gene, n_mutation), pi_gene), columns=mutation_types)

    # `in` tests dict keys or DataFrame columns, so this is a pure name check
    named_inputs = (mu, mu_frac, hyperpar, pi_gene, denovo_only, sample_counts)
    if any(mutation not in container
           for container in named_inputs
           for mutation in mutation_types):
        return "mismatch in names for the different variables"

    for mutation in mutation_types:
        if not all(column in tada_counts[mutation] for column in required_columns):
            return f"columns of {mutation} do not match the required 'dn' 'ca' 'cn'"

    # Collect each class's per-gene Bayes factors in a list and build the
    # result frame once, instead of re-allocating with np.append / pd.concat
    # on every iteration.
    bf_columns = {}
    for mutation in mutation_types:
        print(f"working on :: {mutation}")
        counts = tada_counts[mutation]
        per_gene = [
            calculate_BF(i, counts, sample_counts[mutation], mu[mutation], mu_frac[mutation],
                         hyperpar[mutation], denovo_only[mutation].item(), pi_gene[mutation])
            for i in range(len(counts))
        ]
        # calculate_BF may hand back one-element arrays; flatten to one value per gene
        bf_columns[mutation] = (
            np.concatenate([np.ravel(bf) for bf in per_gene]) if per_gene else np.array([])
        )

    BF = pd.DataFrame(bf_columns, columns=mutation_types)
    # Rows are labelled with the gene index of the last variant class, named
    # explicitly rather than through a variable leaked out of the loop.
    BF.index = tada_counts[mutation_types[-1]].index

    # Calculate the overall BF (product over classes, evaluated in log space)
    BF_total = np.exp(np.log(BF).sum(axis=1))
    BF['BF.total'] = BF_total
    return BF

def TADAnull(tada_counts, sample_counts, mu, hyperpar, denovo_only, mu_frac=1, n_rep=100, dn_max=20, ca_max=200, cn_max=200, max_gap=50):
    """Simulate TADA Bayes factors under the null hypothesis for K variant classes.

    The simulated null distribution can be used to derive approximate
    p-values for the observed Bayes factors.

    Parameters
    ----------
    tada_counts : dict of pandas.DataFrame
        One frame per variant class with count columns 'dn', 'ca' and 'cn'.
        The frames are not modified; the case+control total ("Ncc") needed by
        the permutation step is added to private copies.
    sample_counts : dict of pandas.DataFrame
        One frame per variant class with total sample counts 'dn' (# trios),
        'ca' (# cases + # trios) and 'cn' (# controls + # trios).
    mu : pandas.DataFrame
        One column of per-gene mutation rates per variant class.
    hyperpar : dict of pandas.DataFrame
        One frame per variant class with entries gamma.mean.dn, beta.dn,
        gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0.
    denovo_only : pandas.DataFrame
        One Boolean per variant class: True to use only de novo counts.
    mu_frac : scalar or pandas.DataFrame, default 1
        Fraction of the mutation rate to use per variant class.
    n_rep : int, default 100
        Number of null replicates per gene; recommended to be at least 100
        and larger when there are few genes.
    dn_max : int, default 20
        The de novo Bayes factors for 0..dn_max events are pre-computed per gene.
    ca_max, cn_max : int, default 200
        Upper bounds on the size of the pre-computed case-control table. The
        table never grows beyond the largest observed case+control count,
        because a permuted count cannot exceed its gene's total.
    max_gap : int, default 50
        Accepted for backward compatibility. The gap heuristic it controlled
        was computed but immediately overwritten in the previous
        implementation, so it never influenced the table size; it is unused.

    Returns
    -------
    dict
        'BF_null': DataFrame with one column per variant class and
        (genes x n_rep) rows; 'BF_null.total': Series with the product of
        the class columns per row.
    """
    print("CALCULATION OF TADA TEST STATISTICS UNDER THE NULL HYPOTHESIS")
    mutation_types = list(tada_counts.keys())
    n_mutation = len(mutation_types)

    # Make sure mu_frac is a data frame
    if not isinstance(mu_frac, pd.DataFrame):
        mu_frac = pd.DataFrame([[mu_frac] * n_mutation], columns=mutation_types)

    # Private copies carrying Ncc = ca + cn, so the caller's frames are left untouched
    counts_with_ncc = {}
    mu_rates = {}
    for mutation in mutation_types:
        counts = tada_counts[mutation].copy()
        counts["Ncc"] = counts[["ca", "cn"]].sum(axis=1)
        counts_with_ncc[mutation] = counts
        # positional read: mu_frac holds a single fraction per class
        mu_rates[mutation] = mu[mutation] * mu_frac[mutation].iloc[0]

    # Pre-compute the bayes factors for the denovo data (genes x 0..dn_max events)
    table_BF_dn = {}
    for mutation in mutation_types:
        print(f"working on creating DN table for :: {mutation}")
        x = np.arange(dn_max + 1)
        param = hyperpar[mutation]
        n_dn = sample_counts[mutation]['dn'].item()
        BF = np.column_stack([
            bayes_factor_dn(x_dn, n_dn=n_dn, mu=mu_rates[mutation],
                            gamma_dn=param["gamma.mean.dn"].item(), beta_dn=param["beta.dn"].item())
            for x_dn in x
        ])
        table_BF_dn[mutation] = pd.DataFrame(BF, columns=[f"X{value}" for value in x])

    # Pre-compute the case-control bayes factors for classes that use them
    table_BF_cc = {}
    for mutation in mutation_types:
        if denovo_only[mutation].item():
            continue
        print(f"working on creating CC table for :: {mutation}")
        ncc = counts_with_ncc[mutation]["Ncc"]
        largest_ncc = int(ncc.max()) if len(ncc) else 0
        # A permuted case or control count is at most the gene's Ncc, so cells
        # beyond the largest observed Ncc can never be looked up.
        n_ca = min(ca_max, largest_ncc)
        n_cn = min(cn_max, largest_ncc)
        x = pd.DataFrame(list(product(range(n_ca + 1), range(n_cn + 1))), columns=["ca", "cn"])
        param = hyperpar[mutation]
        n = sample_counts[mutation][["ca", "cn"]]
        BF = np.column_stack([
            bayes_factor_cc(x.loc[i], n_cc=n, gamma_cc=param["gamma.mean.CC"], beta_cc=param["beta.CC"],
                            rho1=param["rho1"], nu1=param["nu1"], rho0=param["rho0"], nu0=param["nu0"])
            for i in range(len(x))
        ])
        # product() iterates ca in the outer loop, so rows are ca and columns are cn
        BF = np.reshape(BF, (n_ca + 1, n_cn + 1))
        table_BF_cc[mutation] = pd.DataFrame(BF)

    # Simulate the null Bayes factors: one flat (genes * n_rep) column per class
    null_columns = {}
    for mutation in mutation_types:
        print("working on creating null data for ::", mutation)
        counts = counts_with_ncc[mutation]
        per_gene = [
            permute_gene(i, mu_rate=mu_rates[mutation], counts=counts,
                         n=sample_counts[mutation], n_rep=n_rep, param=hyperpar[mutation],
                         denovo_only=denovo_only[mutation],
                         # de novo-only classes have no case-control table
                         table_cc=table_BF_cc.get(mutation), table_dn=table_BF_dn[mutation])
            for i in range(len(counts))
        ]
        null_columns[mutation] = np.ravel(per_gene)

    BF = pd.DataFrame(null_columns, columns=mutation_types)
    BF.index = np.arange(len(BF))
    BF_total = np.exp(np.log(BF).sum(axis=1))

    return {'BF_null': BF, 'BF_null.total': BF_total}


def calculate_BF(i_gene, counts, n, mu, mu_frac, hyperpar, denovo_only, pi_gene):
    """Compute the Bayes factor of one gene for one variant class.

    Parameters
    ----------
    i_gene : int
        Zero-based position of the gene of interest.
    counts : pandas.DataFrame
        Counts for the variant class, with columns dn, ca, cn (one row per gene).
    n : pandas.DataFrame
        Total sample counts with entries for dn, ca, cn.
    mu : vector
        Mutation rates of the variant class, one per gene.
    mu_frac : scalar or one-element vector
        Fraction to multiply the mutation rate with.
    hyperpar : pandas.DataFrame
        Entries gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1,
        rho0, nu0 for the variant class. It is copied, not modified.
    denovo_only : bool
        True to use only the de novo contribution, False to combine the
        de novo and case-control contributions.
    pi_gene : vector
        Estimated fraction of causal variants per gene, used to set the
        gene-specific case-control relative risk.

    Returns
    -------
    The Bayes factor returned by bayes_factor for this gene.
    """
    def _at_position(vector, position):
        # The counts row is read with .iloc, so the matching mu / pi_gene
        # entries must be read by position as well; plain [] on a Series is a
        # label lookup and selects a different (or missing) gene whenever the
        # index is not 0..n-1.
        return vector.iloc[position] if hasattr(vector, "iloc") else vector[position]

    n_genes = len(counts)
    pi_i = _at_position(pi_gene, i_gene)

    ### set the hyper parameters for this gene
    hyperpar_gene = hyperpar.copy()
    # keep gamma.mean.CC * beta.CC fixed while the mean relative risk is
    # shrunk towards 1 according to pi_gene
    RR_product = hyperpar_gene['gamma.mean.CC'] * hyperpar_gene['beta.CC']
    hyperpar_gene['gamma.mean.CC'] = hyperpar_gene['gamma.mean.CC'] * pi_i + (1 - pi_i)
    hyperpar_gene['beta.CC'] = RR_product / hyperpar_gene['gamma.mean.CC']

    # Determine the BAYES factor
    BF = bayes_factor(x=counts.iloc[i_gene, :], n=n, mu=_at_position(mu, i_gene) * mu_frac,
                      param=hyperpar_gene, denovo_only=denovo_only)

    # Report progress every 100 genes and on the last gene. The last position
    # is n_genes - 1 (the previous `== len(counts)` test could never be true).
    # The bar is opened only after the work succeeded and is always closed.
    if i_gene % 100 == 0 or i_gene == n_genes - 1:
        with tqdm(total=n_genes) as pbar:
            pbar.update(i_gene + 1)

    return BF

def bayes_factor(x, n, mu, param, denovo_only):
    """Bayes factor of a gene, combining de novo and case-control evidence.

    x: counts (dn, ca, cn) in de novo, cases and controls
    n: sample sizes (dn, ca, cn)
    mu: mutation rate of the gene for this class of variants
    param: hyper parameters (gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC,
           rho1, nu1, rho0, nu0)
    denovo_only: True to use only the de novo contribution, False to multiply
                 in the case-control contribution as well

    Priors:
      RR in de novo data:  Gamma(gamma.mean.dn * beta.dn, beta.dn)
      RR in C/C data:      Gamma(gamma.mean.CC * beta.CC, beta.CC)
      q | H1:              Gamma(rho1, nu1)
      q | H0:              Gamma(rho0, nu0)
    """
    # Evidence from de novo variants in families
    dn_factor = bayes_factor_dn(
        x_dn=x['dn'],
        n_dn=n['dn'],
        mu=mu,
        gamma_dn=param['gamma.mean.dn'],
        beta_dn=param['beta.dn'],
    )

    # Evidence from variants in cases and controls; neutral (1) when skipped
    cc_factor = 1 if denovo_only else bayes_factor_cc(
        x_cc=x[['ca', 'cn']],
        n_cc=n[['ca', 'cn']],
        gamma_cc=param['gamma.mean.CC'],
        beta_cc=param['beta.CC'],
        rho1=param['rho1'],
        nu1=param['nu1'],
        rho0=param['rho0'],
        nu0=param['nu0'],
    )

    # The two sources of evidence combine multiplicatively
    return dn_factor * cc_factor

def bayes_factor_dn(x_dn, n_dn, mu, gamma_dn, beta_dn):
    """Bayes factor of the de novo counts of a gene.

    x_dn: the de novo count
    n_dn: the sample size (number of families)
    mu: the mutation rate (of this type of mutational events)
    gamma_dn, beta_dn: prior on the relative risk,
                       gamma ~ Gamma(gamma_dn * beta_dn, beta_dn)
    """
    # Expected number of de novo events under the null model
    expected_null = 2 * n_dn * mu

    # Under the alternative the count is negative binomial
    success_prob = beta_dn / (beta_dn + expected_null)
    likelihood_alt = nbinom.pmf(x_dn, gamma_dn * beta_dn, success_prob)

    # Under the null the count is Poisson
    likelihood_null = poisson.pmf(x_dn, expected_null)

    return likelihood_alt / likelihood_null


def bayes_factor_cc(x_cc, n_cc, gamma_cc, beta_cc, rho1, nu1, rho0, nu0):
    """Bayes factor of the case-control data.

    x_cc: the count data (entries "ca" and "cn")
    n_cc: the sample sizes (entries "ca" and "cn")
    gamma_cc, beta_cc: prior on RR, gamma ~ Gamma(gamma_cc * beta_cc, beta_cc)
    rho1, nu1: prior on q | H1, Gamma(rho1, nu1)
    rho0, nu0: prior on q | H0, Gamma(rho0, nu0)

    The result is the product of the control and the case contributions.
    """
    null_evidence = evidence_null_cc(x_cc, n_cc, rho0, nu0)
    alt_evidence = evidence_alt_cc(x_cc, n_cc, gamma_cc, beta_cc, rho1, nu1)

    # Evidence ratio per group: controls ("cn") and cases ("ca")
    ratio = {
        group: alt_evidence[group] / null_evidence[group]
        for group in ("cn", "ca")
    }
    return ratio["cn"] * ratio["ca"]

def evidence_null_cc(x_cc, n_cc, rho0, nu0):
    """Model evidence of the case-control data under H0: P(x_1, x_0 | H_0).

    x_cc: the count data (entries "ca" and "cn")
    n_cc: the sample sizes (entries "ca" and "cn")
    rho0, nu0: prior on q | H0, Gamma(rho0, nu0)

    Returns a dict with the control ("cn"), case ("ca") and joint ("total")
    evidence.
    """
    controls = x_cc["cn"]
    cases = x_cc["ca"]
    rate_with_controls = nu0 + n_cc["cn"]

    # Controls first, then cases conditional on the control count
    log_ctrl = np.log(nbinom.pmf(controls, rho0, nu0 / rate_with_controls))
    log_case = np.log(
        nbinom.pmf(cases, rho0 + controls, rate_with_controls / (rate_with_controls + n_cc["ca"]))
    )
    log_total = log_ctrl + log_case

    return {
        "cn": np.exp(log_ctrl),
        "ca": np.exp(log_case),
        "total": np.exp(log_total),
    }

def evidence_alt_cc(x_cc, n_cc, gamma_cc, beta_cc, rho1, nu1, q_lower=1e-8, q_upper=0.1, debug=False):
    """Model evidence of the case-control data under H1: P(x_1, x_0 | H_1).

    Parameters
    ----------
    x_cc : mapping or pandas.Series
        The count data, with entries "ca" and "cn".
    n_cc : mapping, pandas.Series or one-row pandas.DataFrame
        The sample sizes, with entries "ca" and "cn".
    gamma_cc, beta_cc : scalar or one-element vector
        Prior on the relative risk, gamma ~ Gamma(gamma_cc * beta_cc, beta_cc).
    rho1, nu1 : scalar or one-element vector
        Prior on q | H1, Gamma(rho1, nu1).
    q_lower, q_upper : float
        Integration limits for q; the integral is evaluated over log(q).
    debug : bool
        Accepted for interface compatibility; not used.

    Returns
    -------
    dict
        "cn": evidence of the control count, "ca": evidence of the case
        count (numerical integral), "total": their product.
    """
    def _scalar(value):
        # quad needs plain numbers. np.asarray(...).item() accepts Python and
        # NumPy scalars as well as one-element Series/arrays, whereas calling
        # .item() directly fails on plain Python numbers.
        return np.asarray(value).item()

    def integrand(u, x_ca, gamma_cc, beta_cc, n_ca, x_cn, rho1, nu1, n_cn):
        # integrate over u = log(q); the trailing q is the Jacobian dq/du
        q = np.exp(u)
        return (nbinom.pmf(x_ca, gamma_cc * beta_cc, beta_cc / (beta_cc + n_ca * q)) *
                gamma.pdf(q, rho1 + x_cn, scale=1 / (nu1 + n_cn)) *
                q)

    # Computed from the inputs as given so the result keeps their array shape
    marglik1_ctrl = nbinom.pmf(x_cc["cn"], rho1, nu1 / (nu1 + n_cc["cn"]))

    quad_args = (
        _scalar(x_cc["ca"]), _scalar(gamma_cc), _scalar(beta_cc), _scalar(n_cc["ca"]),
        _scalar(x_cc["cn"]), _scalar(rho1), _scalar(nu1), _scalar(n_cc["cn"]),
    )
    marglik1_case = integrate.quad(integrand, np.log(q_lower), np.log(q_upper), args=quad_args)[0]

    marglik1 = marglik1_ctrl * marglik1_case
    return {"cn": marglik1_ctrl, "ca": marglik1_case, "total": marglik1}

def table_BF_dn_wrapper(i_x, x, n_dn, mu, gamma_dn, beta_dn):
    """Wrapper used for generating the de novo table.

    Picks entry i_x of x as the de novo count and forwards everything else to
    bayes_factor_dn; see that function for a description of the variables.
    """
    denovo_count = x[i_x]
    return bayes_factor_dn(denovo_count, n_dn=n_dn, mu=mu, gamma_dn=gamma_dn, beta_dn=beta_dn)


def table_BF_cc_wrapper(i_x, x, n_cc, gamma_cc, beta_cc, rho1, nu1, rho0, nu0):
    """Wrapper used for generating the case-control table.

    Picks entry i_x of x as the case-control counts and forwards everything
    else to bayes_factor_cc; see that function for a description of the
    variables. For inputs longer than 1000 entries a progress bar is shown
    at every 100th entry.
    """
    at_checkpoint = i_x % 100 == 0 or i_x == len(x)
    if at_checkpoint and len(x) > 1000:
        pb = tqdm(total=len(x))
        pb.update(i_x)

    return bayes_factor_cc(
        x[i_x],
        n_cc=n_cc,
        gamma_cc=gamma_cc,
        beta_cc=beta_cc,
        rho1=rho1,
        nu1=nu1,
        rho0=rho0,
        nu0=nu0,
    )


def permute_gene(i_gene, mu_rate, counts, n, n_rep, param, denovo_only, table_cc, table_dn):
    """Compute the Bayes factors of one gene for n_rep data sets simulated under the null.

    Parameters
    ----------
    i_gene : int
        Zero-based position of the gene.
    mu_rate : vector
        Mutation rates of the variant class, one per gene.
    counts : pandas.DataFrame
        dn, ca and cn counts of the variant class plus the column "Ncc" (ca + cn).
    n : pandas.DataFrame
        Sample sizes with entries for dn, ca and cn.
    n_rep : int
        Number of null replicates to generate.
    param : pandas.DataFrame
        Hyper parameters of the variant class.
    denovo_only : bool or one-element vector
        True to use only the de novo contribution.
    table_cc : pandas.DataFrame or None
        Pre-computed case-control Bayes factors, rows indexed by case count
        and columns by control count. Not read when denovo_only is True.
    table_dn : pandas.DataFrame
        Pre-computed de novo Bayes factors, one row per gene and one column
        per de novo count starting at 0.

    Returns
    -------
    numpy.ndarray
        n_rep Bayes factors generated under the null hypothesis. Simulated
        counts that fall outside a pre-computed table are evaluated directly
        with bayes_factor_dn / bayes_factor_cc.
    """
    def _first(value):
        # collapse a scalar or one-element Series/array to a Python scalar
        return np.ravel(value)[0].item()

    def _at_position(vector, position):
        # read by position: the gene tables may be indexed by gene id, where
        # integer [] would be a label lookup
        return vector.iloc[position] if hasattr(vector, "iloc") else vector[position]

    if i_gene % 100 == 0:
        print(f"Progress: {i_gene}/{counts.shape[0]}")

    # generate permutation data for denovo events
    n_dn = _first(n["dn"])
    mu_gene = _at_position(mu_rate, i_gene)
    sample_dn = np.random.poisson(2 * n_dn * mu_gene, size=n_rep)

    # look up the BF value in the table; compute it directly for counts the
    # table does not cover instead of indexing out of bounds
    dn_row = table_dn.iloc[i_gene].to_numpy(dtype=float)
    in_dn_table = sample_dn < dn_row.size
    BF_dn = np.empty(n_rep, dtype=float)
    BF_dn[in_dn_table] = dn_row[sample_dn[in_dn_table]]
    if not in_dn_table.all():
        BF_dn[~in_dn_table] = np.ravel(bayes_factor_dn(
            sample_dn[~in_dn_table], n_dn=n_dn, mu=mu_gene,
            gamma_dn=_first(param["gamma.mean.dn"]), beta_dn=_first(param["beta.dn"])))

    if _first(denovo_only):
        # denovo only: the case-control contribution is neutral
        BF_cc = np.ones(n_rep)
    else:
        # generate permutation data for case-control events: distribute the
        # gene's Ncc variants over cases and controls at random
        cc_lookup = table_cc.to_numpy(dtype=float)
        max_ca, max_cn = cc_lookup.shape
        n_ca = _first(n["ca"])
        n_cn = _first(n["cn"])
        ncc_gene = _at_position(counts["Ncc"], i_gene)
        sample_ca = np.random.hypergeometric(ncc_gene, n_ca + n_cn - ncc_gene, n_ca, n_rep)
        sample_cn = ncc_gene - sample_ca

        # valid table positions are 0..max-1 on each axis
        in_cc_table = (sample_ca < max_ca) & (sample_cn < max_cn)
        BF_cc = np.empty(n_rep, dtype=float)
        BF_cc[in_cc_table] = cc_lookup[sample_ca[in_cc_table], sample_cn[in_cc_table]]

        # calculate the BF of counts outside the table on a case by case basis
        for k in np.flatnonzero(~in_cc_table):
            BF_cc[k] = np.ravel(bayes_factor_cc(
                {"ca": sample_ca[k], "cn": sample_cn[k]}, n_cc=n[["ca", "cn"]],
                gamma_cc=param["gamma.mean.CC"], beta_cc=param["beta.CC"],
                rho1=param["rho1"], nu1=param["nu1"],
                rho0=param["rho0"], nu0=param["nu0"]))[0]

    # determine the total BF from the two components
    return BF_cc * BF_dn


def Bayesian_FDR(BF, pi0):
    """Bayesian FDR control (PMID:19822692, Section 2.3).

    Parameters
    ----------
    BF : array-like or pandas.Series
        Vector of Bayes factors.
    pi0 : float
        Prior probability that the null model is true.

    Returns
    -------
    numpy.ndarray or pandas.Series
        The q-value of each BF, in the same order as the input. A Series
        input yields a Series carrying the input's index.
    """
    bf_values = np.asarray(BF, dtype=float)

    # order the BF in decreasing order
    order = np.argsort(-bf_values)
    bf_sorted = bf_values[order]

    # convert BFs to PPA (posterior probability of alternative model)
    pi = 1 - pi0
    ppa = pi * bf_sorted / (1 - pi + pi * bf_sorted)
    null_posterior = 1 - ppa  # posterior probability of null model

    # the FDR at each PPA cutoff: running mean of the null posteriors
    fdr_sorted = np.cumsum(null_posterior) / np.arange(1, bf_sorted.size + 1)

    # Undo the sort by scattering back through `order`. Indexing with `order`
    # a second time would apply the permutation twice, not invert it.
    fdr = np.empty_like(fdr_sorted)
    fdr[order] = fdr_sorted

    if isinstance(BF, pd.Series):
        return pd.Series(fdr, index=BF.index, name=BF.name)
    return fdr

def bayesFactor_pvalue(BF, BF_null):
    """Permutation p-values for Bayes factors.

    Parameters
    ----------
    BF : array-like
        Bayes factors based on the data.
    BF_null : array-like
        Bayes factors based on data simulated under the null hypothesis.

    Returns
    -------
    numpy.ndarray
        For each BF, the fraction of null Bayes factors that are at least as
        large. A fraction of 0 is replaced by 0.5 / len(BF_null) so that no
        p-value is exactly zero.

    Raises
    ------
    ValueError
        If BF_null is empty.
    """
    null_sorted = np.sort(np.asarray(BF_null, dtype=float).ravel())
    n_null = null_sorted.size
    if n_null == 0:
        raise ValueError("BF_null must contain at least one value")

    observed = np.asarray(BF, dtype=float)
    # side="left" counts the null values strictly below each observed BF, so
    # the remainder are the null values >= BF. Ties count towards the
    # p-value; Bayes factors built from discrete counts tie frequently.
    n_at_least = n_null - np.searchsorted(null_sorted, observed, side="left")
    pval = n_at_least / n_null

    return np.where(pval == 0, 0.5 / n_null, pval)

def denovo_MOM(k, N, mu, C, beta, d=2, S=100, max_kvec=None):
    """Estimate the relative risk and the number of multiple hits from de novo data.

    Parameters
    ----------
    k : number of disease genes
    N : sample size
    mu : mutation rate for all genes
    C : observed number of de novo events
    beta : parameter of the prior distribution of gamma
    d : number of events to use (1 is 1 or more, 2 is 2 or more)
    S : number of samples to generate per gene
    max_kvec : optional total used to display progress; when given, a
        progress bar is shown for every 100th k and for k == max_kvec

    Returns
    -------
    dict
        'gamma.mean': the average relative risk,
        'M': the expected number of multi-hit genes.
    """
    if max_kvec is not None and (k % 100 == 0 or k == max_kvec):
        # the context manager closes the bar, so repeated calls do not leave
        # open progress bars behind
        with tqdm(total=max_kvec) as pb:
            pb.update(k)

    m = len(mu)  # number of genes

    # enrichment of de novo events
    nu = C / (2 * N * np.sum(mu))

    # MOM estimator of gamma_mean
    gamma_mean = (nu - 1) * m / k + 1

    # expected M: multi-hit genes among risk (M1) and non-risk (M0) genes
    rs = count_multihit(N, mu, k / m, gamma_mean, beta, d=d, S=S)
    M = np.sum(rs['M1']) + np.sum(rs['M0'])

    return {'gamma.mean': gamma_mean, 'M': M}


def count_multihit(N, mu, pi, gamma_mean, beta, d, S):
    """Estimate the number of multihit genes in a genome.

    Parameters
    ----------
    N : sample size
    mu : mutation rate for all genes
    pi : ratio of number of risk genes and total number of genes
    gamma_mean : the average relative risk
    beta : parameter of the prior distribution of gamma
    d : int or sequence of int
        Number of events to use (1 is 1 or more, 2 is 2 or more). A scalar is
        treated as a single threshold.
    S : number of samples to generate per gene

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns 'd', 'M0' (number of multiple-hit
        genes among non-risk genes) and 'M1' (among risk genes).
    """
    mu_values = np.asarray(mu, dtype=float).ravel()
    thresholds = np.atleast_1d(d)  # accept the scalar d passed by denovo_MOM
    m = mu_values.size

    # M1: the number of causal genes having d or more de novo mutations.
    # p_alt[i, j] is the probability for gene i at threshold j, so the mean
    # over axis 0 averages over genes.
    p_alt = np.array(
        [[multihit_prob(mu_i, N, gamma_mean, beta, d=d_j, S=S) for d_j in thresholds]
         for mu_i in mu_values],
        dtype=float,
    ).reshape(m, thresholds.size)
    M1 = m * pi * np.mean(p_alt, axis=0)

    # M0: the number of non-causal genes having d or more de novo mutations.
    # P(X >= d) is the survival function at d - 1; 1 - cdf(d) would be P(X > d).
    p_null = poisson.sf(thresholds[np.newaxis, :] - 1, 2 * N * mu_values[:, np.newaxis])
    M0 = m * (1 - pi) * np.mean(p_null, axis=0)

    return pd.DataFrame({'d': thresholds, 'M0': M0, 'M1': M1})

def multihit_prob(mu, N, gamma_mean, beta, d, S):
    """Probability of having d or more de novo mutations under H1.

    Estimated by simulation (an analytic form could also be used).

    Parameters
    ----------
    mu : mutation rate for a gene
    N : sample size
    gamma_mean : the average relative risk
    beta : parameter of the prior distribution of gamma
    d : number of events to use (1 is 1 or more, 2 is 2 or more)
    S : number of samples to generate per gene

    Returns
    -------
    float
        Average over the S sampled relative risks of the probability of
        having d or more de novo mutations.
    """
    # The local must not be called `gamma`: assigning to that name inside the
    # function makes it local, so `gamma.rvs` would raise UnboundLocalError
    # instead of reaching scipy.stats.gamma.
    relative_risk = gamma.rvs(gamma_mean * beta, scale=1 / beta, size=S)

    # P(X >= d) is the survival function at d - 1; 1 - cdf(d) would be P(X > d)
    p = poisson.sf(d - 1, 2 * N * mu * relative_risk)
    return np.mean(p)


In [27]:
# Load the per-gene mutation table (tab-separated)
tada_file = "TADA_smoke_test_counts.txt"
tada_data = pd.read_table(tada_file)

# Cohort sizes: number of families (trios), case samples and control samples
n_family = 4500
n_case = 1000
n_ctrl = 3000

# Families are added to both the case and the control totals; both variant
# classes share the same sample-count frame
data = {'dn': [n_family], 'ca': [n_case + n_family], 'cn': [n_ctrl + n_family]}
n = pd.DataFrame(data)
sample_counts = dict.fromkeys(['cls1', 'cls2'], n)

# Mutational data used by TADA, built the same way for each variant class:
# cases = transmitted + case counts, controls = non-transmitted + control counts
tada_counts = {}
for cls in ('cls1', 'cls2'):
    class_counts = pd.DataFrame({
        'dn': tada_data[f'dn.{cls}'],
        'ca': tada_data[f'trans.{cls}'] + tada_data[f'case.{cls}'],
        'cn': tada_data[f'ntrans.{cls}'] + tada_data[f'ctrl.{cls}'],
    })
    class_counts.index = tada_data['gene.id']
    tada_counts[cls] = class_counts
cls1_counts = tada_counts['cls1']
cls2_counts = tada_counts['cls2']

# Per-gene mutation rates, one column per variant class
mu = pd.DataFrame({name: tada_data[f'mut.{name}'] for name in ('cls1', 'cls2')})

# De novo only TRUE/FALSE per class; here the case-control counts are used as well
denovo_only = pd.DataFrame({'cls1': [False], 'cls2': [False]})

# Hyper parameters per variant class
cls1_params = pd.DataFrame({
    'gamma.mean.dn': [20.0], 'beta.dn': [1],
    'gamma.mean.CC': [2.3], 'beta.CC': [4.0],
    'rho1': [0.1], 'nu1': [100],
    'rho0': [0.1], 'nu0': [100],
})

cls2_params = pd.DataFrame({
    'gamma.mean.dn': [4.7], 'beta.dn': [1],
    'gamma.mean.CC': [1.0], 'beta.CC': [1000],
    'rho1': [0.15], 'nu1': [100],
    'rho0': [0.15], 'nu0': [100],
})

hyperpar = {'cls1': cls1_params, 'cls2': cls2_params}

# Running TADA
re_TADA = TADA(
    tada_counts=tada_counts,
    sample_counts=sample_counts,
    mu=mu,
    hyperpar=hyperpar,
    denovo_only=denovo_only,
)


CALCULATION OF TADA TEST STATISTICS
checking the input for consistent variable names
working on :: cls1



  0%|          | 0/10 [00:00<?, ?it/s]


working on :: cls2



  0%|          | 0/10 [00:00<?, ?it/s]


In [23]:
# Bayesian FDR control on the combined Bayes factors
re_TADA['qval'] = Bayesian_FDR(re_TADA['BF.total'], pi0=0.95)

# Simulate the null distribution of the Bayes factors and use it to turn the
# observed combined Bayes factors into p-values
re_TADA_null = TADAnull(
    tada_counts=tada_counts,
    sample_counts=sample_counts,
    mu=mu,
    hyperpar=hyperpar,
    denovo_only=denovo_only,
    n_rep=100,
)
re_TADA['pval'] = bayesFactor_pvalue(
    re_TADA['BF.total'],
    re_TADA_null['BF_null.total'],
)

# Keep the 10 genes with the largest BF.total (this replaces re_TADA)
re_TADA = re_TADA.sort_values(by='BF.total', ascending=False).head(10)
print(re_TADA)

CALCULATION OF TADA TEST STATISTICS UNDER THE NULL HYPOTHESIS
working on creating DN table for :: cls1
working on creating DN table for :: cls2
working on creating null data for :: cls1
Progress: 0/10
working on creating null data for :: cls2
Progress: 0/10
                  cls1           cls2      BF.total          qval    pval
gene.id                                                                  
GENE956   2.933410e+05  591202.755311  1.734240e+11  1.095581e-10  0.0005
GENE2602  5.819384e+07       2.822945  1.642780e+08  5.788358e-08  0.0005
GENE4190  1.606000e+05       3.088818  4.960642e+05  1.280526e-05  0.0005
GENE2864  3.175408e+03      20.399743  6.477751e+04  8.291037e-05  0.0005
GENE2403  2.513713e+01      20.738377  5.213032e+02  7.099416e-03  0.0005
GENE1227  9.443343e+00      22.591615  2.133404e+02  1.954561e-02  0.0005
GENE1904  9.510587e+00      18.631253  1.771942e+02  3.058807e-02  0.0005
GENE1896  1.613211e+02       0.832099  1.342352e+02  4.226361e-02  0.0005
GE