In [1]:
import pandas as pd
import numpy as np
import numpy.random as rand

#Simulates univariate data based on demo mutation rates from code in original TADA paper (https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003671)

PLOIDY = 2

def gen_univ_data(gamma: float, q: float, N_family: int, N_case: int, N_control: int, mut_1: 'array_like', mut_2: 'array_like', seed: int = 0):
    '''
    Function for generating synthetic data according to TADA probabilistic model.

    Every count is an independent Poisson draw. For each gene the rates are

        de novo          PLOIDY * mut * gamma * N_family   (mut = mut_1 or mut_2)
        transmitted      q * gamma * N_family
        nontransmitted   q * N_family
        case             q * gamma * (N_case + N_family)
        control          q * (N_control + N_family)

    Only the de novo rate differs between the two mutation classes. The number
    of simulated genes is taken from the length of the mutation-rate arrays, so
    the function does not depend on any module-level sample-size constant.

    Parameters
    ----------
    gamma : float
        Genotype relative risk for genotype Aa where a is an allele with
        a deleterious mutation, and A without

    q : float
        Probability of mutation

    N_family : int
        Number of families in sample

    N_case : int
        Number of cases in case/control data

    N_control : int
        Number of controls in case/control data

    mut_1 : array_like
        1D array_like of mutation rates for first mutation class with type 'float'

    mut_2 : array_like
        1D array_like of mutation rates for second mutation class with type 'float'
        (must have as many entries as mut_1)

    seed : int
        Random seed, defaults to 0. Note that this reseeds NumPy's global
        legacy generator (numpy.random.seed).

    Returns
    -------
    df : DataFrame
        Generated data, one row per gene, with columns
        mut.cls1, dn.cls1, trans.cls1, ntrans.cls1, case.cls1, ctrl.cls1,
        mut.cls2, dn.cls2, trans.cls2, ntrans.cls2, case.cls2, ctrl.cls2

    Raises
    ------
    ValueError
        If mut_1 and mut_2 do not contain the same number of entries.
    '''
    #Sets random seed
    rand.seed(int(seed))

    #Ensures mutation rates for the different classes are flat numpy arrays
    mut_1 = np.asarray(mut_1).reshape(-1)
    mut_2 = np.asarray(mut_2).reshape(-1)
    if mut_1.shape != mut_2.shape:
        raise ValueError(
            "mut_1 and mut_2 must have the same number of entries, got "
            f"{mut_1.shape[0]} and {mut_2.shape[0]}"
        )
    #One row per gene; the per-gene constant rates below are broadcast to this length
    n_genes = mut_1.shape[0]

    #Rates for poisson distributions followed by the de novo mutations for the two categories
    X_d1_rates = PLOIDY * mut_1 * gamma * N_family
    X_d2_rates = PLOIDY * mut_2 * gamma * N_family

    #Rates shared by both classes, in output column order:
    # transmitted, nontransmitted, case, control
    shared_rates = [
        np.full(n_genes, q * gamma * N_family),
        np.full(n_genes, q * N_family),
        np.full(n_genes, q * gamma * (N_case+N_family)),
        np.full(n_genes, q * (N_control+N_family)),
    ]

    #Stacks poisson rates and samples (cls indicates mutation class).
    # Class 1 is drawn before class 2 so results for a given seed stay stable.
    cls1 = np.column_stack([X_d1_rates] + shared_rates)
    cls2 = np.column_stack([X_d2_rates] + shared_rates)
    sample_1 = rand.poisson(cls1).astype(int)
    sample_2 = rand.poisson(cls2).astype(int)

    #Combines simulated data with mutation rates into table
    df = pd.DataFrame(
        np.column_stack((sample_1, sample_2)),
        columns=['dn.cls1', 'trans.cls1', 'ntrans.cls1', 'case.cls1', 'ctrl.cls1',
                 'dn.cls2', 'trans.cls2', 'ntrans.cls2', 'case.cls2', 'ctrl.cls2'],
    )

    #Each class's mutation rate goes immediately before that class's counts
    df.insert(0, 'mut.cls1', mut_1)
    df.insert(6, 'mut.cls2', mut_2)
    return df



In [3]:
#Tests De novo + case/control
#Read mutation data
tada_file = "TADA_demo_counts_de-novo_and_inherited.txt"
tada_data = pd.read_table(tada_file)

#Specify the number of families and the number of cases and control samples included in the simulation
N_family = 4500
N_case = 1000
N_ctrl = 3000
#Number of genes to simulate. Taken from the loaded table (one row per gene) instead of a
# hard-coded 5000, so per-gene vectors sized with it always match the mutation-rate columns
SAMPLE_SIZE = len(tada_data)

#Extract mutation rates
mut_cls1 = tada_data["mut.cls1"]
mut_cls2 = tada_data["mut.cls2"]


#Varying gamma and q parameters (where q/2 is the allele frequency of a)
gamma_1, gamma_2, gamma_3 = 15, 20, 25
q_1, q_2, q_3 = 5*10**-5, 10**-4, 2*10**-4

#Generates data with gamma_3 and q_3
df_1 = gen_univ_data(gamma_3, q_3, N_family, N_case, N_ctrl, mut_cls1, mut_cls2)

#Labels every simulated row with its gene id (first column)
gene_names = tada_data["gene.id"]
df_1.insert(0,"gene.id",gene_names)

#Exports output to text file
df_1.to_csv('testUnivData3.txt', index=False, sep='\t')

In [4]:
import pandas as pd
import numpy as np
import numpy.random as rand

#Simulates de novo univariate data based on demo mutation rates from code in original TADA paper (https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003671)

PLOIDY = 2
#Kept so notebook state is unchanged for any cell that still reads it;
# gen_den_univ_data below sizes its output from its inputs instead
SAMPLE_SIZE = 5000

def gen_den_univ_data(gamma: float, q: float, N_family: int, mut_1: 'array_like', mut_2: 'array_like', seed: int = 0):
    '''
    Function for generating synthetic de novo / transmission data according to
    TADA probabilistic model.

    Every count is an independent Poisson draw. For each gene the rates are

        de novo          PLOIDY * mut * gamma * N_family   (mut = mut_1 or mut_2)
        transmitted      q * gamma * N_family
        nontransmitted   q * N_family

    Only the de novo rate differs between the two mutation classes. The number
    of simulated genes is taken from the length of the mutation-rate arrays
    rather than from the module-level SAMPLE_SIZE.

    Parameters
    ----------
    gamma : float
        Genotype relative risk for genotype Aa where a is an allele with
        a deleterious mutation, and A without

    q : float
        Probability of mutation

    N_family : int
        Number of families in sample

    mut_1 : array_like
        1D array_like of mutation rates for first mutation class with type 'float'

    mut_2 : array_like
        1D array_like of mutation rates for second mutation class with type 'float'
        (must have as many entries as mut_1)

    seed : int
        Random seed, defaults to 0. Note that this reseeds NumPy's global
        legacy generator (numpy.random.seed).

    Returns
    -------
    df : DataFrame
        Generated data, one row per gene, with columns
        mut.cls1, dn.cls1, trans.cls1, ntrans.cls1,
        mut.cls2, dn.cls2, trans.cls2, ntrans.cls2

    Raises
    ------
    ValueError
        If mut_1 and mut_2 do not contain the same number of entries.
    '''
    #Sets random seed
    rand.seed(int(seed))

    #Ensures mutation rates for the different classes are flat numpy arrays
    mut_1 = np.asarray(mut_1).reshape(-1)
    mut_2 = np.asarray(mut_2).reshape(-1)
    if mut_1.shape != mut_2.shape:
        raise ValueError(
            "mut_1 and mut_2 must have the same number of entries, got "
            f"{mut_1.shape[0]} and {mut_2.shape[0]}"
        )
    #One row per gene; the per-gene constant rates below are broadcast to this length
    n_genes = mut_1.shape[0]

    #Rates for poisson distributions followed by the de novo mutations for the two categories
    X_d1_rates = PLOIDY * mut_1 * gamma * N_family
    X_d2_rates = PLOIDY * mut_2 * gamma * N_family

    #Transmitted / nontransmitted rates are identical for every gene and both classes
    X_t_rates = np.full(n_genes, q * gamma * N_family)
    X_nt_rates = np.full(n_genes, q * N_family)

    #Stacks poisson rates and samples (cls indicates mutation class).
    # Class 1 is drawn before class 2 so results for a given seed stay stable.
    cls1 = np.column_stack([X_d1_rates, X_t_rates, X_nt_rates])
    cls2 = np.column_stack([X_d2_rates, X_t_rates, X_nt_rates])
    sample_1 = rand.poisson(cls1).astype(int)
    sample_2 = rand.poisson(cls2).astype(int)

    #Combines simulated data with mutation rates into table
    df = pd.DataFrame(
        np.column_stack((sample_1, sample_2)),
        columns=['dn.cls1', 'trans.cls1', 'ntrans.cls1', 'dn.cls2', 'trans.cls2', 'ntrans.cls2'],
    )

    #Each class's mutation rate goes immediately before that class's counts
    df.insert(0, 'mut.cls1', mut_1)
    df.insert(4, 'mut.cls2', mut_2)
    return df



In [5]:
#De novo only testing
#Load the demo table that supplies the gene ids and the per-gene mutation rates
tada_file = "TADA_demo_counts_de-novo_and_inherited.txt"
tada_data = pd.read_table(tada_file)

#Sample sizes for the simulation. The case/control sizes are defined here but are
# not passed to the de novo generator called below
N_family = 4500
N_case = 1000
N_ctrl = 3000

#Per-gene mutation rates for the two mutation classes
mut_cls1, mut_cls2 = (tada_data[col] for col in ("mut.cls1", "mut.cls2"))


#Candidate gamma and q settings (where q/2 is the allele frequency of a)
gamma_1 = 15
gamma_2 = 20
gamma_3 = 25
q_1 = 5*10**-5
q_2 = 10**-4
q_3 = 2*10**-4

#Simulates with the first (gamma, q) pair
df_1 = gen_den_univ_data(gamma_1, q_1, N_family, mut_cls1, mut_cls2)

#Puts the gene id in front of the simulated columns
gene_names = tada_data["gene.id"]
df_1.insert(0, "gene.id", gene_names)

#Writes the table as tab-separated text without the row index
df_1.to_csv('testDenUnivData.txt', sep='\t', index=False)

In [6]:
#WIP Version
import pandas as pd
import numpy as np
import numpy.random as rand

#Simulates univariate data based on demo mutation rates from code in original TADA paper (https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003671)

PLOIDY = 2

def gen_univ_data(gamma: float, q: float, N_family: int, N_case: int, N_control: int, mut_1: 'array_like', mut_2: 'array_like', seed: int = 0):
    '''
    Function for generating synthetic data according to TADA probabilistic model
    (work-in-progress variant with separate, partly fixed, parameters per class).

    Every count is an independent Poisson draw. For each gene the rates are

                         class 1                       class 2
        de novo          PLOIDY * mut_1 * 20 * N_family  PLOIDY * mut_2 * 4.7 * N_family
        transmitted      q * 2.3 * N_family              q2 * N_family
        nontransmitted   q * N_family                    q2 * N_family
        case             q * gamma * N_case              q2 * 4.7 * N_case
        control          q * N_control                   q2 * N_control

    with q2 = 6*10**-4. The number of simulated genes is taken from the length
    of the mutation-rate arrays, so the function does not depend on any
    module-level sample-size constant.

    Parameters
    ----------
    gamma : float
        Genotype relative risk for genotype Aa where a is an allele with
        a deleterious mutation, and A without. In this variant it only
        enters the class-1 case rate.

    q : float
        Probability of mutation (class 1; class 2 uses the fixed q2)

    N_family : int
        Number of families in sample

    N_case : int
        Number of cases in case/control data

    N_control : int
        Number of controls in case/control data

    mut_1 : array_like
        1D array_like of mutation rates for first mutation class with type 'float'

    mut_2 : array_like
        1D array_like of mutation rates for second mutation class with type 'float'
        (must have as many entries as mut_1)

    seed : int
        Random seed, defaults to 0. Note that this reseeds NumPy's global
        legacy generator (numpy.random.seed).

    Returns
    -------
    df : DataFrame
        Generated data, one row per gene, with columns
        mut.cls1, dn.cls1, trans.cls1, ntrans.cls1, case.cls1, ctrl.cls1,
        mut.cls2, dn.cls2, trans.cls2, ntrans.cls2, case.cls2, ctrl.cls2

    Raises
    ------
    ValueError
        If mut_1 and mut_2 do not contain the same number of entries.
    '''
    #Sets random seed
    rand.seed(int(seed))

    #Ensures mutation rates for the different classes are flat numpy arrays
    mut_1 = np.asarray(mut_1).reshape(-1)
    mut_2 = np.asarray(mut_2).reshape(-1)
    if mut_1.shape != mut_2.shape:
        raise ValueError(
            "mut_1 and mut_2 must have the same number of entries, got "
            f"{mut_1.shape[0]} and {mut_2.shape[0]}"
        )
    #One row per gene; the per-gene constant rates below are broadcast to this length
    n_genes = mut_1.shape[0]

    #Fixed WIP parameters
    # NOTE(review): the class-1 de novo rate uses a fixed 20 rather than the `gamma`
    # argument, and the class-2 transmitted rate carries no relative-risk factor
    # (it equals the nontransmitted rate) -- confirm both are intended
    dn_gamma_1 = 20
    trans_gamma_1 = 2.3
    gamma2 = 4.7
    q2 = 6*10**-4

    #Rates for poisson distributions followed by the de novo mutations for the two categories
    X_d1_rates = PLOIDY * mut_1 * dn_gamma_1 * N_family
    X_d2_rates = PLOIDY * mut_2 * gamma2 * N_family

    #Transmitted/nontransmitted and case/control rates for class 1
    X_t_rates = np.full(n_genes, q * trans_gamma_1 * N_family)
    X_nt_rates = np.full(n_genes, q * N_family)
    X_case_rates = np.full(n_genes, q * gamma * (N_case))
    X_ctrl_rates = np.full(n_genes, q * (N_control))

    #Transmitted/nontransmitted and case/control rates for class 2
    X_t_rates2 = np.full(n_genes, q2 * N_family)
    X_nt_rates2 = np.full(n_genes, q2 * N_family)
    X_case_rates2 = np.full(n_genes, q2 * gamma2 * (N_case))
    X_ctrl_rates2 = np.full(n_genes, q2 * (N_control))

    #Stacks poisson rates and samples (cls indicates mutation class).
    # The stacking order must match the column labels assigned below
    # (dn, trans, ntrans, case, ctrl); case has to come before ctrl.
    cls1 = np.column_stack([X_d1_rates, X_t_rates, X_nt_rates, X_case_rates, X_ctrl_rates])
    cls2 = np.column_stack([X_d2_rates, X_t_rates2, X_nt_rates2, X_case_rates2, X_ctrl_rates2])
    sample_1 = rand.poisson(cls1).astype(int)
    sample_2 = rand.poisson(cls2).astype(int)

    #Combines simulated data with mutation rates into table
    df = pd.DataFrame(
        np.column_stack((sample_1, sample_2)),
        columns=['dn.cls1', 'trans.cls1', 'ntrans.cls1', 'case.cls1', 'ctrl.cls1',
                 'dn.cls2', 'trans.cls2', 'ntrans.cls2', 'case.cls2', 'ctrl.cls2'],
    )

    #Each class's mutation rate goes immediately before that class's counts
    df.insert(0, 'mut.cls1', mut_1)
    df.insert(6, 'mut.cls2', mut_2)
    return df

#Read mutation data
tada_file = "TADA_demo_counts_de-novo_and_inherited.txt"
tada_data = pd.read_table(tada_file)

#Specify the number of families and the number of cases and control samples included in the simulation
N_family = 4500
N_case = 1000
N_ctrl = 3000
#Number of genes to simulate. Taken from the loaded table (one row per gene) instead of a
# hard-coded 5000, so per-gene vectors sized with it always match the mutation-rate columns
SAMPLE_SIZE = len(tada_data)

#Extract mutation rates
mut_cls1 = tada_data["mut.cls1"]
mut_cls2 = tada_data["mut.cls2"]


#Varying gamma and q parameters (where q/2 is the allele frequency of a)
gamma_1, gamma_2, gamma_3 = 15, 20, 25
q_1, q_2, q_3 = 5*10**-5, 10**-4, 2*10**-4

#Generates data with gamma_2 and q_2
df_1 = gen_univ_data(gamma_2, q_2, N_family, N_case, N_ctrl, mut_cls1, mut_cls2)

#Labels every simulated row with its gene id (first column)
gene_names = tada_data["gene.id"]
df_1.insert(0,"gene.id",gene_names)

#Exports output to text file
df_1.to_csv('testUnivData1.txt', index=False, sep='\t')
