# Toy example for multi-genes and multi-CNVs in a region
## One or more causal genes

In [2]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [44]:
def toy_multi_gene_cnv(p, n_cnv, n_max=10, causal=5, const1=1, const2=0.1, seed=1):
    '''A region with multiple genes and CNVs. CNVs may overlap or not overlap. Only one causal gene.'''
    np.random.seed(seed)
    cnv_len = np.random.choice((np.random.geometric(p, size=n_cnv) - 1), n_cnv, replace=False)
    cnv_len = cnv_len[cnv_len <= 10].tolist()
    start_pos = [np.random.choice(range(n_max+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(start_pos):
        if i == -1:
            ptn = [0]*n_max
        elif i == 0:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-i)
        else:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-(i-1))
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
##         config.append(line[0]+const1) if line[causal-1] == 1 else config.append(line[0])
        if line[causal] == 1:
            line[0] = line[0] + const1
        config.append(line[0])
        
#       the longer the CNV is, the less common it is, and larger OR
        line[0] = line[0] + line[1:].count(1)*const2
        
##         line[0] = 1 if line[0] > np.median(config) else 0
        if line[0] >= np.median(config):
            line[0] = 1
        else:
            line[0] = 0
        
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [50]:
df, counts = toy_multi_gene_cnv(p=0.6, n_cnv=200, const1=1.0, const2=0.05)
# print (df["phenotype"].tolist())

In [51]:
df

Unnamed: 0,phenotype,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,gene10
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,0,0
4,1,0,0,0,0,0,1,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0


In [43]:
np.random.choice(range(8),100)

array([7, 1, 3, 6, 6, 5, 3, 7, 1, 2, 7, 6, 1, 7, 6, 5, 2, 0, 7, 1, 5, 2, 1,
       7, 0, 4, 4, 1, 1, 7, 0, 7, 4, 5, 3, 6, 5, 6, 3, 1, 4, 0, 1, 3, 0, 1,
       3, 2, 6, 2, 2, 1, 7, 1, 2, 3, 4, 1, 7, 3, 7, 2, 2, 2, 4, 6, 2, 6, 2,
       3, 1, 1, 5, 5, 5, 2, 6, 5, 1, 4, 7, 2, 5, 4, 0, 7, 6, 0, 1, 0, 1, 3,
       7, 5, 3, 5, 1, 1, 4, 4])

In [30]:
def toy_multi_cnv_multi_causal_gene(p, n_cnv, causal, n_geom=5000, n_gene=10, const1=0.5, const2=0.1, seed=1):
    '''A region with multiple genes and CNVs. CNVs probably overlap. At least two causal genes.
    "causal" is a list of positions of causal genes.
    "n_gene" is the number of genes this region harbors.
    "n_cnv" is the number of CNV in this region. "n_geom" is the number of geometric variables generated.
    "p" is the probability of geometric dist, then randomly generate a certain number of geometric variables.
    I use these variables minus 1 as the length of CNVs, which means the number of genes that a CNV overlap with.
    The maximum length for CNV in this region is overlapping with 10 genes'''
    np.random.seed(seed)
    geom_minus_1 = np.random.geometric(p, size=n_geom) - 1
    cnv_len = np.random.choice(geom_minus_1[geom_minus_1 <= 10], n_cnv, replace=False)
    cnv_start_pos = [np.random.choice(range(n_gene+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(cnv_start_pos):
    ## j is the index of i in cnv start position list, so cnv_len[j] is the length of the corresponding cnv;
    ## i is the corresponding cnv start position
        if i == -1:
            ptn = [0]*n_gene
        elif i == 0:
            ptn = [0]*(i - 1) + [1]*cnv_len[j] + [0]*(n_gene - cnv_len[j] - i)
        else:
            ptn = [0]*(i - 1) + [1]*cnv_len[j] + [0]*(n_gene - cnv_len[j] - (i-1))
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    
    return None

In [5]:
df.to_csv("data/df.csv")

In [6]:
counts.to_csv("data/counts.csv")