In [1]:
import time
import numpy as np
import pandas as pd
# not compatible with pandas 1.1.0
# pip3 install --user -U pandas==1.0.5
# see https://github.com/limix/pandas-plink/issues/18
from pandas_plink import read_plink

from pprint import pprint
from IPython.display import display, HTML

In [2]:
prefix = '/home/arash/temp/BitEpi/sampleData/bfile'
rndCT = False
sd=0.03
numRepeat = 10
numInteraction = 5
# Set the random seed
seedMax = 2**32 - 1
seed = int(time.time()*10000000) % seedMax
np.random.seed(seed)

In [3]:
# Return a genotype datafream generated from plink bfile (BDF: Bfile DataFrame)
def BDF(prefix):
    (bim, fam, bed) = read_plink(prefix, verbose=True)
    bdf = pd.DataFrame(bed.compute().astype('int8')).join(bim[['snp']])\
    .set_index('snp').append(fam.trait.astype('int8')).transpose().astype('category')
    bdf['cnt']=1
    return bdf

In [4]:
# Return the Contingency Table (CT) with sum (s) and row weight (w) given a number of SNPs and Bfile Data Frame
def CT(bdf, SNPs, pheno = 'trait'):
    ct = bdf.groupby([pheno]+SNPs).count()[['cnt']]
    ctrl = ct.loc[ct.index.get_level_values(pheno) == 1].droplevel(level=0).rename(columns={"cnt": "ctrl"})
    case = ct.loc[ct.index.get_level_values(pheno) == 2].droplevel(level=0).rename(columns={"cnt": "case"})
    ctx = ctrl.join(case)
    ctx['s'] = ctx.sum(axis=1)
    total = ctx['s'].sum()
    ctx['w'] = ctx['s']/total
    return ctx.fillna(0)

In [5]:
# Return Weighted Average Purity (WAP) given a contingency table
def WAP(ct):
#     ct['x']=(ct.ctrl/ct.s)**2
#     ct['y']=(ct.case/ct.s)**2
#     ct['z']=ct.x + ct.y
#     ct['zz']=ct.z * ct.w
#     print("=========================")
#     print(ct)
#     display(HTML(ct.to_html()))
#     print(ct.zz.sum())
#     print("=========================")
    # *** This does not handle division by 0
    #print(ct)
    return ((((ct.ctrl/ct.s)**2) + ((ct.case/ct.s)**2)) * ct.w).sum()

In [6]:
# Return Mximum Lower Order WAP (MLOWAP). Lower Order means to exclude 1 SNP from combination
def MLOWAP(ct, SNPs):
    lowaps = list()
    for i in range(0, len(SNPs)):
        lo = SNPs.copy()
        del lo[i]
        #print(lo)
        ctx = ct.groupby(lo).sum()
        lowaps.append(WAP(ctx))
    return max(lowaps)

In [7]:
#return Alpha and Beta (BitEpi) given contingency table
def AB(ct, SNPs):
    #display(HTML(ct.to_html()))
    wap = WAP(ct)
    mlowap = MLOWAP(ct,SNPs)
    return {'beta':wap, 'alpha':(wap - mlowap)}

In [8]:
def RP(ct):
    # pc: probability of being case
    # pc = np.random.uniform(0, 1, ct.shape[0])
    pc = np.random.normal(0.5, sd, ct.shape[0])
    pc[pc > 1] = 1
    pc[pc < 0] = 0
    ct['pc'] = pc
    ct.case = (ct.s * ct.pc).astype('int32')
    ct.ctrl = (ct.s - ct.case).astype('int32')
    ct.drop(['pc'], axis=1, inplace=True)

In [9]:
def Pvalue(bdf, SNPs, numRepeat=100):
    ct = CT(bdf, SNPs)
    ab = AB(ct, SNPs)
    # rab for AB with random phenotype
    rab = list()
    for i in range(numRepeat):

        if rndCT:
            RP(ct)
        else:
            bdf['RandomPheno'] = np.random.choice([1,2], bdf.shape[0])
            ct = CT(bdf, SNPs, pheno='RandomPheno')
        rab.append(AB(ct, SNPs))

    ac = len([r for r in rab if r['alpha'] > ab['alpha']])
    bc = len([r for r in rab if r['beta'] > ab['beta']])

    pv = {'beta':bc/numRepeat, 'alpha':ac/numRepeat}
    return (ab, pv)

In [10]:
bdf = BDF(prefix)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 140.24it/s]


In [11]:
(stat,pvalue) = Pvalue(bdf, ['M0P1','M0P2'], numRepeat)
print('Statistics:', stat)
print('Pvalues:', pvalue)

Statistics: {'beta': 0.504218620099264, 'alpha': 0.0035141920889357747}
Pvalues: {'beta': 0.0, 'alpha': 0.0}


In [12]:
(stat,pvalue) = Pvalue(bdf, ['N34','N86'], numRepeat)
print('Statistics:', stat)
print('Pvalues:', pvalue)

Statistics: {'beta': 0.501757527587287, 'alpha': 0.001407223355438414}
Pvalues: {'beta': 0.5, 'alpha': 0.4}


In [13]:
#BitEpi -i sampleData/bfile -bfile -o sampleData/out.bfile -sort -t 2 -a4 100

In [14]:
# Compute pvalue for interactions listed in a file
def EpiPvalue(bfilePrefix, epiFile, numInteraction, numRepeat):
    
    # Read Plink bfile into pandas dataframe
    bdf = BDF(bfilePrefix)
    
    # Read BitEpi output into pandas dataframe
    epiInt = pd.read_csv(epiFile)
    
    # compute pvalue for the top interactions
    result = list()
    for i in range(numInteraction):
        row = epiInt.iloc[i]
        firstCol = row[0]
        SNPs = list(row[1:].values)
        print(firstCol)
        print(SNPs)
        (stat,pvalue) = Pvalue(bdf, SNPs, numRepeat)
        print('Statistics:', stat)
        print('Pvalues:', pvalue)
        print("===================================")
        result.append({'firstCol':firstCol, 'SNPs':SNPs, 'stat':stat, 'pvalue':pvalue})
        
    return result

In [15]:
# Compute pvalue of random combination of SNPs
def RndPvalue(bfilePrefix, numSNPs, numInteraction, numRepeat):
    
    # Read Plink bfile into pandas dataframe
    bdf = BDF(bfilePrefix)
    varId = bdf.columns.values[0:-2]

    # compute pvalue for the top interactions
    result = list()
    for i in range(numInteraction):
        firstCol = 0
        np.random.shuffle(varId)
        SNPs = list(varId[0:numSNPs])
        print(firstCol)
        print(SNPs)
        (stat,pvalue) = Pvalue(bdf, SNPs, numRepeat)
        print('Statistics:', stat)
        print('Pvalues:', pvalue)
        print("===================================")
        result.append({'firstCol':firstCol, 'SNPs':SNPs, 'stat':stat, 'pvalue':pvalue})
        
    return result

In [16]:
pvals = RndPvalue(prefix, 4, numInteraction, numRepeat=100)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 141.78it/s]

0
['N94', 'N92', 'N33', 'N85']





Statistics: {'beta': 0.5134426827849664, 'alpha': 0.007415187996812067}
Pvalues: {'beta': 0.58, 'alpha': 0.34}
0
['N71', 'N58', 'N33', 'N50']
Statistics: {'beta': 0.5178785369847997, 'alpha': 0.010902237668964476}
Pvalues: {'beta': 0.25, 'alpha': 0.06}
0
['N22', 'N86', 'N67', 'N51']
Statistics: {'beta': 0.5103643739208267, 'alpha': 0.005821644880766197}
Pvalues: {'beta': 0.97, 'alpha': 0.74}
0
['N9', 'N5', 'N37', 'N24']
Statistics: {'beta': 0.5146649767527692, 'alpha': 0.007881717332883342}
Pvalues: {'beta': 0.44, 'alpha': 0.27}
0
['N18', 'N14', 'N23', 'N58']
Statistics: {'beta': 0.5154065656764837, 'alpha': 0.008204245935263699}
Pvalues: {'beta': 0.72, 'alpha': 0.75}


In [17]:
pvals = EpiPvalue(prefix, prefix+'.Alpha.4.csv', numInteraction, numRepeat=1000)

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 78.63it/s]

0.023553
['N10', 'N43', 'N44', 'N85']





Statistics: {'beta': 0.5339051524084679, 'alpha': 0.02355252997683932}
Pvalues: {'beta': 0.0, 'alpha': 0.0}
0.023203
['N17', 'N71', 'N77', 'N96']
Statistics: {'beta': 0.5358881655191845, 'alpha': 0.02320301247856571}
Pvalues: {'beta': 0.0, 'alpha': 0.0}
0.023068
['N10', 'N21', 'N36', 'N41']
Statistics: {'beta': 0.533675989229375, 'alpha': 0.023067800362634783}
Pvalues: {'beta': 0.0, 'alpha': 0.0}
0.023005
['N19', 'N28', 'N44', 'N72']
Statistics: {'beta': 0.5307312444641648, 'alpha': 0.023004973025889774}
Pvalues: {'beta': 0.001, 'alpha': 0.0}
0.02248
['N11', 'N43', 'N56', 'N65']
Statistics: {'beta': 0.5336167580368831, 'alpha': 0.022479976584007466}
Pvalues: {'beta': 0.0, 'alpha': 0.0}


In [20]:
pd.DataFrame(pvals)

Unnamed: 0,firstCol,SNPs,stat,pvalue
0,0.023553,"[N10, N43, N44, N85]","{'beta': 0.5339051524084679, 'alpha': 0.023552...","{'beta': 0.0, 'alpha': 0.0}"
1,0.023203,"[N17, N71, N77, N96]","{'beta': 0.5358881655191845, 'alpha': 0.023203...","{'beta': 0.0, 'alpha': 0.0}"
2,0.023068,"[N10, N21, N36, N41]","{'beta': 0.533675989229375, 'alpha': 0.0230678...","{'beta': 0.0, 'alpha': 0.0}"
3,0.023005,"[N19, N28, N44, N72]","{'beta': 0.5307312444641648, 'alpha': 0.023004...","{'beta': 0.001, 'alpha': 0.0}"
4,0.02248,"[N11, N43, N56, N65]","{'beta': 0.5336167580368831, 'alpha': 0.022479...","{'beta': 0.0, 'alpha': 0.0}"


In [33]:
pd.DataFrame(list(map(lambda x: {'firstCol':x['firstCol'], 'SNPs':x['SNPs'], \
                                 'beta':x['stat']['beta'], 'alpha':x['stat']['alpha'], \
                                 'pval-beta':x['pvalue']['beta'], 'pval-alpha':x['pvalue']['alpha'],},\
                      pvals))).to_csv('xxx.tsv', sep='\t', index=None)

In [34]:
%%sh
column -t xxx.tsv

firstCol  SNPs     beta    alpha   pval-beta  pval-alpha
0.023553  ['N10',  'N43',  'N44',  'N85']     0.5339051524084679  0.02355252997683932   0.0    0.0
0.023203  ['N17',  'N71',  'N77',  'N96']     0.5358881655191845  0.02320301247856571   0.0    0.0
0.023068  ['N10',  'N21',  'N36',  'N41']     0.533675989229375   0.023067800362634783  0.0    0.0
0.023005  ['N19',  'N28',  'N44',  'N72']     0.5307312444641648  0.023004973025889774  0.001  0.0
0.02248   ['N11',  'N43',  'N56',  'N65']     0.5336167580368831  0.022479976584007466  0.0    0.0
