Creates an Excel and CSV file of the BioVU summary statistics with the following information:
    * P value
    * OR
    * A1, A2
    * HWE in cases and controls
    * MAF in cases and controls

In [1]:
import os, sys
import numpy as np
import pandas as pd 
import seaborn as sns
from datetime import datetime

# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (this is why cells that end with two expressions, e.g. .head() then .shape, show both).
InteractiveShell.ast_node_interactivity = "all" 
from IPython.core.display import display, HTML    
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container {width:100% !important; }</style>"))
%matplotlib inline 
# Print NumPy arrays with 5 digits of precision and without scientific notation.
np.set_printoptions(precision=5, suppress=True) 

# NOTE(review): Document is not referenced in the cells visible here -- confirm it is still needed.
from docx import Document

# Run date as YYYY-MM-DD; it is embedded in the name of the exported CSV.
DATE = datetime.now().strftime('%Y-%m-%d')

In [2]:
### PATHS
# Input: association results for the 2019_07_21 logistic run.
BV_ASSOC_FILE = "/dors/capra_lab/users/abraha1/prelim_studies/katja_biobank/results/2019_07_21_logistic/2019_07_21_logistic.assoc.logistic"

# Input: directory holding the per-group frequency (.frq) and HWE (.hwe) tables
# (presumably PLINK outputs, judging by the extensions -- confirm).
BV_FREQ_DIR = "/dors/capra_lab/users/abraha1/prelim_studies/katja_biobank/results/2019_07_21_logistic/freq_hwe"

# Output: where the final summary table is written.
OUTPUT_DIR = "/dors/capra_lab/users/abraha1/prelim_studies/katja_biobank/results/manuscript/tables"

# Allele-frequency tables, cases first and controls second.
fq_cases, fq_con = (
    os.path.join(BV_FREQ_DIR, file_name)
    for file_name in (
        '2019_07_21_logistic_cases_frq_hwe.out.frq',
        '2019_07_21_logistic_controls_frq_hwe.out.frq',
    )
)

# Hardy-Weinberg tables, cases first and controls second.
hwe_cases, hwe_con = (
    os.path.join(BV_FREQ_DIR, file_name)
    for file_name in (
        '2019_07_21_logistic_cases_frq_hwe.out.hwe',
        '2019_07_21_logistic_controls_frq_hwe.out.hwe',
    )
)

# main

In [3]:
def load(x):
    """Read a whitespace-delimited text table into a DataFrame.

    Parameters
    ----------
    x : str or file-like
        Path (or open handle) handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column per whitespace-separated field, named from the header row.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence,
    # which Python reports with a warning.
    return pd.read_csv(x, sep=r"\s+")

In [4]:
# load associations
# Use the shared whitespace-delimited reader defined above instead of repeating
# the read_csv call (whose non-raw "\s+" literal is an invalid escape sequence).
bv_df = load(BV_ASSOC_FILE)

In [5]:
# load cases
# Allele frequencies for the case group.
case_frq = load(fq_cases)

# HWE results for the case group, keeping only rows whose TEST column is 'ALL'.
case_hwe = load(hwe_cases)
case_hwe = case_hwe[case_hwe['TEST'].eq('ALL')].copy()

In [6]:
# load controls
# Allele frequencies for the control group.
control_frq = load(fq_con)

# HWE results for the control group, keeping only rows whose TEST column is 'ALL'.
control_hwe = load(hwe_con)
control_hwe = control_hwe[control_hwe['TEST'].eq('ALL')].copy()

In [7]:
# keep only autosomes
def autosomes(x, max_autosome=22):
    """Return the rows of ``x`` that lie on an autosome.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with a numeric ``CHR`` column.
    max_autosome : int, optional
        Highest chromosome code treated as autosomal. The default of 22 keeps
        the rows the original ``CHR < 23`` filter kept.

    Returns
    -------
    pandas.DataFrame
        The ``.loc`` selection of matching rows, all columns. Callers that
        intend to modify the result should take a ``.copy()``.
    """
    return x.loc[x['CHR'] <= max_autosome, :]

In [8]:
# Restrict all four per-group tables to autosomal variants; .copy() detaches each
# result from the frame it was sliced from.
case_frq = autosomes(case_frq).copy()
case_hwe = autosomes(case_hwe).copy()
control_frq = autosomes(control_frq).copy()
# Fix: this line previously re-filtered case_hwe, so control_hwe was never restricted.
control_hwe = autosomes(control_hwe).copy()

In [9]:
# rename columns
# Label the frequency and HWE p-value columns with their group so cases and
# controls stay distinguishable once the tables are merged.
case_frq = case_frq.rename(columns={'MAF': 'MAF (cases)'})
case_hwe = case_hwe.rename(columns={'P': 'HWE P-value (cases)'})

control_frq = control_frq.rename(columns={'MAF': 'MAF (controls)'})
control_hwe = control_hwe.rename(columns={'P': 'HWE P-value (controls)'})

In [10]:
# Per group: attach the HWE p-value to the frequency table, matching on SNP id.
# Left join, so every row of the frequency table is kept.
merged_cases = case_frq[['SNP', 'A1', 'A2', 'MAF (cases)']].merge(
    case_hwe[['SNP', 'HWE P-value (cases)']],
    how='left',
    on='SNP',
)

merged_controls = control_frq[['SNP', 'A1', 'A2', 'MAF (controls)']].merge(
    control_hwe[['SNP', 'HWE P-value (controls)']],
    how='left',
    on='SNP',
)

In [11]:
# Inspect the last rows of the association table. The rows shown have CHR 26 and
# covariate tests (PC1-PC5); later cells drop both via the autosome filter and TEST == 'ADD'.
bv_df.tail()

Unnamed: 0,CHR,SNP,BP,A1,TEST,NMISS,OR,STAT,P
5677633,26,rs373855397,16261,T,PC1,61622,1.733,0.2432,0.8078
5677634,26,rs373855397,16261,T,PC2,61622,17.49,1.195,0.2321
5677635,26,rs373855397,16261,T,PC3,61622,48.08,1.82,0.06876
5677636,26,rs373855397,16261,T,PC4,61622,18.32,1.363,0.1729
5677637,26,rs373855397,16261,T,PC5,61622,0.08672,-1.148,0.2508


In [12]:
# format association
# Keep autosomal rows only, then add a "chromosome:position" label and a constant cohort tag.
bv_df = autosomes(bv_df).copy()
bv_df = bv_df.assign(**{
    'chr:pos (hg37)': bv_df['CHR'].map(str) + ":" + bv_df['BP'].map(str),
    'cohort': 'BioVU',
})

In [13]:
# Keep only the rows whose TEST is 'ADD' (the covariate rows such as PC1-PC5 are dropped)
# together with the columns needed for the summary table.
summary_df = bv_df.loc[
    bv_df['TEST'].eq('ADD'),
    ['SNP', 'chr:pos (hg37)', 'NMISS', 'OR', 'P', 'cohort'],
].copy()

In [14]:
# Annotate each association row with the case-group alleles, MAF and HWE p-value...
case_merge = summary_df.merge(merged_cases, how='left', on='SNP')

# ...then with the control-group MAF and HWE p-value. A1/A2 are taken from the
# case table only, so they are left out of the control columns here.
merged_df = case_merge.merge(
    merged_controls[['SNP', 'MAF (controls)', 'HWE P-value (controls)']],
    how='left',
    on='SNP',
)

In [15]:
# Column layout of the exported table.
col_order = [
    'SNP', 'chr:pos (hg37)', 'A1', 'A2',
    'OR', 'P',
    'MAF (cases)', 'MAF (controls)',
    'HWE P-value (cases)', 'HWE P-value (controls)',
    'NMISS', 'cohort',
]
# Reorder the columns, then expose NMISS under the shorter header 'N'.
final_merged_df = merged_df.loc[:, col_order].rename(columns={'NMISS': 'N'})

In [16]:
# Sanity check of the assembled table: preview the first rows and report its dimensions.
# Both values are displayed because ast_node_interactivity is set to "all" in the setup cell.
final_merged_df.head()
final_merged_df.shape

Unnamed: 0,SNP,chr:pos (hg37),A1,A2,OR,P,MAF (cases),MAF (controls),HWE P-value (cases),HWE P-value (controls),N,cohort
0,1:49554-G-A,1:49554,G,A,0.996,0.8906,0.04189,0.04202,1.227e-27,2.697e-19,61373,BioVU
1,JHU_1.565975,1:565976,C,T,0.9803,0.5325,0.01633,0.01699,0.0,0.0,61156,BioVU
2,JHU_1.731717,1:731718,C,T,1.014,0.4322,0.1265,0.1248,0.2732,0.9667,60825,BioVU
3,rs3131972,1:752721,T,C,1.003,0.8433,0.1761,0.1752,2.0079999999999997e-38,3.579e-26,61694,BioVU
4,JHU_1.761957,1:761958,T,C,0.9944,0.8486,0.0412,0.04146,6.0710000000000005e-27,6.26e-22,61727,BioVU


(798051, 12)

In [17]:
# Order the table by association p-value, smallest first.
final_merged_df = final_merged_df.sort_values('P')

In [19]:
# Record the pandas version used for this run.
pd.__version__

'0.24.0'

In [20]:
# Write the sorted summary table as a comma-separated file, without the index column.
# The run date is part of the file name.
summary_csv_path = os.path.join(OUTPUT_DIR, 'biovu_summary_stats_{}.csv'.format(DATE))
final_merged_df.to_csv(summary_csv_path, sep=",", index=False)