In [None]:
# Execute the shared setup notebook in this kernel's namespace. The cells below
# use names that are not imported in this notebook (pd, np, pl, glob, join,
# basename, ProgressBar, multiproc_pbar, parallel_p, n_jobs), so they are
# presumably defined by IMPORT.ipynb -- confirm there.
%run IMPORT.ipynb

In [None]:
# Render inline figures at 'retina' resolution.
%config InlineBackend.figure_format = 'retina'
# Use the interactive 'notebook' matplotlib backend for figures in this notebook.
%matplotlib notebook

In [None]:
def load_calls_results(prefix):
    """Load the per-chromosome association results for one run and add summary columns.

    Reads ``/data/cox/<prefix>/ukb_cal_chr<N>_v2.pkl`` for N = 1..22 and
    concatenates the frames.

    Parameters
    ----------
    prefix : str
        Name of the sub-directory of ``/data/cox`` holding the pickled results.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with these columns added:

        * ``Chromosome``, ``ID``, ``Position`` -- fields 0, 1 and 3 of the
          ``'_'``-separated index label.
        * ``2.1``, ``0.1`` -- twice the ``2.0`` / ``0.0`` columns.
        * ``maf`` -- ``(n1 + 2*n2) / (2*(n0 + n1 + n2))``. Presumably the
          frequency of the counted allele from genotype counts; it is not
          folded to the minor allele here -- confirm.
        * ``miss`` -- share of the ``3.0`` column among columns 0.0-3.0
          (presumably the missing-genotype rate -- confirm).
        * ``t`` -- ``beta / sigma``.
        * ``p`` -- p-values computed from ``t`` by ``parallel_p`` in
          ``n_jobs`` chunks.

    Notes
    -----
    ``pd.read_pickle`` can execute arbitrary code while loading; only point
    this function at trusted files.
    """
    aa = pd.concat([pd.read_pickle('/data/cox/%s/ukb_cal_chr%s_v2.pkl' % (prefix, i))
                    for i in range(1, 23)])

    # Split every index label once and reuse the pieces. Plain lists (rather
    # than map(), which is lazy on Python 3) keep all three columns consistent.
    fields = [elem.split('_') for elem in aa.index]
    aa['Chromosome'] = [int(parts[0]) for parts in fields]
    aa['Position'] = [int(parts[3]) for parts in fields]
    aa['ID'] = [parts[1] for parts in fields]

    aa[2.1] = 2.*aa[2.0]
    aa[0.1] = 2.*aa[0.0]
    aa['maf'] = aa[[1.0, 2.1]].sum(1)/(2.*aa[[0.0, 1.0, 2.0]].sum(1))
    aa['miss'] = aa[3.0]/(1.*aa[[0.0, 1.0, 2.0, 3.0]].sum(1))
    aa['t'] = aa['beta']/aa['sigma']

    # Compute p-values chunk-wise in parallel, then realign them to the frame's index.
    t_chunks = np.array_split(aa['t'], n_jobs)
    p_values = pd.concat(multiproc_pbar(parallel_p, [t_chunks], [True]))
    aa['p'] = p_values.loc[aa.index]
    return aa

In [None]:
def load_plink_calls_results(prefix):
    """Load PLINK ``.assoc.linear`` results for chromosomes 1-22 joined with the ``.bim`` annotation.

    Parameters
    ----------
    prefix : str
        Phenotype/run tag embedded in the association file names
        (``ukb_cal_chr<N>_v2.<prefix>.assoc.linear``).

    Returns
    -------
    pandas.DataFrame
        One row per variant ID. Association columns are renamed to the names
        used elsewhere in this notebook (``beta``, ``Chromosome``, ``Position``,
        ``ID``, ``A``, ``B``, ``Nd``, ``t``, ``p``); the ``.bim`` columns
        ``chr`` and ``pos`` are kept as well.
    """
    chromosomes = range(1, 23)

    # Variant annotation: one .bim per chromosome, indexed by its second column.
    bim_frames = []
    for chrom in chromosomes:
        bim_frames.append(pd.read_csv('/mnt/tmp/calls/ukb_cal_chr%s_v2.bim' % chrom,
                                      sep='\t', header=None, index_col=1))
    bim = pd.concat(bim_frames).rename(columns={0: 'chr', 3: 'pos', 4: 'A1', 5: 'A2'})

    # Association results, indexed the same way. Their own A1 column is dropped
    # so that A1/A2 in the merged frame both come from the .bim files.
    assoc_frames = []
    for chrom in chromosomes:
        assoc_frames.append(
            pd.read_csv('/data/GWAS-2/gwas/triples1/ukb_cal_chr%s_v2.%s.assoc.linear' % (chrom, prefix),
                        delim_whitespace=True, index_col=1))
    assoc = pd.concat(assoc_frames).drop(['A1'], axis=1)

    # Align the two tables on the shared index, then expose it as a column.
    merged = pd.concat([assoc, bim], axis=1).reset_index()
    final_names = {'BETA': 'beta', 'CHR': 'Chromosome', 'BP': 'Position',
                   'index': 'ID', 'A1': 'A', 'A2': 'B', 'NMISS': 'Nd', 'STAT': 't',
                   'P': 'p'}
    return merged.rename(columns=final_names)

### Calls data

#### Load calls summary statistics

In [None]:
# Phenotype runs to load; each has a results directory named '<prefix>_selectedcovs'.
phenotype_prefixes = ['andersen_list', 'andersen_MI', 'andersen_diabetes', 'andersen_canceronly',
                      'andersen_CHF', 'andersen_COPD', 'andersen_death', 'andersen_dementia',
                      'andersen_stroke']

pbar = ProgressBar()
# Maps '<prefix>_selectedcovs' -> DataFrame returned by load_calls_results.
tstats = {}
for prefix in pbar(phenotype_prefixes):
    run_name = prefix+'_selectedcovs'
    tstats[run_name] = load_calls_results(run_name)

#### LDSC genetic correlations for calls data

In [None]:
# Shell variables used by the shell cells below.
# Directory that holds the reference genotype files and the LD scores computed from them.
ldsc_refdir=/data/genreg/calls_10k_ref_maf0.01
# File passed to `plink2 --keep` when the reference files are built.
tenkref=/data/files/10k_reference.index
# Locations of the two LDSC scripts that are run below.
ldsc_py=/data/soft/dist/ldsc/ldsc.py
munge_sumstats_py=/data/soft/dist/ldsc/munge_sumstats.py
# Directory whose *.gz summary statistics are munged and correlated.
# NOTE(review): the Python cell that writes the summary statistics sets its own
# out_dir to '/data/genreg/output_calls_selected_covs/', which is not this
# path -- confirm which directory is intended.
out_dir=/data/genreg/output_calls_selected_covs_wo_death

# Left disabled; $ldsc_refdir has to exist before anything is written into it.
# mkdir $ldsc_refdir

In [None]:
# Create both working directories. -p makes the cell safe to re-run and
# guarantees the reference directory exists before plink2 writes into it.
mkdir -p "$ldsc_refdir" "$out_dir"

# For every chr*.bed fileset under /mnt/tmp/calls: keep only the samples listed
# in $tenkref, apply --maf 0.01, and write a new bed/bim/fam set with the same
# base name into $ldsc_refdir.
parallel --eta plink2 --bfile {.} --keep "$tenkref" --maf 0.01 --make-bed --out "$ldsc_refdir"/{/.} ::: /mnt/tmp/calls/chr*bed
# Compute LD scores (--l2, 1 cM window) for each reference fileset, next to its input.
parallel --eta "$ldsc_py" --bfile {.} --l2 --ld-wind-cm 1 --yes-really --out {.} ::: "$ldsc_refdir"/chr*bed
# Drop the "chr" prefix from the LD-score outputs only (the bed/bim/fam files
# keep it); presumably so that `--ref-ld-chr $ldsc_refdir/` finds <N>.l2.* files.
rename 's/chr//g' "$ldsc_refdir"/chr*l2*

# SNP list for `munge_sumstats.py --merge-alleles`: header line, then columns
# 2, 5 and 6 of every reference .bim file.
printf 'SNP\tA1\tA2\n' > "$ldsc_refdir/snplist.txt"
awk '{print $2"\t"$5"\t"$6}' "$ldsc_refdir"/chr*bim >> "$ldsc_refdir/snplist.txt"

In [None]:
# Directory the summary-statistics files are written to and the LDSC logs are read from.
# NOTE(review): the shell cells use their own $out_dir
# (/data/genreg/output_calls_selected_covs_wo_death) -- confirm the two are meant to differ.
out_dir = '/data/genreg/output_calls_selected_covs/'

In [None]:
# Write one gzipped, space-separated summary-statistics file per phenotype
# (<out_dir>/<name>.gz) with the columns SNP, A1, A2, N, Z, P.
# .items() instead of .iteritems() so the cell runs on Python 2 and 3.
for name, df in tstats.items():
    keep = (
        # drop variants with a squared effect estimate of 80 or more
        ((df['beta']**2) < 80)
        # drop chromosome 6, 26-34 Mb (presumably the extended MHC region -- confirm)
        & ~((df['Chromosome'] == 6) & (df['Position'] > 26e6) & (df['Position'] < 34e6))
        # drop variants whose 'miss' value is 0.02 or more
        & (df['miss'] < 0.02)
    )
    df = df[keep].reset_index()

    # The last two '_'-separated fields of the original index label are the two alleles.
    label_fields = df['index'].str.split('_')
    df['A'] = label_fields.str[-2]
    df['B'] = label_fields.str[-1]

    # dropna() removes rows with a missing value in ANY column, not only the exported ones.
    df.dropna().to_csv(join(out_dir, '%s.gz' % name), compression='gzip', index=False, sep=' ',
                       columns=['ID', 'A', 'B', 'Nd', 't', 'p'],
                       header=['SNP', 'A1', 'A2', 'N', 'Z', 'P'])

In [None]:
# Activate the virtualenv in this shell session so the LDSC scripts run below use its interpreter.
source ~/data/GWAS-2/genreg/ldsc_venv/bin/activate

In [None]:
# Convert every exported summary-statistics file in $out_dir to LDSC format
# (<name>.gz -> <name>.sumstats.gz). Files that are already *.sumstats.gz are
# excluded so that re-running the cell does not munge its own outputs again.
find "$out_dir" -maxdepth 1 -name '*.gz' ! -name '*.sumstats.gz' \
    | parallel --eta "$munge_sumstats_py" --sumstats {} --out {.} --merge-alleles "$ldsc_refdir/snplist.txt"

cd "$out_dir"
# Comma-separated list of all munged files, built from the glob rather than
# from parsed `ls` output.
files=$(printf '%s\n' *.sumstats.gz | paste -sd, -)
# For each trait, estimate its genetic correlation with every trait in the list
# (itself included); the results are logged to <trait>.log.
for file in *.sumstats.gz; do
    fl="${file%.sumstats.gz}"
    "$ldsc_py" --rg "$file,$files" --ref-ld-chr "$ldsc_refdir/" --w-ld-chr "$ldsc_refdir/" --out "$fl"
done

In [None]:
# Display labels for the traits, keyed by phenotype prefix.
renamedict = {'andersen_list':'Healthspan','andersen_canceronly':'cancer','andersen_MI':'MI',
              'andersen_diabetes':'diabetes',
              'andersen_CHF':'CHF','andersen_COPD':'COPD','andersen_death':'death','andersen_dementia':'dementia',
              'andersen_stroke':'stroke'}#,'surv_fa':'father_death','surv_mo':'mother_death'}

try:
    from StringIO import StringIO  # Python 2: accepts the byte strings read from the log
except ImportError:
    from io import StringIO  # Python 3


def _trait_label(key):
    """Return the display label for a summary-statistics key.

    File names derived from the loaded results carry a '_selectedcovs' suffix
    that the renamedict keys do not have, so the lookup is tried with and
    without it. Unknown keys are returned unchanged.
    """
    suffix = '_selectedcovs'
    base = key[:-len(suffix)] if key.endswith(suffix) else key
    return renamedict.get(key, renamedict.get(base, key))


# Maps '<name>' (from '<name>.sumstats.gz') -> table parsed from '<name>.log'.
cordf = {}

for filename in glob(join(out_dir,'*.sumstats.gz')):
    log_path = filename.replace('.sumstats.gz','.log')

    # Collect the lines between the summary header and the next empty line.
    table_lines = []
    with open(log_path,'r') as rl:
        in_table = False
        for line in rl:
            if 'Summary of Genetic Correlation Results' in line:
                in_table = True
                continue
            if in_table:
                if len(line)>1:
                    table_lines.append(line)
                else:
                    break

    if not table_lines:
        raise ValueError('No genetic correlation summary table found in %s' % log_path)

    cordf[basename(filename).split('.')[0]] = pd.read_csv(StringIO(''.join(table_lines)), sep=r'\s+')

# Square trait-by-trait tables: rg estimates (corrm) and their p-values (corrp).
# They are created empty and filled cell by cell, as before.
trait_keys = list(cordf)
corrm = pd.DataFrame(index=trait_keys, columns=trait_keys)
corrp = pd.DataFrame(index=trait_keys, columns=trait_keys)

for key1 in trait_keys:
    by_p2 = cordf[key1].set_index('p2')  # index once per trait instead of once per cell
    for key2 in trait_keys:
        p2_name = '%s.sumstats.gz'%key2
        corrm.loc[key1,key2] = by_p2.loc[p2_name,'rg']
        corrp.loc[key1,key2] = by_p2.loc[p2_name,'p']

corrm = corrm.rename(index=_trait_label, columns=_trait_label)
corrp = corrp.rename(index=_trait_label, columns=_trait_label)

# Fixed display order; the tables are transposed before reordering.
order = ['Healthspan','death','diabetes','stroke','COPD','MI','CHF','cancer','dementia']
corrm = corrm.T.loc[order,order]
corrp = corrp.T.loc[order,order]

In [None]:
# Save the p-value and rg matrices as two sheets ('p', then 'rg') of one workbook.
with pd.ExcelWriter('/data/tables/supdata3.xlsx') as writer:
    for sheet_name, table in (('p', corrp), ('rg', corrm)):
        table.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Blank out the diagonal and everything below it (row position >= column
# position) in both matrices, so each trait pair appears only once.
# Note: this cell overwrites corrm/corrp, so running it twice trims them again.
n_traits = len(corrm.columns)
for col_pos in range(n_traits):
    for row_pos in range(col_pos, n_traits):
        corrm.iloc[row_pos, col_pos] = np.nan
        corrp.iloc[row_pos, col_pos] = np.nan

# Reverse the column order, then drop the last row and last column,
# both of which are entirely blank after the masking above.
corrm = corrm.iloc[:, ::-1]
corrm = corrm.iloc[:-1, :-1]
corrp = corrp.iloc[:, ::-1]
corrp = corrp.iloc[:-1, :-1]

In [None]:
from matplotlib import rcParams

# Global figure font: 8 pt Arial, selected through the sans-serif family.
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
    'font.size': 8,
})

In [None]:
# Font weight per cell: 'bold' where the p-value is below 0.05/28, else 'regular'.
# NOTE(review): 28 is presumably the number of tests being corrected for --
# confirm that it matches the number of trait pairs actually shown.
boldmask = pd.DataFrame('regular', index=corrp.index, columns=corrp.columns)
boldmask[corrp<(0.05/28.)] = 'bold'

fig = pl.figure(figsize=(4,4))
ax = fig.add_subplot(111)
# Heat map of rg on a fixed [-1, 1] colour scale.
cm = ax.imshow(corrm.values.astype(float), interpolation=None, cmap='coolwarm_r', vmin=-1, vmax=1)
pl.xticks(np.arange(len(corrm.columns)), corrm.columns, rotation=35, ha='left');
pl.yticks(np.arange(len(corrm.index)), corrm.index);

# Annotate every cell that has a p-value with its rg, rounded to two decimals.
for y_index, y in enumerate(corrm.index):
    for x_index, x in enumerate(corrm.columns):
        # pd.notnull() instead of `is not np.nan`: the identity test only works
        # when the cell holds the np.nan singleton itself; any other NaN object
        # (e.g. one parsed from a log) would pass it and be drawn as "nan".
        if pd.notnull(corrp.loc[y,x]):
            label = '%.2f'%corrm.values[y_index, x_index]
            ax.text(x_index, y_index, label, color='black', ha='center', va='center',
                    weight=boldmask.iloc[y_index, x_index])

# No frame around the matrix; tick labels on the top edge.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('top')

# pl.title('Genetic Correlation')
pl.colorbar(cm, fraction=0.046, pad=0.04)
pl.tight_layout()

In [None]:
# Save the heat map as PDF and as PNG, both with dpi=300.
for figure_path in ('/data/tables/GCcalls.pdf', '/data/tables/GCcalls.png'):
    fig.savefig(figure_path, dpi=300)

In [None]:
# Table of rg and p between the 'andersen_list' trait and every other trait.
# The parsed logs are keyed by file name, which may carry a '_selectedcovs'
# suffix; accept either form instead of failing with a KeyError.
healthspan_key = 'andersen_list'
if healthspan_key not in cordf:
    healthspan_key += '_selectedcovs'

healthspan_rg = cordf[healthspan_key].set_index('p2').loc[:, ['rg', 'p']]
# Remove the trait's correlation with itself.
healthspan_rg = healthspan_rg.drop(['%s.sumstats.gz' % healthspan_key])


def _p2_label(p2):
    """Turn a '<name>.sumstats.gz' entry into its display label.

    The extension is stripped first; the remaining name is looked up in
    renamedict with and without the '_selectedcovs' suffix. Names that are
    not in renamedict are returned without the extension, otherwise unchanged.
    """
    key = p2.split('.')[0]
    suffix = '_selectedcovs'
    base = key[:-len(suffix)] if key.endswith(suffix) else key
    return renamedict.get(key, renamedict.get(base, key))


# The target is a .tsv file, so write tab-separated values (to_csv defaults to commas).
healthspan_rg.rename(index=_p2_label).to_csv('/data/tables/suptable15.tsv', sep='\t')