In [1]:
import os, sys
import glob
import numpy as np
import pandas as pd
from pathlib import Path

# Widen pandas' display limits so wide tables are shown with up to 100 columns
# and up to 200 characters per output line.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)

# Real data download and format
1. Download the 30 GWASs, the 9 genic and 127 cell-type-specific annotations, and the 1000 Genomes reference from https://drive.google.com/file/d/15btr71PD1lI6oqrOtf_T-i8aZM0YCRaP/view?usp=sharing.
2. Create a folder `real/data` and save all the real data in this folder.
3. Exclude SNPs in the MHC region (CHR6: 25-35 Mb).
4. Merge the 9 genic annotations and the 127 cell-type-specific annotations; format the GWAS data and the annotations as required by PALM.
5. Create a folder `gwas30_noMHC` in `real` and save the processed GWASs in this folder; save the merged annotations as `region9tissue127.csv` in `real/data`.

In [2]:
# Folder holding the downloaded files, and folder receiving the formatted GWASs
raw_dir = '../real/data'
format_dir = '../real/gwas30_noMHC'
# Create both folders (including missing parents); existing folders are left alone
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(format_dir, exist_ok=True)

# Reference panel: a space-delimited table read from the raw data folder
ref = pd.read_table(f'{raw_dir}/reference.1000G.maf.0.005.txt', sep=' ')

In [3]:
# gwas names
# Every *.csv in raw_dir is a GWAS except the annotation tables, which are
# recognised by these substrings of their file name (this also covers the
# merged `region9tissue127.csv` written later into the same folder).
annotation_tags = ('tissue127', 'region9')
# - Test the file NAME, not the whole path, so a parent directory whose name
#   happens to contain a tag cannot filter out every GWAS.
# - Path(...).stem strips only the final `.csv` and is separator-agnostic, so
#   names containing dots stay consistent with the later f'{raw_dir}/{k}.csv' read.
# - sorted(...) makes the processing order deterministic (glob order is arbitrary).
gwas_names = sorted(
    Path(file).stem
    for file in glob.glob(f"{raw_dir}/*.csv")
    if not any(tag in Path(file).name for tag in annotation_tags)
)

In [4]:
# Format gwas
# For each raw GWAS: harmonise column names, attach the reference columns,
# drop SNPs in the MHC region, and write the result to format_dir.
# MHC region: chromosome 6, positions strictly between 25 Mb and 35 Mb.
mhc_chr, mhc_start, mhc_end = 6, 25e6, 35e6
for k in gwas_names:
    out_path = f'{format_dir}/{k}.csv'
    if os.path.exists(out_path):
        # formatted on an earlier run; skip the expensive read/merge
        continue
    # Harmonise alternative column names to `SNP` / `pvalue`. rename() ignores
    # keys that are not present, so no membership checks are needed, and it
    # returns a new frame instead of mutating in place.
    gwas = pd.read_csv(f'{raw_dir}/{k}.csv', sep='\t').rename(
        columns={'snpid': 'SNP', 'p': 'pvalue'}
    )
    # merge with ref: SNP, CHR, BP, MAF, A1, A2 (inner join on SNP)
    merge = pd.merge(ref, gwas[['SNP', 'pvalue']], on='SNP')
    # exclude MHC region
    in_mhc = (
        (merge['CHR'] == mhc_chr)
        & (merge['BP'] > mhc_start)
        & (merge['BP'] < mhc_end)
    )
    keep = merge.loc[~in_mhc]
    # save: write to a temporary file and rename it into place, so that an
    # interrupted run never leaves a truncated CSV that the exists() guard
    # above would mistake for a finished one.
    tmp_path = f'{out_path}.tmp'
    keep.to_csv(tmp_path, sep='\t', index=False)
    os.replace(tmp_path, out_path)
    print(f'Written {k}.')
print('***** Finish formatting 30 GWASs. *****')

***** Finish formatting 30 GWASs. *****


In [5]:
# Combine the two annotation tables into one, keeping only SNPs present in both
region9 = pd.read_table(f'{raw_dir}/region9.csv')
tissue127 = pd.read_table(f'{raw_dir}/tissue127.csv')
reg9tis127 = region9.merge(tissue127, how='inner', on='SNP')
# Store the merged table (tab-separated, no index column) next to the raw files
reg9tis127.to_csv(f'{raw_dir}/region9tissue127.csv', index=False, sep='\t')
print('***** Finish merging region9 and tissue127. *****')

***** Finish merging region9 and tissue127. *****


In [6]:
# Reload the merged annotation table and one formatted GWAS from disk for inspection
reg9tis127 = pd.read_table(f'{raw_dir}/region9tissue127.csv')
gwas = pd.read_table(f'{format_dir}/Bipolar_Disorder.csv')

In [7]:
# Preview the first five rows of the example GWAS loaded above
gwas.head()

Unnamed: 0,SNP,CHR,BP,MAF,A1,A2,pvalue
0,rs10907175,1,1130727,0.084493,A,C,0.352
1,rs2887286,1,1156131,0.175944,T,C,0.3904
2,rs6685064,1,1211292,0.0666,C,T,0.9438
3,rs1571150,1,1474304,0.333002,C,A,0.6677
4,rs7290,1,1477244,0.306163,T,C,0.8478


In [8]:
# Preview the first five rows of the merged annotation table
reg9tis127.head()

Unnamed: 0,downstream,exonic,intergenic,intronic,ncRNA_exonic,ncRNA_intronic,upstream,UTR3,UTR5,SNP,E001,E002,E003,E004,E005,E006,E007,E008,E009,E010,E011,E012,E013,E014,E015,E016,E017,E018,E019,E020,E021,E022,E023,E024,E025,E026,E027,E028,E029,E030,E031,E032,E033,E034,E035,E036,E037,E038,E039,E040,...,E080,E081,E082,E083,E084,E085,E086,E087,E088,E089,E090,E091,E092,E093,E094,E095,E096,E097,E098,E099,E100,E101,E102,E103,E104,E105,E106,E107,E108,E109,E110,E111,E112,E113,E114,E115,E116,E117,E118,E119,E120,E121,E122,E123,E124,E125,E126,E127,E128,E129
0,0,0,1,0,0,0,0,0,0,rs1000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,rs10000010,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,rs10000023,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,rs1000003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,rs10000033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
