In [3]:
import os
import glob
import pandas as pd 
from chromolooper import sgls
import re
from myvariant import MyVariantInfo
import numpy as np

pd.options.mode.chained_assignment = None  # default='warn'

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/Loop-Catalog-SGLs/')

outdir = 'results/hg38/finemapping/snps/'
os.makedirs(outdir, exist_ok=True)

## Load all finemapped snp data

In [7]:
# add meta information
causal_metadata_fn = 'workflow/scripts/finemap_sgls/causal_db/init.gwas_study.causal_db.immune_select_samples.tsv'
causal_metadata = pd.read_table(causal_metadata_fn, header=None)

causal_metadata_mapper =  causal_metadata.iloc[:, [2, 8, 18]]
causal_metadata_mapper.columns = ['mesh_term', 'author', 'filename']

In [10]:
# load all the finemapping snp data
fns = glob.glob('/mnt/BioAdHoc/Groups/vd-ay/sourya/Projects/2020_IQTL_HiChIP/Data/CausalDB/credible_set/*_total_credible_set.txt')

all_data = []
for i, fn in enumerate(fns):

    info = fn.split('/')

    causaldb_fn = info[-1].split('_')[0]

    if causaldb_fn in causal_metadata_mapper.filename.tolist():

        # loading the data
        tdf = pd.read_table(fn)
        if tdf.shape[0] > 0:
            tdf.loc[:, 'causaldb_fn'] = causaldb_fn
            all_data.append(tdf)

In [11]:
all_df = pd.concat(all_data)

# add causaldb metadata
all_df = all_df.merge(causal_metadata_mapper, left_on='causaldb_fn', right_on='filename')

all_df.loc[:, 'rsID'] = all_df.loc[:, 'rsID'].apply(lambda x: 'rs{}'.format(x))

ValueError: No objects to concatenate

In [5]:
# extracing all the unique rsids for query
rs_numbers = all_df.rsID.unique().tolist()

len(rs_numbers)

8840

## Querying and adding coordinates from hg38 with checks

In [6]:
# querying the variants in hg38
mv = MyVariantInfo()
rs_query_hg38 = mv.querymany(rs_numbers, 
                        scopes='dbsnp.rsid',
                        fields=['hg38.start', 'dbsnp.chrom'],
                        assembly='hg38',
                        as_dataframe=True,
                        verbose=True)

# drop NA's and duplicates
rs_query_hg38.dropna(subset=['hg38.start'], inplace=True)
rs_query_hg38.drop_duplicates(subset=['dbsnp.chrom', 'hg38.start'], inplace=True)

                        
# querying the variants in hg19
rs_query_hg19 = mv.querymany(rs_numbers, 
                        scopes='dbsnp.rsid',
                        fields=['hg19.start', 'dbsnp.chrom'],
                        assembly='hg19',
                        as_dataframe=True,
                        verbose=True)

# drop NA's and duplicates
rs_query_hg19.dropna(subset=['hg19.start'], inplace=True)
rs_query_hg19.drop_duplicates(subset=['dbsnp.chrom', 'hg19.start'], inplace=True)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-8840...done.
Finished.
3923 input query terms found dup hits:
	[('rs34372802', 2), ('rs77411920', 2), ('rs6657092', 3), ('rs11264130', 3), ('rs552564', 2), ('rs120
398 input query terms found no hit:
	['rs142528524', 'rs113429865', 'rs79725762', 'rs77516441', 'rs111289255', 'rs527435679', 'rs14809495
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-8840...done.
Finished.
3920 input query terms found dup hits:
	[('rs34372802', 2), ('rs77411920', 2), ('rs6657092', 3), ('rs11264130', 3), ('rs552564', 2), ('rs

In [7]:
rs_query = rs_query_hg38.merge(rs_query_hg19, suffixes=['_hg38', '_hg19'], left_index=True, right_index=True)

In [8]:
rs_query.drop(['_id_hg38', '_score_hg38', 'dbsnp._license_hg38',
                '_id_hg19', '_score_hg19', 'dbsnp._license_hg19'], axis=1, inplace=True)

In [9]:
rs_query['hg38.start'] = rs_query['hg38.start'].astype(int)
rs_query['hg19.start'] = rs_query['hg19.start'].astype(int)

In [10]:
# merge back to the all_df
all_hg38_df = all_df.merge(rs_query, left_on='rsID', right_index=True)

# filtering for snp where the rs query returns the same hg19 coordinates 
# as the myvariant service
chr_check = all_hg38_df['CHR'].astype(str) == all_hg38_df['dbsnp.chrom_hg19'].astype(str)
snp_check = all_hg38_df['BP'].astype(int) == all_hg38_df['hg19.start'].astype(int)

good_hg38_df = all_hg38_df.loc[chr_check & snp_check]

#### Checking issues with incongruent hg19 coordiates between CAUSALdb and dbSNP

In [11]:
print('There are {} initial SNP instances'.format(all_hg38_df.shape[0]))

There are 10466 initial SNP instances


In [12]:
print('There are {} good SNP instances'.format(good_hg38_df.shape[0]))

There are 10114 good SNP instances


In [13]:
bad_pos_hg38_df = all_hg38_df.loc[chr_check & ~snp_check][['rsID', 'CHR', 'BP', 'dbsnp.chrom_hg19', 'hg19.start']]
print('There are {} bad SNP instances due to pos'.format(bad_pos_hg38_df.shape[0]))

There are 260 bad SNP instances due to pos


In [14]:
bad_pos_hg38_df.head()

Unnamed: 0,rsID,CHR,BP,dbsnp.chrom_hg19,hg19.start
1338,rs34937962,1,200761255,1,200761257
2917,rs34937962,1,200761255,1,200761257
1353,rs66733041,2,100762450,2,100762452
3303,rs66733041,2,100762450,2,100762452
1406,rs66739067,2,100793341,2,100793343


In [15]:
bad_chr_hg38_df = all_hg38_df.loc[~chr_check & ~snp_check][['rsID', 'CHR', 'BP', 'dbsnp.chrom_hg19', 'hg19.start']]
print('There are {} bad SNP instances due to chr'.format(bad_chr_hg38_df.shape[0]))

There are 92 bad SNP instances due to chr


In [16]:
bad_chr_hg38_df.head()

Unnamed: 0,rsID,CHR,BP,dbsnp.chrom_hg19,hg19.start
2067,rs1388113297,12,9910164,8,122004498
4894,rs1388113297,12,9910164,8,122004498
4793,rs1388091085,12,9123932,4,160140439
4794,rs1388091076,12,9123477,9,70154083
4795,rs1388091050,12,9122609,4,146752648


## Save the hg38 versions of the total_credible_set.txt

In [17]:
# swap the hg19 with hg38 coords
good_hg38_df['BP'] = good_hg38_df['hg38.start']

# drop uncessary columns
good_hg38_df.drop(['dbsnp.chrom_hg38',
                    'hg38.start',
                    'notfound_hg38',
                    'dbsnp.chrom_hg19',
                    'hg19.start',
                    'notfound_hg19'], axis=1, inplace=True)

In [18]:
good_hg38_df.columns.tolist()

['CHR',
 'BP',
 'rsID',
 'MAF',
 'EA',
 'NEA',
 'BETA',
 'SE',
 'P',
 'Zscore',
 'PAINTOR',
 'CAVIARBF',
 'FINEMAP',
 'meta_id',
 'block_id',
 'label',
 'causaldb_fn',
 'mesh_term',
 'author',
 'filename']

In [19]:
tmpl = 'results/hg38/finemapping/snps/singles/{}_total_credible_set.hg38.txt'
for filename, grp_df in good_hg38_df.groupby('filename'):
    outfn = tmpl.format(filename)
    grp_df.drop(['mesh_term', 'author', 'filename'], axis=1, inplace=True)
    grp_df.to_csv(outfn, sep='\t', index=False, header=True)