In [1]:
import os
import glob
import pandas as pd 
from chromolooper import sgls
import re
from myvariant import MyVariantInfo
import numpy as np

pd.options.mode.chained_assignment = None  # default='warn'

os.chdir('<project-dir>')

outdir = 'results/hg38/eqtl/eqtl_catalogue/'
os.makedirs(outdir, exist_ok=True)

## Load all eQTL data

In [2]:
# helper function to find snps only
def is_snp(variant_id):
    var1, var2 = variant_id.split('_')[-2:]
    if len(var1) == 1 and len(var2) == 1:
        return(True)
    return(False)

unique_snps = set()
credible_set_fns = glob.glob('results/hg38/eqtl/eqtl_catalogue/*.credible_sets.tsv.gz')

for fn in credible_set_fns:
    tdf = pd.read_table(fn)
    tdf = tdf[['rsid', 'variant']]
    tdf = tdf.loc[tdf.variant.apply(is_snp)]
    snps = (tuple(x) for x in tdf.values)
    unique_snps.update(snps)

## Save the unique set of SNPs within eQTLs

In [3]:
# make a dataframe of the unique snps
unique_snps_df = pd.DataFrame(unique_snps)
unique_snps_df.columns = ['rsid', 'gnomad_var_id']

# convert to full gnomad variant id
unique_snps_df.loc[:, 'gnomad_var_id'] = unique_snps_df.loc[:, 'gnomad_var_id'].str.replace('_', '-')

# split the gnomad variant id to get other fields
unique_snps_extra_cols_df = unique_snps_df.gnomad_var_id.str.split('-', expand=True)
unique_snps_extra_cols_df.columns = ['chrom', 'pos', 'ea', 'nea']
unique_snps_extra_cols_df['genome'] = 'hg38'

# concat the two
unique_snps_df = pd.concat([unique_snps_df, unique_snps_extra_cols_df], axis=1)

In [4]:
snp_fn = 'results/hg38/eqtl/eqtl_catalogue/snps.hg38.tsv'
unique_snps_df = unique_snps_df[['rsid', 'chrom', 'pos', 'ea', 'nea', 'gnomad_var_id']]
unique_snps_df.to_csv(snp_fn, index=False, sep='\t')

In [5]:
unique_snps_df.shape

(687515, 6)

## Create a test set using QTD000479

This eQTL Catalog ID corresponds to (Schmiedel_2018 - CD4_T-cell_naive - CL_0000624 - CD4+ T cell - naive)

In [6]:
test_fn = 'results/hg38/eqtl/eqtl_catalogue/QTD000479.credible_sets.tsv.gz'
test_eqtls = pd.read_table(fn)
test_eqtls.loc[:, 'gnomad_var_id'] = test_eqtls.loc[:, 'variant'].str.replace('_', '-')
test_snps_df = unique_snps_df.loc[unique_snps_df.gnomad_var_id.isin(test_eqtls.gnomad_var_id.tolist())]

In [7]:
test_fn = 'results/hg38/eqtl/eqtl_catalogue/snps.QTD000479.hg38.tsv'
test_snps_df = test_snps_df[['rsid', 'chrom', 'pos', 'ea', 'nea', 'gnomad_var_id']]
test_snps_df.to_csv(test_fn, index=False, sep='\t')

In [8]:
test_snps_df.shape

(56473, 6)