In [1]:
import os, sys
sys.path.append("../../")
import collections
import re
import numpy as np
from utils.readvcf_snp import ReadVCF

pheno_file = "/cbscratch/franco/datasets/gtex_v8/phenotypes/gtex_v8_basic_phenotypes.txt"
#admix_file = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex-admixed0.9.txt"
gteur_file = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex_v8_eur.sample"
gtall_file = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex_v8.sample"

def read_samples(samplefile):
    if os.path.exists(samplefile):
        with open(samplefile, 'r') as samfile:
            sample = 0
            samplenames = list()
            next(samfile)
            next(samfile)
            for line in samfile:
                if re.search('^#', line):
                    continue
                sample += 1
                samplenames.append(line.strip().split()[0])
        nsample = sample
        samplenames = samplenames
        return samplenames, nsample
    
allsamples, nall = read_samples(gtall_file)

sample_pheno_dict = dict()
with open(pheno_file) as instream:
    for line in instream:
        if line.strip() == "":
            continue
        if re.search("#", line):
            continue
        if re.search("dbGaP", line):
            header = line.strip().split()
            continue
        arr = line.strip().split("\t")
        sampleid = arr[1]
        race = arr[5]
        sample_pheno_dict[sampleid] = race
        
# "1" "Asian"
# "2" "Black or African American"
# "3" "White"
# "4" "American Indian or Alaska Native"
# "98" "Not Reported"
# "99" "Unknown"

In [2]:
def calc_simple_fst_power(pops_gt, pops): 
    
    nsnps = pops_gt[pops[0]].shape[0]
    for j in range(1, len(pops)):
        if nsnps != pops_gt[pops[j]].shape[0]:
            print("SNP numbers differ between populations")
            raise
    
    power_list = list()
    fst_list = list()
    # all_gt = np.hstack((eur_gt1kg, afr_gt1kg))
    all_gt = np.hstack(tuple([pops_gt[x] for x in pops]))
    all_n  = all_gt.shape[1] # size of total population
    for snpi in range(all_gt.shape[0]):
        #print(snp_info[snpi])
        # maf_all = sum(all_gt[snpi,:] / 2 / len(all_gt[snpi,:]))
        # maf_eur = sum(eur_gt[snpi,:] / 2 / len(eur_gt[snpi,:]))
        # maf_afr = sum(afr_gt[snpi,:] / 2 / len(afr_gt[snpi,:]))
        maf_all = sum(all_gt[snpi,:] / 2 / len(all_gt[snpi,:]))
        maf_pops = [sum(pops_gt[x][snpi,:] / 2 / len(pops_gt[x][snpi,:])) for x in pops]

        c_pops = [pops_gt[x].shape[1]/all_n for x in pops]
        #c_eur = eur_n / all_n
        #c_afr = afr_n / all_n
        
        p_all = maf_all*(1-maf_all)
        sum_p = np.sum(np.array([c_pops[i]*maf_pops[i]*(1-maf_pops[i]) for i in range(len(pops))]))
        power = all_n * sum_p # (c_eur*maf_eur*(1-maf_eur) + c_afr*maf_afr*(1-maf_afr) )
        
        Fst = (p_all  - sum_p ) / p_all
        power_list.append(power)
        fst_list.append(Fst)
    return fst_list, power_list


def get_gt_count(dosage, allele='ref'):
    n_het = dosage.count(1)
    if allele == "ref":
        n_hom = dosage.count(0)
    if allele == "alt":
        n_hom = dosage.count(2)
    return n_hom, n_het

def weir_cockerman_fst(pops_gt, pops=["eur", "afr"], alleles=["ref", "alt"]):
    
    fst_list = list()
    nsnps = pops_gt[pops[0]].shape[0]
    if nsnps != pops_gt[pops[1]].shape[0]:
        print("SNP numbers differ between populations")
        raise
        
    for snpi in range(nsnps):
        n_pops = len(pops)
        n_alleles = len(alleles)
        n = np.zeros(n_pops)
        p = np.zeros((n_pops, n_alleles))
        pbar = np.zeros(n_alleles)
        hbar = np.zeros(n_alleles)
        ssqr = np.zeros(n_alleles)

        nbar = 0
        sum_nsqr = 0
        for pop in range(n_pops):
            for al in range(n_alleles):
                n_hom, n_het = get_gt_count(list(pops_gt[pops[pop]][snpi,:]), allele=alleles[al])
                n[pop] += n_hom + 0.5*n_het
                p[pop,al] = n_het + 2*n_hom

                nbar += n[pop]
                pbar[al] += p[pop][al]
                hbar[al] += n_het
            for al in range(n_pops):
                p[pop,al] /= 2.0*n[pop]

            sum_nsqr += (n[pop] * n[pop])

        n_sum = sum(n)
        nbar  = n_sum / n_pops

        for al in range(n_alleles):
            pbar[al] /= n_sum * 2.0
            hbar[al] /= n_sum

        for al in range(n_alleles):
            for pop in range(n_pops):
                ssqr[al] += n[pop]*(p[pop,al] - pbar[al])*(p[pop,al] - pbar[al])
            ssqr[al] /= (n_pops-1)*nbar
        nc = (n_sum - (sum_nsqr / n_sum)) / (n_pops - 1)

        snp_Fst = np.zeros(n_alleles)
        a = np.zeros(n_alleles)
        b = np.zeros(n_alleles)
        c = np.zeros(n_alleles)
        r = n_pops
        sum_a = 0
        sum_all = 0
        for al in range(n_alleles):
            a[al] = (ssqr[al] - ( pbar[al]*(1.0-pbar[al]) - (((r-1.0)*ssqr[al])/r) - (hbar[al]/4.0) )/(nbar-1.0))*nbar/nc;
            b[al] = (pbar[al]*(1.0-pbar[al]) - (ssqr[al]*(r-1.0)/r) - hbar[al]*( ((2.0*nbar)-1.0) / (4.0*nbar) ))*nbar / (nbar-1.0) ;
            c[al] = hbar[al] / 2.0;
            snp_Fst[al] = a[al]/(a[al]+b[al]+c[al]);

            if not np.all([np.isnan(a[al]),np.isnan(b[al]),np.isnan(c[al])]):
                sum_a += a[al]
                sum_all += a[al] + b[al] + c[al]
        fst = sum_a/sum_all
        fst_list.append(fst)
    return fst_list

In [3]:
import json
from utils import utils

tissue_file = "../../plots/tissue_table.txt"
json_file   = "../../gtex_v8_metadata.json"
tshorts, tfulls, tstrings = utils.read_tissues_str(tissue_file)
with open(json_file) as instream:
    gtex_meta = json.load(instream)
tissue_colors = dict()
tissue_names = dict()
tissue_nsamples = dict()

for tshort, tfull, tstring in zip(tshorts, tfulls, tstrings):
    if tshort in tshorts:
        tissue_names[tshort] = tstring
        tissue_colors[tshort] = "#" + gtex_meta[tfull]["colorHex"]
        tissue_nsamples[tshort] = gtex_meta[tfull]["rnaSeqSampleCount"]
        
brain_tissues = ['bam', 'ban', 'bca', 'bceh', 'bce', 'bco', 'bfr', 'bhi', 'bhy', 'bnu', 'bpu', 'bsp', 'bsu']
altsb_tissues = ['haa', 'pan', 'spl', 'wb']

SNPRES_FIELDS = ['rsid', 'chrom', 'pos', 'logp', 'maf']
class SNPRes(collections.namedtuple('_SNPRes', SNPRES_FIELDS)):
    __slots__ = ()

def tejaas(filepath):
    res = list()
    with open(filepath, 'r') as mfile:
        next(mfile)
        for line in mfile:
            arr   = line.strip().split("\t")
            rsid  = arr[0]
            #chrom = rsid.split("_")[0][3:]
            chrom = int(arr[1])
            pos   = int(arr[2])
            p     = float(arr[7])
            logp  = np.log10(p) if p!=0 else np.log10(10e-30)
            maf   = float(arr[3])
            res.append(SNPRes(rsid=rsid, chrom=chrom, pos=pos, logp=-logp, maf=maf))
    return res    
    
basename = "protein_coding_lncRNA_{:s}_knn30_cut5e-8"
gammas = ["gamma01", "gamma0006"]

In [4]:
basepath = "/cbscratch/franco/trans-eqtl"
trans_dict = dict()
for tissue in tshorts:
    if tissue in altsb_tissues:
        config = basename.format(gammas[1])
    else:
        config = basename.format(gammas[0])
    tejaas_file = os.path.join(basepath, config, tissue, "trans_eqtls_ldpruned.txt")
    
    if not os.path.exists(tejaas_file):
        print("{:s} has no trans-eqtl results".format(tissue))
        continue
    print("Loading ", tissue, end="")
    transeqtls = tejaas(tejaas_file)
    if len(transeqtls) > 0:
        trans_dict[tissue] = transeqtls
        print(" has {:d} trans-eqtls".format(len(transeqtls)))
    else:
        trans_dict[tissue] = []
        print(" has 0 trans-eqtls")

Loading  as has 586 trans-eqtls
Loading  av has 463 trans-eqtls
Loading  ag has 184 trans-eqtls
Loading  aa has 1298 trans-eqtls
Loading  ac has 1539 trans-eqtls
Loading  at has 391 trans-eqtls
Loading  bam has 1182 trans-eqtls
Loading  ban has 954 trans-eqtls
Loading  bca has 19 trans-eqtls
Loading  bceh has 128 trans-eqtls
Loading  bce has 40 trans-eqtls
Loading  bco has 64 trans-eqtls
Loading  bfr has 32 trans-eqtls
Loading  bhi has 51 trans-eqtls
Loading  bhy has 83 trans-eqtls
Loading  bnu has 379 trans-eqtls
Loading  bpu has 10 trans-eqtls
Loading  bsp has 61 trans-eqtls
Loading  bsu has 28 trans-eqtls
Loading  br has 505 trans-eqtls
Loading  ebv has 297 trans-eqtls
Loading  fib has 209 trans-eqtls
Loading  cols has 75 trans-eqtls
Loading  colt has 1174 trans-eqtls
Loading  esog has 17 trans-eqtls
Loading  esom has 9 trans-eqtls
Loading  esomu has 20 trans-eqtls
Loading  haa has 64 trans-eqtls
Loading  hlv has 247 trans-eqtls
Loading  kc has 14 trans-eqtls
Loading  liv has 37 tra

In [5]:
teqtl_varids = list()
for tissue in tshorts:
    teqtl_varids += [snp.rsid for snp in trans_dict[tissue]]
teqtl_varids = list(set(teqtl_varids))

chrm_teqtls = dict()
suma = 0
for chrm in range(1,23):
    chrm_teqtls[chrm] = [x for x in teqtl_varids if x.startswith("chr{:d}_".format(chrm))]
    print(f"chr{chrm} has {len(chrm_teqtls[chrm])} trans-eqtls")
    suma += len(chrm_teqtls[chrm])
print(suma)
print(len(teqtl_varids))

chr1 has 1423 trans-eqtls
chr2 has 1629 trans-eqtls
chr3 has 1410 trans-eqtls
chr4 has 1180 trans-eqtls
chr5 has 1003 trans-eqtls
chr6 has 1176 trans-eqtls
chr7 has 1075 trans-eqtls
chr8 has 855 trans-eqtls
chr9 has 906 trans-eqtls
chr10 has 983 trans-eqtls
chr11 has 853 trans-eqtls
chr12 has 827 trans-eqtls
chr13 has 495 trans-eqtls
chr14 has 637 trans-eqtls
chr15 has 722 trans-eqtls
chr16 has 915 trans-eqtls
chr17 has 720 trans-eqtls
chr18 has 529 trans-eqtls
chr19 has 414 trans-eqtls
chr20 has 514 trans-eqtls
chr21 has 301 trans-eqtls
chr22 has 284 trans-eqtls
18851
18851


In [6]:
SNPGT_FIELDS = ['varid', 'chrom', 'pos', 'maf', 'dosage']
class SNPGT(collections.namedtuple('_SNPGT', SNPGT_FIELDS)):
    __slots__ = ()

full_teqtls_gt = collections.defaultdict(dict)
first_donors = list()
for chrm in range(1,23):
    print(f"Reading CHR{chrm}")
    f_vcf = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_SHAPEIT2/0.01/GTEX_v8_2020-02-21_WGS_838Indiv_Freeze.SHAPEIT2_phased_NoMissingGT_SNPfilter_MAF0.01_chr{:d}.vcf.gz".format(chrm)
    # f_vcf = "/cbscratch/franco/datasets/gtex_v8/genotypes/vcfs_SHAPEIT2/ldpruned/GTEX_v8.SHAPEIT2_chr1.ldpruned.vcf.gz"
    # samplefile = "/cbscratch/franco/datasets/gtex_v8/genotypes/gtex_v8.sample"
    samplefile = None
    vcf = ReadVCF(f_vcf, snplist=chrm_teqtls[chrm])
    gtfull = vcf.dosage
    gt_donors = vcf.donor_ids
    if chrm == 1:
        first_donors = gt_donors
    else:
        if first_donors != gt_donors or len(gt_donors) != len(first_donors):
            print("donor error!")
            raise
    snpinfos = vcf.snpinfo
    for i,snp in enumerate(snpinfos):
        full_teqtls_gt[chrm][snp.varid] = SNPGT(varid=snp.varid, chrom=snp.chrom, pos=snp.bp_pos, maf=snp.maf, dosage=gtfull[i,:])

Reading CHR1
all 1423 found!
Reading CHR2
all 1629 found!
Reading CHR3
all 1410 found!
Reading CHR4
all 1180 found!
Reading CHR5
all 1003 found!
Reading CHR6
all 1176 found!
Reading CHR7
all 1075 found!
Reading CHR8
all 855 found!
Reading CHR9
all 906 found!
Reading CHR10
all 983 found!
Reading CHR11
all 853 found!
Reading CHR12
all 827 found!
Reading CHR13
all 495 found!
Reading CHR14
all 637 found!
Reading CHR15
all 722 found!
Reading CHR16
all 915 found!
Reading CHR17
all 720 found!
Reading CHR18
all 529 found!
Reading CHR19
all 414 found!
Reading CHR20
all 514 found!
Reading CHR21
all 301 found!
Reading CHR22
all 284 found!


In [11]:
# read the samples for each tissue
expr_dir = "/cbscratch/franco/trans-eqtl/new_preprocess_feb2020_freeze/gtex_v8/expression/tpms"
expr_file = os.path.join(expr_dir, "{:s}_tpms_qcfilter.txt")

tissue_samples = dict()
for tissue in tshorts:
    with open(expr_file.format(tissue)) as instream:
        samplenames = instream.readline().strip().split("\t")[1:]
        tissue_samples[tissue] = samplenames

In [33]:
def match_samples(gt_donors, expr_donors):
    common  = [x for x in gt_donors if x in expr_donors]
    vcfmask = [gt_donors.index(x) for x in common]
    return vcfmask, common

def find_ancestry(samples, sample_pheno_dict):
    ix_eur = list()
    ix_afr = list()
    for i,sid in enumerate(samples):
        if sample_pheno_dict[sid] == '2':
            ix_afr.append(i)
        if sample_pheno_dict[sid] == '3':
            ix_eur.append(i)
    return ix_eur, ix_afr

for tissue in tshorts:
tissue = "as"
vcfmask, sampids = match_samples(gt_donors, tissue_samples[tissue])
pops = ["eur", "afr"]
alleles = ["ref", "alt"]
ix_eur, ix_afr = find_ancestry(sampids, sample_pheno_dict)
print(len(vcfmask))
for snp in trans_dict[tissue]:
    print(snp.rsid)
    chrm = int(snp.rsid.split("_")[0][3:])
    snp_data = full_teqtls_gt[chrm][snp.rsid]
    pops_gt = dict()
    pops_gt["eur"] = np.array([snp_data.dosage[i] for i in ix_eur]).reshape(1,-1)
    pops_gt["afr"] = np.array([snp_data.dosage[i] for i in ix_afr]).reshape(1,-1)
    fsts_gtex, power_gtex = calc_simple_fst_power(pops_gt, pops)
    weir_fsts_gtex = weir_cockerman_fst(pops_gt, pops, alleles)
    raise

581
chr1_1170732_A_G_b38


RuntimeError: No active exception to reraise

In [35]:
print(fsts_gtex, power_gtex, weir_fsts_gtex)

[0.003201556332503024] [49.06665092179091] [0.0077605747023750836]
