In [1]:
import os
import numpy as np
import pandas as pd

import pyranges as pr


In [2]:
#Location of the vcf file holding the eQTL variants (this cell only sets the path; the file is read in a later cell)

eqtl_file = '/home/drk/seqnn/data/gtex_fine/susie_pip90/pos_merge.vcf'


In [16]:
#Read eQTLs
#
#The vcf is parsed as a headerless tab-separated table: the first line is skipped
#(assumed to be the single header line -- TODO confirm the file has no further
#'##' meta lines), the eight columns are named explicitly and only the five
#variant-describing columns are kept.

eqtl_df = pd.read_csv(eqtl_file, sep='\t', skiprows=1, names=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'feat1', 'feat2', 'featNaN'])[['#CHROM', 'POS', 'ID', 'REF', 'ALT']]

#Unique variant IDs; a set is unordered, so sorting the IDs beforehand would be wasted work
eqtl_set = set(eqtl_df['ID'].values.tolist())

#Equal counts mean that every row carries a distinct ID
print("len(eqtl_df) = " + str(len(eqtl_df)))
print("len(eqtl_set) = " + str(len(eqtl_set)))


len(eqtl_df) = 17925
len(eqtl_set) = 17925


In [18]:
#Load sequence bed; output trained-on and held-out snp lists respectively
#
#For each replicate 'f<fi>c<ci>', sequences labelled 'fold<fi>' count as test, one
#other fold counts as validation and every remaining sequence counts as training.
#A SNP is written to '<prefix>_train.txt' if it overlaps at least one flanked
#training sequence and to '<prefix>_test.txt' otherwise.

#When True the validation fold also depends on ci and output names get an 's' suffix
valid_shift = False

eqtl_out_file = 'gtex_susie_pip90'

seq_bed_file = '/scratch3/drk/seqnn/data/v9/hg38/sequences.bed'

num_folds = 8

#Number of bp each sequence interval is widened by on both sides before intersecting
#(presumably up to the model's full input window -- TODO confirm)
seq_flank = 163840

repl_index = [
    'f3c0',
    'f3c1',
    'f3c2',
    'f3c3',
]

#Load sequence bed once; the file and the flanking are identical for every replicate
seq_bed_df = pd.read_csv(seq_bed_file, sep='\t', names=['chrom', 'start', 'end', 'label'])

seq_bed_df['start'] -= seq_flank
seq_bed_df['end'] += seq_flank

#Loop over replicates
for repl_str in repl_index :

    #Replicate id has the form 'f<fold index>c<cross index>'
    repl_parts = repl_str.split("c")
    fi, ci = int(repl_parts[0][1:]), int(repl_parts[1])

    print("--- replicate = " + repl_str + " (fi = " + str(fi) + ", ci = " + str(ci) + ") ---")

    vcf_df = eqtl_df.copy().reset_index(drop=True)

    print("len(vcf_df) = " + str(len(vcf_df)))

    test_fold = fi
    if valid_shift :
        valid_fold = (fi+1+ci) % num_folds
    else :
        valid_fold = (fi+1) % num_folds

    #Relabel folds as test / valid / train (the test check takes precedence over valid)
    fold_labels = seq_bed_df['label']
    seq_df = seq_bed_df.copy()
    seq_df['label'] = np.where(
        fold_labels == 'fold' + str(test_fold),
        'test',
        np.where(fold_labels == 'fold' + str(valid_fold), 'valid', 'train'),
    )

    print("len(seq_df) = " + str(len(seq_df)))

    #Intersect vcf against sequence bed
    seq_pr = pr.PyRanges(seq_df.rename(columns={'chrom' : 'Chromosome', 'start' : 'Start', 'end' : 'End'}))

    #VCF POS is 1-based whereas PyRanges / bed intervals are 0-based half-open,
    #so the base at POS corresponds to the interval [POS-1, POS)
    vcf_df['Start'] = vcf_df['POS'] - 1
    vcf_df['End'] = vcf_df['POS']
    vcf_pr = pr.PyRanges(vcf_df[['#CHROM', 'Start', 'End', 'ID', 'REF', 'ALT']].rename(columns={'#CHROM' : 'Chromosome', 'REF' : 'ref', 'ALT' : 'alt'}))

    vcf_seq_df = vcf_pr.join(seq_pr, strandedness=False).df.copy().reset_index(drop=True)

    #IDs of all SNPs that overlap at least one training sequence
    vcf_train_set = sorted(set(vcf_seq_df.query("label == 'train'")['ID'].values))

    print("len(vcf_train_set) = " + str(len(vcf_train_set)))

    #Mark loci in the vcf that had been seen during training
    #(hashed isin lookup instead of scanning the sorted ID list once per row)
    is_train_locus = vcf_df['ID'].isin(vcf_train_set).tolist()

    vcf_df['is_train_locus'] = is_train_locus

    #Store final list of trained-on and non-trained-on SNP positions for the given fold
    vcf_df_train = vcf_df.query("is_train_locus == True").copy().reset_index(drop=True)
    snp_list_train = sorted(set(vcf_df_train['ID'].values.tolist()))

    vcf_df_test = vcf_df.query("is_train_locus == False").copy().reset_index(drop=True)
    snp_list_test = sorted(set(vcf_df_test['ID'].values.tolist()))

    print("len(snp_list_train) = " + str(len(snp_list_train)))
    print("len(snp_list_test) = " + str(len(snp_list_test)))

    #One SNP ID per line; file name is <eqtl_out_file>_<replicate>[s]_<train|test>.txt
    out_prefix = eqtl_out_file + "_" + repl_str + ("s" if valid_shift else "")
    for split_name, snp_list in (("train", snp_list_train), ("test", snp_list_test)) :
        with open(out_prefix + "_" + split_name + ".txt", 'wt') as out_f :
            out_f.writelines(snp_id + '\n' for snp_id in snp_list)


--- replicate = f3c0 (fi = 3, ci = 0) ---
len(vcf_df) = 17925
len(seq_df) = 55497
len(vcf_train_set) = 13750
len(snp_list_train) = 13750
len(snp_list_test) = 4175
--- replicate = f3c1 (fi = 3, ci = 1) ---
len(vcf_df) = 17925
len(seq_df) = 55497
len(vcf_train_set) = 13750
len(snp_list_train) = 13750
len(snp_list_test) = 4175
--- replicate = f3c2 (fi = 3, ci = 2) ---
len(vcf_df) = 17925
len(seq_df) = 55497
len(vcf_train_set) = 13750
len(snp_list_train) = 13750
len(snp_list_test) = 4175
--- replicate = f3c3 (fi = 3, ci = 3) ---
len(vcf_df) = 17925
len(seq_df) = 55497
len(vcf_train_set) = 13750
len(snp_list_train) = 13750
len(snp_list_test) = 4175
