## Import Packages and Set Directory

In [4]:
import pandas as pd
import numpy as np
import sys
import argparse
import os
# Switch the process working directory to a machine-specific absolute path;
# the Run cells below read their input files from this same folder.
os.chdir('/Users/abc6435/Desktop/RABvcfs')

## Functions

In [7]:
#Define readvcf()
def readvcf(vcf_file, pop_file):
    """Compute per-site ALT-allele frequencies for popA and popB from a VCF.

    Parameters
    ----------
    vcf_file : str
        Path to a tab-delimited VCF. It must contain a ``#CHROM`` header
        line; the sample columns are everything after ``FORMAT``.
    pop_file : str
        Path to a headerless, tab-delimited file. Column 0 holds the
        population label (``popA`` or ``popB``), column 1 the sample name
        as it appears in the VCF header.

    Returns
    -------
    pandas.DataFrame
        Columns ``CHROM, POS, REF, ALT, f_popA_1, f_popB_1, nA, nB``.
        ``f_pop*_1`` is the ALT-allele count divided by the number of
        counted alleles in that population; ``nA``/``nB`` is the number of
        counted alleles divided by two. Rows containing NaN (e.g. no
        counted genotype in a population) are dropped, and only rows with
        ``nA > 1`` and ``nB > 1`` are returned. The row index of the parsed
        VCF is preserved.

    Raises
    ------
    ValueError
        If ``vcf_file`` has no ``#CHROM`` header line.
    KeyError
        If a sample listed in ``pop_file`` is not a column of the VCF.
    """
    # Only biallelic 0/1 calls are counted, phased or unphased; anything else
    # (missing calls, other allele indices) contributes nothing.
    hom_ref_calls = ['0/0', '0|0']
    hom_alt_calls = ['1/1', '1|1']
    het_calls = ['0/1', '0|1', '1/0', '1|0']

    # Import Population File
    pops = pd.read_csv(pop_file, sep="\t", header=None)
    pop_samples = {
        label: pops.loc[pops[0] == label, 1].astype(str).tolist()
        for label in ("popA", "popB")
    }

    # Import VCF: take the column names from the #CHROM header line.
    cols = None
    with open(vcf_file) as handle:
        for line in handle:
            if line.startswith('#CHROM'):
                cols = line.lstrip("#").strip().split("\t")
                break
    if cols is None:
        raise ValueError("No '#CHROM' header line found in " + str(vcf_file))
    data = pd.read_csv(vcf_file, sep='\t', comment='#', names=cols)

    # Extract Genotype Information: keep only the first ':'-separated
    # sub-field of every sample column (the genotype call).
    loci = data[['CHROM', 'POS', 'REF', 'ALT']]
    samples = data.drop(columns=['CHROM', 'POS', 'REF', 'ALT', 'ID',
                                 'QUAL', 'FILTER', 'INFO', 'FORMAT'])
    samples = samples.apply(lambda col: col.str.split(":").str[0])

    # Count alleles per row for each population in one vectorized pass.
    freq = loci.copy()
    for label, suffix in (("popA", "A"), ("popB", "B")):
        calls = samples[pop_samples[label]]
        het = calls.isin(het_calls).sum(axis=1)
        ref_count = 2 * calls.isin(hom_ref_calls).sum(axis=1) + het
        alt_count = 2 * calls.isin(hom_alt_calls).sum(axis=1) + het
        counted = ref_count + alt_count
        # 0/0 yields NaN here; such rows are removed by dropna() below.
        freq['f_' + label + '_1'] = alt_count / counted
        freq['n' + suffix] = counted / 2

    # Calculate Frequencies: fixed column order, drop incomplete rows, and
    # require more than one counted individual in each population.
    freq = freq[['CHROM', 'POS', 'REF', 'ALT',
                 'f_popA_1', 'f_popB_1', 'nA', 'nB']].dropna(axis="rows")
    return freq[(freq["nA"] > 1) & (freq["nB"] > 1)]

#Define importsites()
def importsites(neutral_file, mutation_file, derived_sites):
    """Restrict ``derived_sites`` to the positions listed in two site files.

    Parameters
    ----------
    neutral_file, mutation_file : str
        Paths to tab-delimited files with a header row. Their ``chromo``
        and ``position`` columns are renamed to ``CHROM`` and ``POS``.
    derived_sites : pandas.DataFrame
        Frame with ``CHROM`` and ``POS`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(neu_der, mut_der)``: inner joins of ``derived_sites`` with the
        neutral and the mutation site file on ``['CHROM', 'POS']``.

    Side effects
    ------------
    Prints the number of rows in each joined frame.
    """
    def _sites_in(path):
        # An inner join already keeps only keys present on both sides, so no
        # merge indicator / '_merge' filtering is needed.
        sites = pd.read_csv(path, sep='\t', header=0).rename(
            columns={'chromo': 'CHROM', 'position': 'POS'})
        return pd.merge(derived_sites, sites, on=['CHROM', 'POS'], how='inner')

    neu_der = _sites_in(neutral_file)
    mut_der = _sites_in(mutation_file)

    # Report Number of Sites
    print("N_mutation =", len(mut_der))
    print("N_neutral =", len(neu_der))
    return neu_der, mut_der

#Define calcRAB()
def calcRAB(neu_der, mut_der, seed):
    """Return the ratio LAB / LBA for mutation sites, scaled by neutral sites.

    ``neu_der`` and ``mut_der`` are frames with ``f_popA_1`` and
    ``f_popB_1`` columns. The global NumPy RNG is seeded with ``seed`` and
    then used to pick at most 10000 rows of ``neu_der`` as the neutral set.

    LAB = sum(fA * (1 - fB)) over mutation rows / same sum over neutral rows
    LBA = sum(fB * (1 - fA)) over mutation rows / same sum over neutral rows
    """
    # Seeding the module-level RNG is a visible side effect: later
    # np.random calls continue from this state.
    np.random.seed(seed)
    shuffled = np.random.permutation(len(neu_der))
    neutral_subset = neu_der.iloc[shuffled[:10000]]

    mut_a, mut_b = mut_der['f_popA_1'], mut_der['f_popB_1']
    neu_a, neu_b = neutral_subset['f_popA_1'], neutral_subset['f_popB_1']

    l_ab = sum(mut_a * (1 - mut_b)) / sum(neu_a * (1 - neu_b))
    l_ba = sum(mut_b * (1 - mut_a)) / sum(neu_b * (1 - neu_a))
    return l_ab / l_ba

#Define calcRAB_neu()
def calcRAB_neu(neu_der, seed):
    """Return LAB / LBA computed between two random draws of neutral rows.

    The global NumPy RNG is seeded with ``seed``; two permutations of the
    rows of ``neu_der`` are then drawn in sequence and each is truncated to
    at most 10000 rows. The first draw plays the role of the numerator set
    and the second of the denominator set, both using the ``f_popA_1`` and
    ``f_popB_1`` columns.

    When ``neu_der`` has 10000 rows or fewer, both draws contain every row
    (in different orders).
    """
    np.random.seed(seed)
    n_rows = len(neu_der)
    # The order of these two RNG calls determines which rows land in which set.
    first_draw = neu_der.iloc[np.random.permutation(n_rows)[:10000]]
    second_draw = neu_der.iloc[np.random.permutation(n_rows)[:10000]]

    top_a, top_b = first_draw['f_popA_1'], first_draw['f_popB_1']
    bottom_a, bottom_b = second_draw['f_popA_1'], second_draw['f_popB_1']

    l_ab = sum(top_a * (1 - top_b)) / sum(bottom_a * (1 - bottom_b))
    l_ba = sum(top_b * (1 - top_a)) / sum(bottom_b * (1 - bottom_a))
    return l_ab / l_ba

#Define calcRAB_sub()
def calcRAB_sub(neu_sub, mut_sub):
    """Return LAB / LBA for ``mut_sub`` scaled by a draw from ``neu_sub``.

    Same computation as ``calcRAB`` but without seeding: the neutral rows
    (at most 10000) are picked using the current state of the global NumPy
    RNG. Both frames need ``f_popA_1`` and ``f_popB_1`` columns.
    """
    picked = np.random.permutation(len(neu_sub))[:10000]
    neutral_rows = neu_sub.iloc[picked]

    mut_a, mut_b = mut_sub['f_popA_1'], mut_sub['f_popB_1']
    neu_a, neu_b = neutral_rows['f_popA_1'], neutral_rows['f_popB_1']

    l_ab = sum(mut_a * (1 - mut_b)) / sum(neu_a * (1 - neu_b))
    l_ba = sum(mut_b * (1 - mut_a)) / sum(neu_b * (1 - neu_a))
    return l_ab / l_ba

#Define samplesites()
def samplesites(sites, psites):
    """Return a random subset of the rows of ``sites``.

    ``psites`` is the fraction of rows to keep; the row count is
    ``int(round(len(sites) * psites))``. Rows are chosen without
    replacement via a permutation from the global NumPy RNG and are
    returned in that shuffled order.
    """
    total = len(sites)
    keep = int(round(total * psites))
    order = np.random.permutation(total)
    return sites.iloc[order[:keep]]

#Define jackknife()
def jackknife(neu_der, mut_der, psites, iter):
    """Repeat the RAB calculation on random subsamples of both site sets.

    For each of ``iter`` rounds, a fraction ``psites`` of ``neu_der`` and
    then of ``mut_der`` is drawn with ``samplesites`` and passed to
    ``calcRAB_sub``. Returns the per-round values as a NumPy array.
    """
    # Arguments are evaluated left to right, so the neutral subsample is
    # drawn before the mutation subsample in every round.
    estimates = [
        calcRAB_sub(samplesites(neu_der, psites), samplesites(mut_der, psites))
        for _ in range(iter)
    ]
    return np.array(estimates)
    

## Run

In [8]:
# Per-site popA/popB ALT-allele frequencies from the test VCF.
der = readvcf("/Users/abc6435/Desktop/RABvcfs/rab_chr29_test.vcf", "/Users/abc6435/Desktop/RABvcfs/pops.txt")
# Keep only rows whose (CHROM, POS) appear in the intergenic / tolerated site
# files; importsites() also prints the two row counts.
neutral, mutation = importsites('/Users/abc6435/Desktop/RABvcfs/intergenic.txt',
                                '/Users/abc6435/Desktop/RABvcfs/tolerated.txt', der)

# Displayed as the cell's output: number of rows in the neutral set.
len(neutral)

N_mutation = 4
N_neutral = 119


119

In [9]:
# Repeats the loading steps of the previous cell (re-reads the same files).
data = readvcf("/Users/abc6435/Desktop/RABvcfs/rab_chr29_test.vcf", "/Users/abc6435/Desktop/RABvcfs/pops.txt")
neutral, mutation = importsites('/Users/abc6435/Desktop/RABvcfs/intergenic.txt',
                                '/Users/abc6435/Desktop/RABvcfs/tolerated.txt', data)
# Both calls below seed the global NumPy RNG with 20.
print("calcRAB", calcRAB(neutral, mutation, 20))
print("calcRAB_neu",calcRAB_neu(neutral, 20))
# jackknife() draws from the global NumPy RNG without reseeding, so its
# values depend on the RNG state left behind by calcRAB_neu() above.
# 50 rounds, each keeping 30% of the neutral rows and 30% of the mutation rows.
jx_array=jackknife(neutral, mutation, 0.30, 50)
print("jackknife", jx_array)
# 2.5th and 97.5th percentiles of the 50 resampled values.
np.percentile(jx_array, [2.5, 97.5])


N_mutation = 4
N_neutral = 119
calcRAB 0.7163863178952442
calcRAB_neu 0.9999999999999998
jackknife [0.25666064 0.18805467 0.28849752 0.71764261 0.85804196 0.32871484
 0.8847219  3.9706191  0.30843541 0.29407548 0.56080367 0.39727864
 0.45529478 0.53005464 0.49125881 2.8240563  0.23455621 2.14758751
 0.30511464 0.26732392 0.3996898  0.34386134 2.89028213 0.43047055
 0.81019674 2.63941793 0.29153128 0.3044713  2.20276498 0.75272282
 0.29645907 0.44553108 0.92414734 0.19849611 0.36159799 0.976788
 0.26396486 0.43277297 0.23876003 0.49032589 0.37972969 0.31632814
 0.2671471  0.94431735 0.44873056 1.82591093 0.27123347 0.45965376
 0.36965568 0.25899222]


array([0.20660963, 2.87538132])